diff --git a/conftest.py b/conftest.py
index 6f691332a..9eecba3e3 100644
--- a/conftest.py
+++ b/conftest.py
@@ -149,6 +149,15 @@ def pytest_addoption(parser):
         help="Enable L2 swimlane. Bare flag=level 4 (full). "
         "1=AICore timing, 2=+dispatch/fanout, 3=+sched phases, 4=+orch phases",
     )
+    parser.addoption(
+        "--use-example-exec-time",
+        action="store_true",
+        default=False,
+        help="(fully_distributed_within_core sim only) Replace each incore kernel with a "
+        "busy-wait of its CALLABLE example_execute_time (microseconds) instead of running "
+        "the real kernel, so a fast sim run reflects measured on-hardware kernel durations "
+        "plus orchestration overhead. Other runtimes reject this flag.",
+    )
     parser.addoption(
         "--enable-device-log-timing",
         action="store_true",
diff --git a/docs/fully_distributed_within_core.md b/docs/fully_distributed_within_core.md
new file mode 100644
index 000000000..fbc441897
--- /dev/null
+++ b/docs/fully_distributed_within_core.md
@@ -0,0 +1,1236 @@
+# AICore 上的全分布式 Runtime
+
+本文档定义 **simpler** 的一种运行模式：编排（orchestration）、调度（scheduling）
+与执行（execution）全部以 SPMD 方式运行**在 AICore 自身**之上，**AICPU 完全不参与**。
+不存在独立的调度器：每个核自行构建、拥有并执行自己的任务。
+
+这是一份自洽的设计。第一部分描述系统如何工作（核的行为 + 伪代码）；第二部分列举
+各数据结构及其共享特性（全局共享 / 每核私有 / 每核复制）。
+
+本设计所替代的、当前以 AICPU 为中心的模型，参见
+[chip-level-arch.md](chip-level-arch.md) 与 [scheduler.md](scheduler.md)。编排编写
+API（`rt_submit_aic_task` / `rt_submit_aiv_task`，`pto_orchestration_api.h`）参见
+`src/{arch}/runtime/` 下的 `tensormap_and_ringbuffer` runtime。
+
+---
+
+# 第一部分 — 系统设计
+
+## 1. 概述
+
+- 编排函数**被加载并同时运行在每一个参与的 AICore 上**（SPMD）。所有核执行完全相同
+  的编排程序。
+- 每个核同时是**编排器 + 调度器 + worker**。经典的“调度器↔worker”握手（任务门铃、
+  ready 队列、完成邮箱、依赖连线线程）被**彻底取消**。
+- 面向编排的 API 保持不变。通用原语是 `rt_submit_task(MixedKernels, args)`；
+  `rt_submit_aic_task` / `rt_submit_aiv_task` 只是它的轻量便捷封装（**不存在**
+  `rt_submit_mixed_task`——MIX 任务就是一个填了多个 kernel 槽的 `MixedKernels`）。
+  在这些 API 背后，runtime 决定所有权、在本地构建任务，随后由同一个核执行它。
+- AICPU 不在编排与调度的关键路径上。
+
+本设计建立在以下四个支柱之上（下文逐一展开）：
+
+1. 任务所有权的**抢占竞争（claim race）**（§2）。
+2. **owner = builder = executor**，并配合核类型匹配（§3）。
+3. 用于依赖发现的**每核全量复制 TensorMap**（§4）。
+4. **每核私有任务环 + 一个全局完成标志环**，驱动一个采用拉取式依赖解析的
+   run-ahead 执行循环（§5–§6）。
+
+## 2. 任务所有权 —— 抢占竞争（Claim Race）
+
+所有核走**完全相同**的、确定性的 submit 序列。任务身份就是它在该序列中的位置：第 N 次
+`rt_submit_*` 调用在每个核上都是**任务 id `N`**，与最终由谁执行无关。
+
+所有权由以下两个量驱动：
+
+| 计数器 | 作用域 | 含义 |
+| ------ | ------ | ---- |
+| `claim_cursor[T]`（`cube_cursor`、`vector_cursor`） | **全局、原子** | 类型 `T` 已被认领任务 id 的高水位线。共**两个** cursor（cube = AIC-anchored，vector = AIV-only），二者都索引同一个共享 id 空间（§3.1） |
+| `local_current_task_index` | **每核** | 本核走 submit 序列时当前到达的任务 id |
+
+每次 `rt_submit_*`，匹配 anchor 类型的核执行如下逻辑（设 `T` 为此任务类型——若 AIC-anchored
+则为 cube，若 AIV-only 则为 vector）：
+
+```text
+local_current_task_index++                        # 到达下一个 submit 点 = 任务 id N
+if local_current_task_index > claim_cursor[T]:    # 我是否领先于 T 的高水位线？
+    # 本核是 T 类型中走得最靠前的 → 它 WIN，拥有任务 N。
+    claim_cursor[T] = local_current_task_index     # 发布（原子）
+    own = true
+else:
+    # 已有一个 T 类型的核更早认领了此 id（它跑在前面）。
+    own = false
+```
+
+胜者是该任务 id 的唯一 owner。所有权决定的是*谁来构建与执行*；它**不会**改变任务 id——
+该 id 是处处使用的确定性 submit 序号（完成标志环的索引、以及每个核的 producer 引用）。
+对于多核任务，胜者是 *anchor*；与它配对的同 block 核共同拥有其余子任务（§3.1）。
+
+为什么需要两个 cursor（以及为什么单一共享 cursor 是错的）在 §3.1 解释：两个 cursor 扫过
+同一 id 空间，各自只认领自己类型的 id，并**跨过**另一类型的 id，因此落后类型尚未认领的
+id 只是在等待它自己的 cursor —— 它们绝不会被跳过。
+
+> 确切原子原语（`atomic_fetch_max`，无则 CAS 回路）与内存序在 §11.1 定为规范；
+> 语义上每个任务 id 恰好有一个 anchor 胜出。
+
+## 3. owner = builder = executor；核类型匹配
+
+**抢到任务的提交者就是它的 owner。** owner 同时负责任务的**创建**（构建
+descriptor/payload、记录 fan-in producer id）与**执行**（调用 incore 函数）。一个核只会
+认领它自己能执行的类型的任务。
+
+任务由 `MixedKernels` 描述，最多携带三个子任务槽：
+
+```cpp
+struct MixedKernels {
+    int32_t aic_kernel_id  { INVALID_KERNEL_ID };   // AIC 子任务
+    int32_t aiv0_kernel_id { INVALID_KERNEL_ID };   // AIV 子任务 0
+    int32_t aiv1_kernel_id { INVALID_KERNEL_ID };   // AIV 子任务 1
+};
+```
+
+`active_mask` = 哪些槽有效，它恰好记录了一个 MIX 任务的 AIV 数量——**1C+1V**
+（`aic` + `aiv0`）还是 **1C+2V**（`aic` + `aiv0` + `aiv1`）。这一区分对所有权很关键：
+1C+1V 任务只绑定 AIV0_c，让 AIV1_c 保持空闲（§3.1）。因此任务是以下之一：AIC-only、
+AIV-only（1 个或 2 个 AIV 子任务）、或 **MIX**（AIC + 1 个或 2 个 AIV 子任务）。
+
+| 任务形态 | 子任务槽 | owner |
+| -------- | -------- | ----- |
+| **AIC-only** | `aic` | 任意一个 AIC 核 |
+| **AIV-only (1V)** | `aiv0` | **任意一个 AIV 核（AIV0 或 AIV1）** |
+| **AIV-only (2V)** | `aiv0`、`aiv1` | 同一 block 的两个 AIV 核 |
+| **MIX (1C+1V)** | `aic`、`aiv0` | 一个 AIC + 同 block 一个 AIV（共同 owner） |
+| **MIX (1C+2V)** | `aic`、`aiv0`、`aiv1` | 一个 AIC + 同 block 两个 AIV（共同 owner） |
+
+单槽封装（`rt_submit_aic_task` → 填 `aic`，`rt_submit_aiv_task` → 填 `aiv0`）是常见路径；
+多槽任务直接走 `rt_submit_task(MixedKernels, …)`。
+
+**单核 vs 多核——竞争资格按“类型”而非“固定槽角色”。** 竞争一个任务的资格由任务**类型**
+（cube / vector）决定，而非某个具体的 `aiv0`/`aiv1` 角色：
+
+- **单核任务（1C、1V）**：没有配对、没有 anchor/follower。任意一个**匹配类型**的核通过 §2 的
+  claim race 认领，胜者独自构建并执行那唯一的子任务。特别地，**1V（AIV-only 单核）由所有 AIV 核
+  竞争——AIV0 与 AIV1 同等参与**；胜者执行 `aiv0_kernel_id`，与它在 block 中是 AIV0 还是 AIV1
+  无关（两者都是 vector 核，可执行任意 AIV kernel）。
+- **多核任务（2V、MIX）**：需要同一物理 block 的多个核共同拥有，走 §3.1 的固定配对（anchor 胜出
+  后把其余子任务推送给同 block 伙伴）。
+
+换言之，`aiv0`/`aiv1` 的“固定角色”**只**在多核任务里用来把子任务映射到 block 内具体的核；对单核
+任务它不构成竞争限制。
+
+### 3.1 通过固定物理配对实现多核任务的共同所有权
+
+本节**只针对多核任务**（任意 MIX 任务，以及 2V 的 AIV-only 情况）——它们含多于一个有效子任务
+槽，必须被多个核同时拥有。单核任务（1C、1V）不走本节机制：由任意匹配类型的核（1V 即任意 AIV 核
+AIV0/AIV1）通过 §2 的 claim race 直接认领、独自执行，无 anchor/follower。本节规定多核任务的
+共同 owner 如何被选出、如何达成一致——这是模型中最难的部分。
+
+**配对被 FIXED（固定）到硬件 block。** 核被组织成硬件 block（cluster）；在本平台上一个
+block = **1 AIC + 2 AIV**（AIV0、AIV1）。这个 block 是永久的共同所有权单位：AIC_c 与
+AIV0_c、AIV1_c 静态配对。不存在动态配对选举。子任务槽到 block 内角色是固定映射：
+
+| 子任务槽 | 由谁执行（block `c` 内） |
+| -------- | ------------------------ |
+| `aic_kernel_id` | AIC_c |
+| `aiv0_kernel_id` | AIV0_c |
+| `aiv1_kernel_id` | AIV1_c |
+
+**Anchor + 同 block 跟随规则。** 一个多核任务只被**认领一次**，由一个 *anchor* 核认领；
+其 block 的其余核跟随：
+
+1. **谁竞争（anchor 类型）**：竞争按任务**类型** `T` 进行——含 AIC 子任务的任务（所有 MIX）
+   是 **cube 类型，只有 AIC 核竞争**；纯 AIV 的 2V 是 **vector 类型，由所有 AIV 核（AIV0/AIV1）
+   竞争**。胜出者即该任务的 **anchor**，它执行**自己物理角色**对应的那个槽（AIC 胜者执行 `aic`；
+   2V 由某个 AIV 胜出则执行它自己角色的 `aiv0`/`aiv1`），其余激活槽推送给同 block 伙伴。
+   **MIX 的 vector co-owner 绝不靠自己竞争得来**——它*完全*由“哪个 AIC 胜出”决定，即由胜者
+   所在的 block 决定（一个 AIV 核绝不会因为先到达就赢得某 MIX 的 vector 子任务）。
+2. 抢占竞争（§2）**仅在 anchor 类型之间**进行，竞争对象是 `cursor[T]`。胜出的 anchor 核
+   所在的 **block** 成为拥有该任务的 block。anchor 在胜出时**一次性解析整个任务的 fan-in**
+   producer id（从它在 `N` 处的 TensorMap 副本读取，各核内容相同——§4），把*自己*那个槽的
+   子任务构建进自己的私有环，并把该任务**其余激活槽**的子任务记录**推送（deposit）**进一张
+   **以任务 id 为键的 block-local 投递表** —— `block.won[N]` —— 内容为
+   `{active_mask = M, 各激活槽 kernel id, args, 已解析的 fan-in producer id, 剩余子任务计数
+   = popcount(M)}`。
+3. 同 block 的 follower 核**既不竞争、也不在自己的编排走位上对该任务做“等待 anchor 决定”
+   的判断**——它**永不因 anchor 而阻塞**。follower 的所有权完全靠 anchor 的**推送**到达：
+   follower **异步地从 `block.won` 抽取（drain）**属于自己槽的子任务投递，在私有环有空槽时
+   把它构建进环。follower 在自己的编排走位中遇到 MIX 任务时，只做 §4 的无条件 TensorMap
+   更新，然后继续前进，**不**对该 MIX 任务做任何所有权决定、**不**等待它的 anchor。
+
+**为什么是 anchor 推送，而不是 follower 自己走位 + 等待。** 两个 cursor 独立推进（§2），所以
+cube 与 vector 的进度可能任意错位。若让 follower 在自己的走位上“走到 N 再判断我的 block 是否
+赢了 N”，当它的 anchor 落后（`cube_cursor < vector_cursor`）时，follower 就无法区分“anchor
+还没决定 N”与“anchor 输了 N（别的 block 赢了）”，只能**阻塞等待** anchor 推进到 N——这会把
+vector 的吞吐死死耦合到 cube 的吞吐上，是不可接受的。**改为 anchor 推送即彻底消除这种 per-task
+阻塞**：
+
+- **cube 落后时**：`block.won` 里还没有给这个 AIV 的 MIX 投递 → AIV **不等待**，继续竞争并执行
+  它自己的 AIV-only 任务（以及抽取已到的其他投递）。零停顿。
+- **cube 领先时**：投递在 `block.won` 中累积 → AIV 有空槽就抽取构建。若 AIV 落后到填满
+  `block.won`，则 anchor **暂缓认领新的多核任务**（反压；见 §6 中 anchor 转去执行 Phase B 而
+  非自旋），方向正确：不让 cube 无限超前。
+
+`block.won` 以任务 id 为键（而非单一会被覆盖的槽），既承载每任务的剩余子任务计数，也允许同一
+block 多个并发多核任务的投递互不串扰。由于配对是静态的，投递的目标 follower 由 anchor 所在
+block 唯一确定，无需任何跨 block 协商。
+
+> 唯一残留的等待发生在**收尾**：若某 block 的 anchor 严重落后，它的 follower 在做完自己其余
+> 全部工作、私有环清空后，可能要在终止前空转，等 anchor 把最后的多核子任务推送过来（§7）。
+> 这是固定配对的固有代价——多核子任务的归属由 anchor 的认领决定；它不是 per-task 的串行阻塞，
+> 而只是尾部的一次空转，且在 cube 密集（cube 领先）的常见场景下根本不出现。
+
+**按形态的行为（设胜出 anchor 在 block `c`）：**
+
+| 任务形态（`active_mask`） | 谁竞争 | Anchor（胜者） | 被推送子任务的 follower | 同 block 未被绑定（保持空闲） |
+| ------------------------- | ------ | -------------- | ----------------------- | ----------------------------- |
+| **1C + 2V**（多核） | 所有 AIC | AIC_c | AIV0_c、AIV1_c | — |
+| **1C + 1V**（多核） | 所有 AIC | AIC_c | AIV0_c | **AIV1_c** |
+| **2V**（多核，AIV-only） | 所有 AIV（AIV0/AIV1） | 胜出的那个 AIV_c | 同 block 的另一个 AIV_c | AIC_c |
+| **1C**（单核，AIC-only） | 所有 AIC | 胜者独自执行，无配对 | — | （不涉及 block 配对） |
+| **1V**（单核，AIV-only） | **所有 AIV（AIV0/AIV1）** | 胜者独自执行，无配对 | — | （不涉及 block 配对） |
+
+多核任务（前三行）的 follower 身份都由 anchor 所在 block 唯一确定——不存在跨 block 协商。单核
+任务（后两行）没有 anchor/follower，胜者是哪个核就由哪个核独自执行；**1V 由 AIV0 与 AIV1 同等
+竞争**。
+
+**未被绑定的 block 伙伴不是闲着——它对其他任务保持空闲可用。** 当一个 block 赢得一个不激活
+某 block 伙伴槽位的任务时，那个核就**不被该任务占用**，且**绝不能**因它而阻塞或等待。它继续
+运行自己的编排，继续竞争并拥有其类型的其他任务。具体地：
+
+- 一个 **1C+1V** 任务只绑定 AIC_c + AIV0_c。**AIV1_c 是空闲的**，可继续竞争、认领并执行其他
+  AIV 任务（它自己竞争到的任意 1V/2V AIV-only 任务，或本 block 后续某个 1C+2V 任务的 AIV1 槽）。
+- 一个 **1C（AIC-only）** 任务只绑定一个 AIC 核；AIV 核**都**对 AIV 工作保持空闲。
+- 一个 **1V（AIV-only）** 任务是单核：由**任意一个 AIV 核（AIV0 或 AIV1）**竞争得到并独自执行，
+  其余 AIV 核与 AIC 核保持空闲。它不绑定任何固定角色。
+
+这是模型的自然结论：每个核都走相同的确定性 submit 序列，并逐任务判断自己的槽是否激活。在某个
+自己的槽未激活的 submit 点，该核就是不绑定该任务（但它仍执行 §4 的无条件 TensorMap 更新），
+然后继续——去认领它下一个有资格的任务。每个任务记录的 `active_mask`（1C+1V vs 1C+2V 等）
+就是告诉每个 block 伙伴自己是被绑定还是空闲的依据。
+
+**多核任务只有一个完成标志。** 即使有多个共同 owner，一个任务也恰好只有一个全局
+`task_completed_flag[N]`。每个共同 owner 执行自己的子任务后，递减 `block.won[N]` 中那个用
+`popcount(active_mask)` 初始化的**per-task 剩余计数器**。（该计数器存在以 id 为键的记录里，
+而非单一 block 字段，因此同一 block 的多个并发 MIX 任务不会互相串扰。）把计数器递减到零的那个
+共同 owner（最后完成的子任务）执行唯一一次全局写 `task_completed_flag[N] = true`。因此无论
+任务有多少个子任务，消费者都只看到一个原子的完成信号。每个共同 owner 在自己的子任务完成后
+立即释放自己的私有环槽位。
+
+**Claim 流一致性 —— 同一任务 id 空间上的两个全局 cursor。**
+
+只有**一个**任务 id 空间——确定性 submit 序列（第 N 次 submit = id `N`），处处用于完成标志
+环与 producer 引用。
+
+所有权由**两个全局 claim cursor** 决定，二者都由所有核共享，且都索引进*同一个* id 空间：
+
+- `cube_cursor` —— 已认领的 **cube（AIC-anchored）** 任务 id 的高水位线（AIC-only 与所有
+  MIX 任务）。
+- `vector_cursor` —— 已认领的 **vector（AIV-only）** 任务 id 的高水位线。
+
+一个到达类型 `T` 的任务 `N` 的核，当且仅当 `N > cursor[T]` 时赢得它；赢得后把 `cursor[T]`
+推进到 `N`。一个核只会推进它自己类型的 cursor；它**跨过**另一类型的 id 而不去碰它。
+
+两个 cursor 在共享 id 空间上**独立**推进，因此任意时刻其中一个可能领先于另一个。**推进一个
+cursor 不会认领它跨过的另一类型的 id。** 因此在领先 cursor 与落后 cursor 之间的 id 区间里
+可能存在**尚未认领的空洞**——这些是*落后*类型的、还没有任何核到达的 id。这是正确的，不是 bug：
+一个空洞只表示“暂时还没认领”；当一个该类型的核到达它时，落后类型的 cursor 会把它填上。
+
+```text
+任务 id:      0    1    2    3    4    5    6
+类型:         C    V    C    C    V    V    C
+                              ^cube_cursor=3        (cube 任务 0,2,3 已认领)
+                   ^vector_cursor=1                 (vector 任务 1 已认领)
+空洞: id 4 和 5 是位于 cube_cursor 之下的 vector 任务——仍 UNCLAIMED，
+      等待 vector_cursor 推进到它们。没有 orphaning。
+```
+
+在单一类型内部不存在空洞：每个核按 id 递增顺序遇到该类型的任务，而 cursor（一个单调高水位线）
+总是被设为刚刚认领的那个 id——因此该类型中所有 ≤ 其 cursor 的 id 都已被某个核拥有。（计数器的
+确切表示属于实现细节——§11。）
+
+**取舍。** 固定配对消除了一切跨 block 协商，并把唯一的共享协调状态保持在 **block-local**
+（1 AIC + 2 AIV 共享一小块区域），而非全局 per-task。代价是多核任务没有跨 block 的负载均衡；
+动态配对方案是未来的改进（§11）。
+
+### 3.2 为什么 vector 不竞争 MIX（以及“不会缺失 co-owner”的论证）
+
+> 这一节直接回答一个常见疑问：既然 vector 不参与 MIX 的竞争，会不会出现“cube 认领了某个 MIX
+> 任务，却没有任何 vector 核作为它的 co-owner”？答案是**不会**。并解释为什么不采用“让 vector
+> 也竞争 MIX”或“先到先得、由后到的同 block cube 反向认领”的替代方案。
+
+**结论一：vector 核不参与 MIX 的竞争。** MIX 永远 cube-anchored（§3.1）。vector 核遇到一个
+MIX 任务时走的是 follower 路径：它**不**碰 `vector_cursor`，只按 id 查 `block.won[N]`，看自己
+所在 block 的 AIC 是否赢了。它“先到达” MIX 任务这件事不授予它任何东西。
+
+**结论二：永远不会缺失 vector co-owner。** 原因有三条，缺一不可：
+
+1. MIX 任务是 cube 任务，**只**会推进 `cube_cursor`。`vector_cursor` 永远不认领 MIX 任务——
+   即便 `vector_cursor` 追上甚至越过 `cube_cursor`，它也只是在认领它路过的 *AIV-only* 任务，
+   绝不会“占用”任何 MIX 任务。所以不存在“被 vector_cursor 抢走却没有 vector 执行者”的 MIX 任务。
+2. 当某个 AIC 核 `AIC_x` 赢得 MIX 任务 `N` 时，它的 vector co-owner 由**固定物理配对**确定：
+   就是同 block 的 `AIV0_x`（若 1C+2V 还有 `AIV1_x`）。这个身份在胜负确定的瞬间就被钉死，
+   不需要任何额外竞争或选举。
+3. 当 `AIC_x` 赢得 `N` 时，它把 `AIV0_x`（及 1C+2V 的 `AIV1_x`）的子任务**推送**进
+   `block.won[N]`（§3.1）；`AIV0_x` 异步抽取并执行。**co-owner 的存在是被保证的。**
+
+**那么 `vector_cursor` 追上 `cube_cursor` 时究竟会发生什么？会不会变成 blocking wait？**
+不会。注意 MIX 归属靠 **anchor 推送**而非 follower 走位判断（§3.1），所以：
+
+- **cube 落后（`cube_cursor < vector_cursor`）时**：AIC 还没认领 `N`，因此 `block.won` 里还没有
+  给 AIV 的投递。AIV **不阻塞、不空等**——它继续竞争并执行自己的 AIV-only 任务，同时抽取已到的
+  其他投递。它在自己的走位上遇到 MIX 任务时只做 TensorMap 更新就走，**不**对该任务做归属判断、
+  **不**等待它的 cube 伙伴。
+- 等 AIC 日后认领到 `N`，投递才出现在 `block.won`，AIV 再抽取执行。
+
+换言之，不存在“AIV 走到 MIX 任务就 blocking wait 到 cube 追上来”的情况——这正是把旧设计的
+`wait_until(block.anchor_progress >= N)` 去掉、改为推送的原因。唯一残留的等待是**尾部空转**
+（§3.1、§7）：若某 block 的 AIC 严重落后，AIV 做完其余全部工作后会在终止前等 AIC 推送最后的
+多核子任务。这不是 per-task 串行阻塞，且 cube 领先（常见）时根本不出现。
+
+**为什么不让 vector 也竞争 MIX（方案 A）。** 因为 MIX 的 AIC 与 AIV 子任务必须在**同一物理
+block 内协同执行**（共享 local memory / 相互配合，这正是固定配对的意义），所以所有权的单位
+是 **block**，不是单个核。若允许 vector 核也去 anchor 一个 MIX 任务，会立刻破坏 §2 的 cursor
+不变式：
+
+- 若让 vector 核去推进 `cube_cursor` 来认领 MIX，它就会把位于旧 `cube_cursor` 与 `N` 之间的
+  那些 **cube-only 任务 orphan 掉**（跳过且无人认领）——这正是双 cursor 设计要避免的问题。
+- 若让 vector 核在 `vector_cursor` 上 anchor MIX，而某个 cube 核同时在 `cube_cursor` 上 anchor
+  同一个 MIX `N`，那么同一任务会被两个 cursor 各认领一次 → **两个不同的 block 都认为自己拥有
+  `N`**（跨 block 撕裂 / 双重认领）。错误。
+
+因此结论是：**每一类任务必须只有一个 anchor 类**。MIX 选 cube 作为唯一的 anchor 类，保证
+claim 是单写者、无 orphan、无跨 block 双重认领。
+
+**为什么“先到先得 + 后到的 cube 反向认领”（方案 B）也不采用。** 这个想法只能作为 **block
+内部**的“探测优化”（block 内谁先到达 `N` 谁就代表本 block 发布认领），而**不能**跨 block——
+跨 block 的正确性仍然要求一条单一的 claim 流，且该流必须是 cube 的（否则就 orphan 掉 cube-only
+任务，同方案 A）。也就是说，即便 block 内允许 vector 先“代发布”，真正权威的 anchor 流仍是 cube
+的 `cube_cursor`。其收益只是偶尔省去 follower 的一次等待，却显著增加了 block 内两条 cursor
+交叉认领的复杂度与正确性论证负担。因此当前**不采用**，仅在 §11 作为未来可选优化列出。
+
+> 一句话总结：vector 不竞争 MIX 是**有意为之**的正确选择。co-owner 由固定配对保证存在；让
+> vector 参与只会重新引入 orphan 或跨 block 双重认领。需要权衡的不是“会不会缺 co-owner”，而是
+> cube 落后时 follower 的等待——这属于负载均衡/性能问题，留待动态配对方案（§11）解决。
+
+## 4. 依赖发现 —— 每核全量复制 TensorMap
+
+依赖与今天完全一样，从 tensor 的读/写重叠推导，途径是一个把 tensor 区域映射到其
+**producer 任务 id** 的 **TensorMap**。本 runtime 的决定是：
+
+> **TensorMap 是每核全量 DUPLICATE（复制）—— 每个核持有一份完整、相同的副本。它绝不被
+> 分区，也绝不做成私有/部分。**
+
+**为什么部分 map 是错的。** producer 条目只在处理某任务的 `OUTPUT`/`INOUT` tensor 时创建。
+若一个核只为它*拥有*的任务插入，它的 map 就会缺失所有由别的核拥有的任务产出的 tensor；本核
+上的某个消费者去查这样一个 tensor 会查不到——依赖发现会悄无声息地失效。
+
+**所要求的 submit 行为（胜者 AND 败者都做）。** 为保持副本完整，submit 路径被拆分：TensorMap
+维护是**无条件**的，只有 build+execute 才受所有权门控。每次 `rt_submit_*`，*每个*核都做：
+
+1. **查**每个 `INPUT` / `INOUT` tensor → 解析出本任务的 fan-in producer 任务 id。
+2. **插**每个 `OUTPUT` **以及 `INOUT`** tensor → 以**本任务 id**作为 producer 登记。`INOUT`
+   两侧都算——它消费旧版本（第 1 步）并产出新版本（第 2 步）。
+
+**胜者**额外构建并执行该任务；**败者**在 TensorMap 更新后停止并前进。
+
+因为 submit 流与任务 id 在各核之间是确定且相同的，每个核重建出**相同**的 TensorMap。各核仅在
+**进度**上不同：跑得更靠前的核有更多条目，但每个条目都与其他核在同一逻辑位置产出的一致——
+**内容相同，进度不同**。
+
+**取舍。** 每个核都要付出完整的 TensorMap 插入/查询开销与内存，即使是它永远不会执行的任务。
+作为回报，解析 producer **零跨核通信**：消费者的 fan-in producer id 在本地副本里就能拿到，在
+构建时存入任务的私有环槽位，执行时再对全局完成标志环轮询。
+
+## 5. 任务存储 —— 私有环 + 全局完成标志
+
+AICPU 模型的全局任务环被移除。两个结构替代它们：
+
+- **每核私有任务环** —— 每个核拥有一个**小**环，存放它已认领的任务，保存每个任务的
+  descriptor + payload + 本地状态（kernel id、args、fan-in producer id）。其他核都不读它；
+  无锁。容量：
+
+  ```cpp
+  #define PRIVATE_TASK_SLOT_NUM 4   // 故意取小：见下方“为何要小”与 §6.1
+  ```
+
+  **这个容量是关键调优旋钮，不是越大越好。** 全系统的乱序窗口 = **核数 × `PRIVATE_TASK_SLOT_NUM`**，
+  同时它也封顶了**单个核能比“当前就绪可执行”超前认领多少个任务**。把它开大会让某个快核一口气
+  抢入一长串连续任务再独自串行执行，造成严重负载倾斜（详见 §6.1）。因此应**保持其很小**（如 2–4），
+  让乱序能力主要来自“核数”维度；具体值按 kernel 时长 / 访存延迟实测调优。
+
+- **全局 `task_completed_flag` 环** —— *唯一*全局共享的 per-task 状态：每个任务 id 一个
+  一次性置位的布尔，标记完成。各核轮询它以检查某个 fan-in producer 是否已完成。
+
+这使依赖解析成为**拉取（pull）**模型（消费者轮询 producer 标志），而非**推送（push）**模型
+（producer 遍历 fanout 列表）。**没有 fanout 列表、没有 fanin/fanout 引用计数、没有依赖列表
+池、也没有完成邮箱。**
+
+### 5.1 私有任务环与 `block.won` 是两个分开的 ring
+
+私有任务环与 `block.won`（§3.1、§8.1）**是两个独立的结构，职责不同，不可混为一谈**：
+
+| | **私有任务环** | **`block.won[N]`** |
+| ---- | ---- | ---- |
+| 归属 | **每核私有**（每个 worker 各一个） | **block-共享**（1 AIC + 2 AIV 共一份） |
+| 作用 | **执行队列**：存放本核已拥有、要*亲自执行*的（子）任务 | anchor → follower 的**投递/交接箱**：暂存多核任务中 anchor 没亲自构建的其余激活槽子任务 |
+| 谁读写 | 仅本核读写，单一 owner、无锁 | anchor 插入（release）、follower 抽取（acquire）、`remaining` 原子递减 |
+| 谁会用到 | 所有任务（含单核 1C/1V） | **仅多核任务（2V / MIX）**；单核任务根本不碰它 |
+| 容量含义 | 默认小（如 4）：封顶“单核可超前多少”，故意取小以抑制负载倾斜（§6.1） | 默认 8：封顶“anchor 相对 follower 可超前多少”，满则触发反压（§11.2） |
+
+**真正的执行永远只发生在各核自己的私有任务环里。** `block.won` 不是执行环，只是把多核子任务从
+anchor **搬运**到 follower 私有环的中转站。两者如何配合：
+
+```
+anchor 赢下多核任务 N：
+  ├─ 自己物理角色那一槽 ──→ 写进【anchor 自己的私有任务环】（亲自执行）
+  └─ 其余激活槽          ──→ 写进【block.won[N]】（投递给伙伴）
+
+follower 异步抽取：
+  从【block.won[N]】取出属于自己槽的项 ──→ 写进【follower 自己的私有任务环】（再亲自执行）
+
+子任务一旦进入某核私有环，其执行、置完成标志、block.won[N].remaining 递减都照常进行；
+remaining 归零时释放该 block.won 条目。
+```
+
+单核任务（1C / 1V）的胜者直接把唯一子任务写进自己的私有环执行，**没有配对、没有投递、不写
+`block.won`**。
+
+## 6. 核执行循环（执行优先的 Run-Ahead）
+
+每个核运行下面的循环。其核心准则是 **“执行优先、认领其次、一次只认领一个”**：每轮循环都
+**先寻找执行机会**（腾空私有环里任何已就绪的任务），**再至多认领一个**新任务——而**不是**先把
+私有环一口气抢满、再开始执行。编排仍会**向前跑（run ahead）**，但只在没有就绪任务可执行时才
+逐个认领，借此把“单核超前认领”限制在很小的范围。这一改动的动机见 §6.1。
+
+该循环从单个物理核 `self` 的视角写出，它在所在 block 中的角色是 `{AIC, AIV0, AIV1}` 之一。
+竞争按**任务类型**进行（vector 任务由 AIV0/AIV1 同等竞争）；单核任务胜者独自执行，多核任务
+胜者作 anchor 并把其余子任务推送给同 block 伙伴（§3、§3.1）。
+
+> 术语对照：本文其余处（§3.1、§11）沿用旧称 **“Phase B”** 指代下方**步骤 1**（执行 / 腾空就绪
+> 任务），**“Phase A”** 指代**步骤 2**（认领新任务）。差别仅在于:执行优先版**每轮只认领一个**、且
+> **认领与执行严格交替**，不再“先把环填满再统一腾空”。
+
+```text
+# 全局（所有核共享），一个共享任务 id 空间（§2、§3.1）：
+#   cube_cursor   : 已认领的 AIC-anchored 任务 id 高水位线
+#   vector_cursor : 已认领的 AIV-only 任务 id 高水位线
+# 每核：
+#   self.role ∈ {AIC, AIV0, AIV1}
+#   my_type(self) = cube  (若 self 是 AIC)  /  vector (若 self 是 AIV0 或 AIV1)
+#   local_current_task_index : 本核已到达的任务 id
+
+loop:
+    # ============================================================================
+    # 执行优先：每轮循环按 步骤0 → 步骤1 → 步骤2 顺序走，一轮只认领【一个】新任务。
+    # 关键修正：不再“先把环填满再执行”。先腾空就绪任务（步骤1），再认领一个（步骤2）；
+    # 认领后立刻回到循环顶部，下一轮又先找执行机会。核在执行一个长任务期间不推进认领，
+    # 这段时间其它核会推进 cursor 认领后续任务 → 负载自然均衡（理由见 §6.1）。
+    # ============================================================================
+
+    # --- 步骤 0：抽取 anchor 推送给我的多核子任务（异步、非阻塞）---
+    # 同 block 的 anchor 胜出某多核任务后，会把它没亲自构建的其余激活槽放进 block.won。
+    # 本核按自己的物理角色（AIV0→aiv0 / AIV1→aiv1）抽取属于自己的那个槽。取空就停，不等待。
+    while 私有环有空槽 AND block.won 有“我角色对应槽”尚未被本核构建的待处理项:
+        从 block.won 取出该子任务，构建进一个空闲私有环槽    # fan-in 已由 anchor 解析好
+
+    # --- 步骤 1：寻找执行机会，腾空就绪的（子）任务（执行优先）---
+    # 每轮都先做这一步：只要 fan-in 已满足就执行，绝不等环填满才开始执行。
+    freed = 0
+    for each 私有环中已占用的槽:
+        if 所有 fan-in producer 的 task_completed_flag == true:    # 依赖已满足（pull）
+            execute(slot)                                          # 调用我的 incore 函数（长耗时）
+            # 完成：多核任务只有一个全局标志，由其共同 owner 中最后完成的子任务置位（§3.1）。
+            if slot.is_multicore:
+                if atomic_dec(block.won[slot.task_id].remaining) == 0:
+                    task_completed_flag[slot.task_id] = true       # 最后一个子任务胜出
+                    free block.won[slot.task_id]                   # 回收以 id 为键的记录
+            else:
+                task_completed_flag[slot.task_id] = true           # 单核：直接置位
+            free(slot)                                             # 释放我自己的槽；无 fanout 计数
+            freed++
+
+    # --- 步骤 2：至多认领【一个】新任务（仅当环有空槽且编排未结束）---
+    # 一次只认领一个，认领后立即回到步骤 0/1 找执行机会，避免一口气把环抢满。
+    # 若步骤 1 没有就绪任务可执行（freed==0），步骤 2 仍会认领一个 → 这就是受控的 run-ahead：
+    # 没活可干时才逐个超前认领，且超前量被私有环容量（很小）封顶。
+    if 私有环有空槽 AND 编排未结束:
+        推进编排到下一个 submit 点                            # 任务 id N
+        local_current_task_index = N
+        M = task.active_mask                                  # 记录 1C+1V vs 1C+2V 等
+
+        # (a) TensorMap 维护是无条件的（胜者、败者、follower 都做）—— §4：
+        #     - 查 INPUT/INOUT tensor    → fan-in producer 任务 id
+        #     - 插 OUTPUT + INOUT tensor → 以本任务 id 作为 producer
+        update_tensormap(task)
+
+        # (b) 确定本任务的类型与 cursor（§2、§3）：cube 任务由 AIC 竞争；
+        #     vector 任务由所有 AIV 核（AIV0 与 AIV1）竞争。
+        T         = (cube if M.has(aic) else vector)          # 有 AIC → cube；否则 vector（含 1V 与 2V）
+        cursor[T] = (cube_cursor if T==cube else vector_cursor)
+
+        if my_type(self) == T:
+            # 我是该类型的合格竞争者（vector 任务时 AIV0/AIV1 都在此参与）。
+            if popcount(M) > 1 AND block.won 已满:             # 多核反压：本轮不认领（§11.3）
+                pass                                          # 留待步骤 1 腾空 block.won 后的下一轮再试
+            else:
+                # 单原子推进：返回旧值；旧值 < N 即我赢。恰一胜者且无跳过见 §11.1。
+                old = atomic_fetch_max(cursor[T], N)          # N = local_current_task_index
+                if old < N:                                   # WIN：我是 owner/anchor
+                    fanin_ids = resolve_fanin(task)           # 一次性解析整任务 fan-in（本地 TensorMap）
+                    if popcount(M) == 1:
+                        # 单核（1C 或 1V）：独自执行那唯一子任务，与 AIV0/AIV1 身份无关，无配对、无推送。
+                        把该唯一子任务构建进一个空闲私有环槽
+                    else:
+                        # 多核（2V / MIX）：我是 anchor。构建我自己物理角色对应的槽，
+                        # 把其余激活槽推送给同 block 伙伴（以 id 为键，互不串扰）。§3.1
+                        把我自己角色的槽对应的子任务构建进一个空闲私有环槽
+                        block.won[N] = { active_mask:M, kernels, args, fanin_ids,
+                                         remaining: popcount(M) }      # block-shared（§3.1）
+                # else（old >= N）：已有一个 T 类型的核认领了 N（它跑在前面）→ 跳过
+        # else: 类型不匹配（例如 AIC 核遇到 1V 任务）→ 只做了 TensorMap，跳过
+
+    # --- 步骤 3：终止与前向进展 ---
+    if 编排已结束 AND 私有环为空 AND 无针对我的待抽取投递（收尾条件见 §7）:
+        break                                                 # 本核完成
+    if freed == 0 AND (私有环已满 OR 编排已结束):
+        # 这一轮既没执行成任何任务、也无法（或无需）再认领：
+        # 唯一能取得进展的是别的核置位我等待的某个完成标志 → 自旋后重扫步骤 1。
+        spin_wait()
+    # 否则回到 loop 顶部：继续“执行优先、再认领一个”
+```
+
+性质：
+
+- **MIX = anchor 推送 + follower 异步抽取（§3.1）。** AIC 核为 MIX 任务 anchor，胜出后把其余
+  激活槽的子任务推送进以 id 为键的 block 投递表 `block.won[N]`；block 的 AIV 核绝不为它竞争、
+  **也绝不阻塞等待**——它只异步从 `block.won` 抽取属于自己槽的投递并构建。cube 落后时 AIV 没有
+  待抽取的投递，便继续做自己的 AIV-only 工作（零停顿）；cube 领先时投递累积、AIV 有空槽就抽取，
+  若 AIV 落后到填满 `block.won`，anchor 暂缓认领新多核任务（反压，转去 Phase B）。槽未激活的
+  block 伙伴（例如 **1C+1V 上的 AIV1**）从不收到投递，照常去认领其他工作。
+- **每任务一个标志，由最后一个子任务置位。** 单核任务直接置 `task_completed_flag`；多核任务
+  递减一个 block-local 计数器（= `popcount(active_mask)`），由最后完成的子任务置位。消费者
+  始终看到一个原子完成信号。
+- **执行优先、一次认领一个。** 每轮循环先腾空就绪任务、再至多认领一个；不再“填满环才执行”。
+  这是把单核的“超前认领”量压到很小、避免负载倾斜的关键（§6.1）。
+- **反压** = 私有环填满（`PRIVATE_TASK_SLOT_NUM` 个槽）。私有环很小，所以单核任何时刻最多只
+  比“已就绪可执行”超前这么几个任务。
+- **即时回收槽**：每个共同 owner 在*自己*的子任务完成时释放*自己*的槽。没有全局环尾推进，
+  没有跨核的槽复位协调，因为环是私有的。
+- **前向进展**：环满且无就绪任务时自旋重扫，直到另一个核的完成标志解锁某个任务；一旦腾出
+  一个槽，该核就回到编排去竞争新任务。
+
+### 6.1 为什么“执行优先 + 小环”——乱序窗口与负载均衡
+
+**乱序（out-of-order, OoO）窗口 = 核数 × 私有环槽数。** 这是整个系统在任一时刻能“同时在飞”
+并允许乱序执行的（子）任务上限。它决定了无依赖的后续任务能否绕过排在前面、但尚未就绪的任务
+被尽早执行（避免 head-of-line blocking）。
+
+**旧设计（填满环再执行）为什么会负载倾斜。** `claim + build` 极快，而 `execute` 很慢。若每个核
+都“先把私有环填满再开始执行”，那么跑得最靠前的核会在极短时间内把**一连串连续的任务**全部
+`atomic_fetch_max` 抢进自己的环（把 `cursor` 一路推高），随后独自长时间串行执行这一串任务；
+其它核因 `cursor` 已被推高而**抢不到**这段连续 id → 严重负载不均衡。更糟的是 head-of-line：
+环里靠前但未就绪的任务会一直占着槽，挡住它后面其实已就绪、本可被别的核分担的任务。
+
+**两点改进。**
+
+1. **执行优先（本节伪代码）。** 每轮先腾空就绪任务、只认领一个新任务。核在执行一个长任务期间
+   **不推进认领**，这段时间里其它核会推进 `cursor` 认领后续任务 → 工作自然铺开。认领不再是
+   “抢满即止”的突发，而是“没就绪活干时才逐个超前”的受控行为。
+2. **保持私有环小（缩小 `PRIVATE_TASK_SLOT_NUM`）。** OoO 能力主要应由**核数**这一维度提供，
+   而不是把单核的环开大——开大只会让单核一次能独吞更长的连续任务串，放大倾斜。把环取较小值
+   （如 2–4）即可在保留足够乱序窗口（核数已经不小）的同时，把单核超前量压到最低。环大小应按
+   访存延迟 / kernel 时长实测调优，而非默认开大。
+
+> 一句话：乱序靠“多核 × 小环”，不靠“单核 × 大环”。执行优先确保快核在执行长任务时把后续认领
+> 让给其它核；小环确保即便要超前，超前量也很小。
+
+**实测泳道图。** 下图是 `benchmark_bgemm`（`FullCore24`，`block_dim=24` → 24 AIC + 48 AIV
+共 72 条 lane，240 个 GEMM(1C) + 240 个 ADD(1V)）在 a2a3sim 上的每核执行泳道：每条横轴是一个
+物理 lane（AIC / AIV0 / AIV1），每个色块是一次 incore 函数执行（蓝=GEMM、红=ADD）。可见执行优先
+策略把 GEMM 较均匀地铺满了 24 个 AIC，而非堆积在少数快核上——这正是 §6.1 论证的负载均衡效果。
+
+![fully_distributed_within_core 每核执行泳道（benchmark_bgemm FullCore24）](fully_distributed_within_core/swimlane_bgemm_fullcore.png)
+
+> 复现：`dist_engine` 内置一个环境变量门控的 Chrome-trace 导出器（中心化 L2 采集器不适用于本
+> runtime 的 AICPU 桩）。设 `PTO_DIST_SWIMLANE=<path.json>` 跑用例即生成 trace，再用
+> `python -m simpler_setup.tools.dist_swimlane_render <path.json> -o <out.png>` 渲染为上图；
+> 或把 JSON 直接拖入 [Perfetto](https://ui.perfetto.dev/) 交互查看。incore 函数名由 `scene_test`
+> 在捕获后从 CALLABLE spec 注入（叶子 `CoreCallable` 不携带名字），故图例显示 GEMM/ADD 而非 f0/f1。
+
+### 6.2 实测：编排/调度开销随核数的代价
+
+全分布式模式用"无中心调度器"换来的代价是：**编排被每个核完整重放（SPMD），且认领要在共享 cursor
+上原子竞争**。为了把这部分纯开销与 kernel 计算分离测量，`dist_engine` 提供一个环境变量门控
+`PTO_DIST_SKIP_EXEC=1`：置位后 `execute_slot` **跳过 incore kernel 调用**（每个子任务当 0 代价
+瞬时完成），但**保留全部 ownership/完成/frontier 簿记**，核循环照常终止。这样测得的片上编排墙钟
+就只反映 orchestration + claim race + scheduling。
+
+下表用 `benchmark_bgemm`（`matmul_add_task_num=480`，约 960 个任务）在 a2a3sim 上扫 `block_dim`
+（1 block = 1 AIC + 2 AIV），取多轮中位数。`device` 为片上编排墙钟（PTO2 profiling），是关注指标；
+`host` 含 Python/sim 启动等固定开销，仅作参照。复现：
+`python examples/a2a3/fully_distributed_within_core/runtime_overhead_test/test_runtime_overhead.py -p a2a3sim`。
+
+| blocks | cores | device 编排墙钟 (ms) | us/task | 相对 1 block |
+| -----: | ----: | -------------------: | ------: | -----------: |
+|      1 |     3 |                 3.93 |    4.09 |        1.00× |
+|      2 |     6 |                 4.71 |    4.91 |        1.20× |
+|     12 |    36 |                21.23 |   22.11 |        5.41× |
+|     24 |    72 |                42.87 |   44.65 |       10.92× |
+
+**结论。** 纯编排/调度墙钟**随核数近线性增长**（3→72 核约 11×）：核越多，重复重放的编排和 cursor
+竞争越多。少核时增量很小（2 块仅比 1 块高约 20%），随核数增大才陡升。这部分固定开销要靠**真实
+kernel 执行被多核并行摊薄**来回本——本实验故意跳过执行，所以只暴露开销本身。它也说明：私有环要小、
+执行优先（§6.1）等设计的价值，正是让有限的核尽快投入真实执行，而不是把时间耗在超前认领/竞争上。
+
+### 6.3 绑核（CPU 亲和）对测量噪声的影响
+
+仿真把每个 AICore/AICPU“核”实现为一个 host 线程，默认由 OS 在全部物理核（本机 320 核 / 8 个 NUMA
+节点，每节点 40 核）上自由调度。跨核迁移与跨 NUMA 访问会给 §6.2 的 `device` 墙钟带来明显抖动（单次运行间方差很
+大）。`test_runtime_overhead.py` 新增 `--bind` 开关，用 `sched_setaffinity` 在**进程级**绑核（后续所有
+sim 线程自动继承，无需外部 `numactl`，也避免 `--membind` 的内存压力）：
+
+* `--bind none`（默认）：不绑核；
+* `--bind node:<nodes>`：绑到指定 NUMA 节点的全部 CPU（如 `node:0,1`）；
+* `--bind cpu:<list>` 或裸 `<list>`：绑到显式 CPU 列表/区间（如 `cpu:0-119`）。
+
+> **绑核曾暴露的崩溃 bug（已修复）。** AICore kernel `.so` 每个 `run` 都 dlopen/dlclose 重载，而其
+> `pthread_once` 创建的 TLS key 在 dlclose 时不被 glibc 回收，逐 `run` 泄漏；约 200 个 `run` 后耗尽
+> `PTHREAD_KEYS_MAX`（1024），`pthread_key_create` 失败 → `sim_get_reg_base()` 返回 NULL → 在
+> `write_reg` 上空指针 SIGSEGV（全量 1→24 扫描在 `block≈23` 必崩）。修复：在
+> `src/{a2a3,a5}/platform/sim/aicore/kernel.cpp` 增加卸载析构 `__attribute__((destructor))`，于
+> dlclose 时 `pthread_key_delete` 全部 key，使每轮重载对 key 池**净零占用**；绑核全量 sweep 现可稳定
+> 跑完。
+
+**为何把评估限制在单 NUMA 核范围。** 本机拓扑为 **8 个 NUMA 节点 × 40 核 = 320 核**（无超线程），
+**跨 NUMA 访问代价显著**。仿真里每个 sim“核”是一个 host 线程，`cores = block_dim × 3`。当一次运行用到的
+核数超过单个节点的 40 核（即 `block_dim ≥ 14`，42 核起），AICore 工作集被迫横跨多个 NUMA 节点，**跨节点
+的 cursor 原子认领竞争 + 远端内存访问**会主导 `device` 墙钟：实测在 `block≈13→14` 出现明显台阶、且
+`block 14–24` 在本共享机上随其它租户的突发负载剧烈抖动（同一配置重测可差 2–3×）。这类数字是**平台 NUMA
+伪影**，并非引擎本身的编排复杂度。因此我们**只评估 AICore 核数落在单个 NUMA 节点内的 block 范围**
+（`cores = block_dim × 3 ≤ 40 ⟹ block_dim ∈ [1, 13]`），不再做更大范围扫描。
+
+**把 AICore 线程真正钉进同一个 NUMA 节点（线程级 1:1 绑核）。** 仅靠进程级 `--bind` 还不够：
+
+* **绑单个 40 核节点很脆弱。** sim 的**总线程占用**远大于 AICore 数（还含每次 spawn 的 50 个 AICPU
+  over-launch 线程、4 个存活 AICPU、采集与主线程），全挤进 40 核。空闲时 `--bind node:<单节点>` 尚能干净到
+  `block 12`，但 `block 13`（39 AICore ≈ 节点满）即超订、`device` 跳升约 2×（见
+  `build/sweep_singlenuma_node2_40cores.txt`）；更糟的是它对**外部负载极敏感**——因为该引擎用自旋式
+  cursor 认领竞争，一旦该节点被其它租户占用一部分核，持锁线程被抢占、其余线程空转自旋（lock-convoy
+  崩溃），`device` 会从 `block≈6` 起就抖升到 20–30 ms。两种情况都是 CPU 争抢伪影，非真实编排开销。
+* **只绑多个节点（进程级）也不够干净。** 进程绑到 3 节点时，OS 会把 AICore 线程**散布到多个 NUMA 节点**，
+  AICore 之间的 cursor 认领竞争又变成跨节点访问——这正是之前看到 1→13 增长偏大（~2.5×）的部分原因。
+
+正确做法是**线程级绑核**：新增 `--aicore-numa <node>`（置 `PTO_SIM_AICORE_NUMA_NODE`），让 device_runner
+在拉起 AICore 线程时把**第 i 个 AICore 线程用 `sched_setaffinity` 1:1 钉到该节点的第 i 个 CPU**，从而整个
+AICore 工作集严格留在同一个 NUMA 节点、每核独占一个物理 CPU；而 AICPU/主/采集等辅助线程**不钉核**，由
+进程级 `--bind`（给足若干空闲节点）承载，避免超订。要求 `cores = block_dim × 3 ≤ 单节点核数(40)`，即
+`block_dim ∈ [1, 13]`。
+
+> **绑核确认。** `PTO_SIM_AICORE_PIN_VERBOSE=1` 下逐线程打印落核情况；`block_dim=13`（39 个 AICore 线程，
+> `--aicore-numa 2`）实测 **39/39 线程全部运行在 node2 的 cpu 80–118**，零越界，确认 AICore 工作集完全位于
+> 单个 NUMA 内。
+
+下表为该单 NUMA 区间的完整统计（**当前引擎，已含 §6.4 的 O(N) per-core TensorMap 优化**；`tasks=480`，
+**25 轮中位数**；`--bind node:1,2,3` 承载辅助线程 + `--aicore-numa 2` 把全部 AICore 钉进 node2；归档
+`build/sweep_singlenuma_aicorepin_node2.txt`）：
+
+| blocks | cores | device 编排墙钟 (ms) | us/task | 相对 1 block |
+| -----: | ----: | -------------------: | ------: | -----------: |
+|      1 |     3 |                 2.09 |    2.17 |        1.00× |
+|      2 |     6 |                 2.22 |    2.31 |        1.06× |
+|      3 |     9 |                 2.39 |    2.49 |        1.15× |
+|      4 |    12 |                 2.54 |    2.64 |        1.22× |
+|      5 |    15 |                 2.80 |    2.91 |        1.34× |
+|      6 |    18 |                 3.00 |    3.13 |        1.44× |
+|      7 |    21 |                 3.05 |    3.18 |        1.46× |
+|      8 |    24 |                 3.24 |    3.38 |        1.56× |
+|      9 |    27 |                 3.39 |    3.53 |        1.62× |
+|     10 |    30 |                 3.73 |    3.88 |        1.79× |
+|     11 |    33 |                 3.84 |    4.00 |        1.84× |
+|     12 |    36 |                 4.20 |    4.38 |        2.02× |
+|     13 |    39 |                 4.25 |    4.42 |        2.04× |
+
+**结论。**
+
+* AICore 全部钉进单个 NUMA 节点后，单 NUMA 核范围（`block ≤ 13`，≤40 核）内编排/调度开销**平滑、单调、
+  且低**地随核数上升，1→13 仅约 **2.0×**（`us/task` 2.17→4.42）——SPMD 冗余重放 + cursor 认领竞争的真实
+  代价在节点内增长很温和。
+* **对比"只进程级绑核（AICore 被散布到 3 节点）"**：同样 25 轮、同样 block 区间，后者 1→13 约 2.5×、
+  `block 13` 的 `us/task` 5.47（见 `build/sweep_singlenuma_1_13_120cores.txt`）。线程级单 NUMA 绑核把
+  `block 13` 降到 4.42（**−19%**）且整体更平——多出来的那部分增长确属**跨 NUMA 散布**，而非引擎本身。
+* 低 `block`（≤4）相比优化前明显下降（如 1 块 `us/task` 3.36→2.17），印证 §6.4 的 O(N) 优化。
+* **越过单节点（`block ≥ 14`，>40 核）**必然跨 NUMA：台阶 + 强抖动，是平台 NUMA + 共享机外部负载的伪影，
+  本评估**不纳入**。
+* **共享机注意**：本机为多租户共享，即便绑核别的任务仍可能突发占用同批核；故采用 25 轮中位数并先用
+  `mpstat -P ALL 1 1` 选空闲节点。曾观察到全 8 节点 ~100% 占用时数值整体抬升数倍。
+
+归档：AICore 单 NUMA 线程级绑核 `build/sweep_singlenuma_aicorepin_node2.txt`；仅进程级绑核对照
+`build/sweep_singlenuma_1_13_120cores.txt`；单节点超订对照 `build/sweep_singlenuma_node2_40cores.txt`。
+（历史全 1–24 跨 NUMA 扫描 `build/sweep_1_24*.txt` 仅作平台伪影参照。）
+
+### 6.4 降低每任务编排开销：把 per-core TensorMap 从 O(N²) 降到 O(N)
+
+§6.2/§6.3 测的是开销随**核数**的变化。另一条正交的轴是开销随**任务数**的变化——它暴露了单核
+编排算法的复杂度。把 `block_dim=1`（3 核、无认领竞争）固定下来扫任务数，就能把 per-core 编排算法
+的成本从多核竞争噪声里隔离出来。
+
+**定位。** 每个核对每个任务都要维护一份"生产者表"（per-core duplicate TensorMap，§9）：fan-in
+解析要 `lookup` 输入区间的生产者，注册输出要 `insert`。最初的 `DistTensorMap` 是一个**扁平数组 +
+线性扫描**，且**从不回收**条目：
+
+```
+struct DistTensorMap { MapEntry entries[kMapCap]; int32_t count; };
+// lookup / insert 都是 for (i in 0..count) 线性比对
+```
+
+对 bgemm 这类"**单个扁平输出 buffer + 大量不相交 tile**"的负载，`count` 会随整个运行近线性增长
+（每个 tile 是不同的 `[lo,hi)`，精确匹配替换帮不上忙），于是每次 `lookup`/`insert` 都是 O(count)，
+全程 **O(N²)**。仅靠"按 buffer 基址哈希"也救不了——所有 tile 共享同一个基址，落进同一条链。
+
+**修复（对齐 `tensormap_and_ringbuffer` 的 `PTO2TensorMap` 方案）。** 改写 `DistTensorMap` 为该
+runtime 久经验证的结构：**按 buffer 基址哈希分桶 + 桶内双向链 + 按生产者任务的 entry 链 + 空闲链表
++ lazy invalidation + `cleanup_retired` 按任务精确回收**。决定性的一步是**回收**：
+
+> 依据 H 跨度契约（§9.5/§11.4），任务 N 的消费者 id ≤ N+H；因此 producer 早于 `N − H` 的条目
+> **不可能**再被任何未来任务作为 fan-in（其 GM 堆区也已在同一界限下被回收）。每次 submit 用确定性
+> 阈值 `alive_floor = N − H` 推进，沿**生产者任务链**精确释放刚离开 H 窗口的那一个任务的条目（绝不
+> 扫描整池）。这把每条链长从"全程任务数"压到"H 窗口内"，O(N²) → O(N·H) ≈ O(N)。
+
+阈值取自 N（确定性、各核一致），**不**取自 frontier（与时序相关），故每核的 map（含空闲链表与回收
+进度）演化完全一致，"每核副本一致"不变量得以保持。与参考实现一样，`insert` **总是挂新条目**到其
+生产者任务链（不做就地替换），`lookup` 返回区间重叠者中 producer **最大**（最新）的那个——语义上
+等价于原先的就地替换，但让 `cleanup_retired` 能按任务链 O(1) 回收。
+
+**附带优化：把认领门提前，让败者跳过赢家专属工作。** SPMD 下每个核都重放 submit，但一个任务只有
+约 1/3 的核会赢得认领。原先所有核都先做了 fan-in `lookup` 和 `built[]` 组装（tc × `sizeof(Tensor)`
+拷贝）才去认领。把 **anchor 类型判定 + cursor 认领提前到 map 操作之前**，则：
+* **fan-in `lookup` 改为赢家专属**——败者从不消费 fanin，直接跳过 input 查找（output `insert` 仍
+  无条件执行，保持各核 map 一致）；
+* **`built[]` 组装移到认领成功之后**——失败的核省掉无用拷贝。
+
+这正是"负载随核数摊销"能显现的关键：核越多，每个核赢得的任务越少、跳过的 fan-in 查找越多。实测
+`dev vs 1blk`（tasks=4000）从改前的 1.7×/2.2×（2/4 block）压平到约 **0.7–1.1×**（多核档不再随核数爬升，
+甚至偶尔低于 1 block）。注意它**动不了**每核必做的"地板"——堆物化 + output `insert`（每核全量副本的
+固有代价），故 1-block 绝对值基本不变。
+
+**A/B 实测（`block_dim=1`，跳过执行，7 轮中位数）。** 隔离单核编排算法成本，扫任务数：
+
+| matmul_add_task_num | 旧 device (ms) | 旧 us/task | 新 device (ms) | 新 us/task | 加速 |
+| ------------------: | -------------: | ---------: | -------------: | ---------: | ---: |
+|                 480 |          3.10  |      3.23  |          2.95  |      3.08  | 1.05× |
+|                1920 |         13.28  |      3.46  |          5.42  |      1.41  | 2.45× |
+|                3840 |         34.76  |      4.53  |          4.01  |      0.52  | **8.66×** |
+
+（新列为"哈希+回收"与"`built[]` 后置"两项优化叠加后的最终值。）
+
+旧实现 device 随任务数**超线性**（任务 ×8 → device ×11.2，`us/task` 3.23↑4.53），正是线性 map 不
+回收的 O(N²) 尾巴；新实现**亚线性**（任务 ×8 → device 仅 ×1.3，`us/task` 反而 3.08↓0.52），即 O(N)。
+在 §6.2 关注的 480 任务规模，新版与旧版持平（略优）；规模越大优势越显著。
+
+**结论。** per-core 编排里真正随规模恶化的是"无回收的线性生产者表"。沿用 `tensormap_and_ringbuffer`
+的哈希 + 按任务回收方案、并用确定性的 `N − H` 作回收阈值，即可把单核编排从 O(N²) 降到 O(N)，同时保持
+SPMD 各核 map 完全一致与全部 golden 正确性（bgemm / paged_attention / paged_attention_ringbuffer /
+mix_coown 等用例校验通过）。复现：
+`python examples/a2a3/fully_distributed_within_core/runtime_overhead_test/test_runtime_overhead.py -p a2a3sim --blocks 1 --tasks 3840`。
+
+**附带优化：把认领门提前，让败者跳过 fan-in 查找。** 见 §6.4 上文同名段落——把 anchor 类型判定 +
+cursor 认领提前到 map 操作之前，fan-in `lookup` 改为赢家专属，`built[]` 组装移到认领之后。这是"负载
+随核数摊销"能显现的关键优化，效果见下节 §6.5。
+
+### 6.5 核数 scale up 时 us/task 为何回升：cursor CAS 等共享原子的竞争
+
+**测试条件（截至本节最新）。** workload=`benchmark_bgemm`，`PTO_DIST_SKIP_EXEC=1`（跳过 incore
+执行，只测编排/调度墙钟），`device` 为片上编排墙钟（PTO2 profiling），多轮取中位数。`--blocks` 默认
+随平台：macOS `1-4`、Linux `1-13`。运行用项目自带 `.venv` 解释器（含编译好的 `_task_interface` 绑定）。
+当前代码含三项优化：哈希 + H 回收的 TensorMap（§6.4）、`built[]` 后置、**winner-only fan-in**。复现：
+`./.venv/bin/python examples/a2a3/fully_distributed_within_core/runtime_overhead_test/test_runtime_overhead.py -p a2a3sim --tasks 4000`。
+
+**结果 1：单核（block=1）随任务数仍是 O(N)。** 固定 1 block 扫 batch（`--tasks`，总任务约 2×）：
+
+| matmul_add_task_num | ~tasks | device (ms) | us/task |
+| ------------------: | -----: | ----------: | ------: |
+|                1000 |  ~2000 |        2.08 |    1.04 |
+|                4000 |  ~8000 |        3.99 |    0.50 |
+
+任务量 ×4、device 仅约 ×2、`us/task` 反而下降 → per-core 编排算法是 O(N)（§6.4 的 TensorMap 改造之效）。
+
+**结果 2：多核（Mac，tasks=4000，blocks 1–4）device 随核数回升。**
+
+| blocks | cores | device (ms) | us/task | dev vs 1blk |
+| -----: | ----: | ----------: | ------: | ----------: |
+|      1 |     3 |        3.99 |    0.50 |       1.00× |
+|      2 |     6 |        3.22 |    0.40 |       0.81× |
+|      3 |     9 |        4.46 |    0.56 |       1.12× |
+|      4 |    12 |        9.14 |    1.14 |       2.29× |
+
+winner-only fan-in 使中低核数出现摊销（2 block 一度低于 1 block，约 0.8×；多轮中 `dev vs 1blk` 多在
+0.7–1.3× 间）；但核数继续增大时 `device` 仍会**回升**（如上 4 block；Mac 上 12 线程超订使该档方差很大，
+不同轮在 1.1×–2.3× 间跳）。下面分析这部分回升的算法性根因。
+
+**根因：认领走的是对单个共享 cursor 的 CAS 循环 fetch_max。**
+
+```text
+bool claim(cursor, N):
+    c = cursor.load()
+    loop:
+      if N <= c: return false          // 落后核只 load、不写（便宜）
+      if cursor.CAS(c -> N): return true // 争胜:在同一条 cache line 上 CAS
+```
+
+认领**按类型共享同一个 cursor**：所有 AIC 核抢 `cube_cursor`、所有 AIV 核抢 `vector_cursor`。于是：
+
+* **单一热点 cache line。** `block_dim=B` 时，cube 任务由 `B` 个 AIC 核、vector 任务由 `2B` 个 AIV 核
+  对同一原子量每任务 load+CAS。该 line 在竞争核间反复转移独占权（MESI），竞争核越多 → 单次 CAS 延迟
+  越高、失败重试越多、一致性流量越大。`device` 取最慢核墙钟，最慢核要排队等这条线 → device 随 B 回升。
+  （AIV 数是 AIC 的 2 倍，故 vector 认领竞争更重——bgemm 的 ADD(1V) 即走此路。）
+* **skip-exec 放大竞争。** 跳过执行后每任务 0 代价，各核近**锁步**推进 → 对任意任务 N 几乎同时争抢 cursor，
+  达最坏竞争。真实执行时 kernel 耗时让各核去同步、认领被自然错开，竞争反而小。**故本测试是 cursor 竞争
+  的悲观上界。**
+
+**其它随核数增长的全局原子（次要但同向）：**
+
+| 原子 | 访问模式 | 随核数扩展 |
+| --- | --- | --- |
+| `cube/vector_cursor` CAS（认领） | 每核每任务，单一热点线 | **强（主因）** |
+| `frontier` CAS（`advance_frontier`） | 每次完成扩展前缀时 CAS 单一 `frontier` | 中–强 |
+| `flags[N]` 完成标志（`uint8_t`，64 个/行） | 相邻任务标志**伪共享** | 中 |
+| `block.won`（state/remaining/drained） | **每 block 局部，仅 3 核内** | 否（不随总核数涨） |
+
+此外**仿真特有**：每个核是 host 线程，核多→线程多→在物理核上**超订** + 跨 NUMA，放大 device 抖动
+（非算法因素，Mac 上尤甚；干净曲线应在 Linux 用 §6.3 的绑核测）。
+
+**小结与缓解方向。** us/task 在核数增大时回升，主因是**全局单热点 cursor 的 CAS 竞争**（其次为 frontier
+CAS 与 flag 伪共享），而非每核的 map 维护（那块已 O(N) 且被 winner-only fan-in 进一步减负）。若要把这条
+曲线进一步压平，可考虑去掉"全局单热点"：
+
+* **批量认领（claim stride）**：一次 CAS 抢一段连续 id，把 N 次 CAS 摊销成 N/stride 次；
+* **分片认领（cursor sharding）**：把 `cube/vector_cursor` 各扩成 `G` 个，按 `task_index % G` 选 cursor，
+  把单热点 CAS 摊到 `G` 条 cache line（详见 §6.6——认领语义与单一 cursor 等价，不引入偏差/不均衡）；
+* `flags` 按 cache line 对齐分散以消伪共享。
+
+这些都属于"认领/完成同步"层的可选优化，与 §6.4 的 map 改造正交。认领最初用最简单的全局 cursor；**现已
+落地 §6.6 的 cursor 分片（`G=4`）+ winner-only fan-in**，实测见 §6.7。
+
+### 6.6 cursor 分片（sharding）：按 `task_index % G` 切 cursor，认领效果与单一 cursor 等价
+
+§6.5 把"分片认领"列为压平 cursor CAS 竞争的方向之一。本节给出**具体方案**并论证一个重要结论：**只要按
+`task_index` 给 cursor 变量分片、而绝不对 worker 分组，分片在"认领任务"上的语义与单一全局 cursor 完全一致
+——不产生额外进度偏差、不加剧 worker 间负载不均衡，仅把对 cursor 的访存竞争摊到 `G` 条 cache line 上。**
+
+**方案。** 把今天的两个全局 cursor（`cube_cursor` / `vector_cursor`，§11.1）各扩成 `G` 个：
+`cube_cursor[G]` / `vector_cursor[G]`。某任务 id `N` 做认领时，访问 `vector_cursor[N % G]`（cube 任务同理用
+`cube_cursor[N % G]`），即 **shard = `N % G`**。`claim` 仍是同一套 CAS-loop fetch_max（§11.1）。关键在于：
+**shard 只由 task_index `N` 决定**，而 `N` 在每个核上完全一致（各核 replay 同一条 submit 流），所以**任一核
+认领 `N` 时算出的 shard 相同、访问的是同一个 `cursor[N%G]`**——**没有"哪些核只能碰哪个 shard"的核分组**。
+
+**为什么认领效果与单一 cursor 完全一致。**
+
+* **仍是"每任务恰好一个 owner、不漏不重"。** `vector_cursor[g]` 只承接 `N ≡ g (mod G)` 的那串 id
+  （`g, g+G, g+2G, …`），它们被每个核**按序**处理 → 在该 residue 子序列上仍是单调 fetch_max，首个把它从
+  `<N` 推到 `N` 的核独占 `N`。这与单 cursor 在全序列上的不变式**逐字相同**，只是把"一条单调序列"拆成 `G`
+  条交织的单调子序列，每条仍单调、连续、无跳过。
+* **任一核都能赢任一任务（工作窃取原样保留）。** shard 由 `N`、而非核身份决定，每个核处理到 `N` 就去抢
+  `cursor[N%G]`，**没有核被排除在任何任务之外**。于是"谁空谁抢下一个 id"的窃取式负载均衡**完全保留**，
+  不会出现某组核闲、另一组过载。
+* **不产生额外进度偏差。** 不存在"各自独立推进的分片"：每个核都走完整条流，对连续的 `N, N+1, N+2, …`
+  轮流落在 `cursor[0..G-1]` 上，故 `G` 个 cursor 始终贴着**同一条认领前沿**、彼此相差不超过约 `Δ+G`
+  （`Δ` 为单核 run-ahead 上界）。整体推进仍由**同一个全局完成前沿 `F` + 同一个私有环 run-ahead 上限**封顶
+  （与是否分片无关），所以偏差与单 cursor 时**一模一样**。
+* **确定性不变。** 认领只决定"谁执行"，不改变 id、不改变 per-core map 的 replay/insert 顺序，golden 结果不变。
+
+**结论（直接回答"是否等价"）。** **是。** 按 `N % G` 给 cursor 变量分片，在**认领语义、负载分布、推进/偏差、
+确定性**四个方面与单一全局 cursor 等价；**唯一区别**是把对一条 cursor cache line 的 CAS 竞争分摊到 `G` 条
+独立 line，降低访存争用。因此 cursor sharding **不会**带来更大的进度偏差，也**不会**加剧 worker 间负载不
+均衡——它**只**降低了竞争这个 cursor 的访存代价。
+
+**一处要点：收益何时兑现，以及 `G` 怎么取。** 对**同一个** id `N`，认领前沿上的核仍然撞同一个
+`cursor[N%G]`；分摊之所以有效，是因为各核在任一时刻分布在一段**连续 id 窗口**上（核 A 在 `N`、核 B 在
+`N+1` …，窗口宽约 run-ahead `Δ`），这些连续 id 落在不同的 `cursor[N%G]` 上。**只要在飞 id 窗口 ≥ G**，
+CAS 写竞争就被摊到 ≈ `G` 条 line。故 `G` 取到"每条 line 的竞争核数不再是瓶颈"即可（量级上
+`G ~ 同类型核数 / 期望每线核数`），不必更大；`G=1` 即退回今天的实现，零行为变化。
+
+**务必区分：分片 cursor 变量 ≠ 给 worker 分组。** 上面的等价性**只**在"shard 由 `task_index` 决定、所有核
+对所有任务一律可竞争"时成立。若改成另一种做法——**按核/按 block 把 id 空间静态切给不相交的核组、各组只
+认领自己那片 id**——那就是"分 worker"，会引入**独立分片进度**（慢分片顶住全局完成前沿 `F`、拖慢回收）与
+**工作窃取丢失**（某组核闲、另一组过载的负载不均衡）。那种核分组才需要额外的"显式认领窗口 + 跨分片窃取
+兜底"等机制来补救，得不偿失。**本方案刻意避免它**：我们分片的是 **cursor 变量（按 `task_index % G`）**，
+不是 worker——这正是它能与单一 cursor 等价、却又降竞争的原因。
+
+### 6.7 cursor 分片实测：G=4 已落地；单 NUMA 区间收益与最优 G
+
+§6.6 的方案已落地（`kCursorShards` 默认 **G=4**，每个子 cursor 独占一条 64B cache line；并配合 winner-only
+fan-in，§6.4.1）。本节给出在**单 NUMA 区间**的实测结论。
+
+**测量口径。** skip-exec（仅编排/调度），`~10000 tasks`（`--tasks 5000`），`rounds=15` 取中位数，AICore 线程级
+钉进 node2（`--aicore-numa 2`，§6.3），辅助线程 `--bind node:1,2,3`，**空闲机器**上取干净单调曲线（共享机
+偶发外部负载会污染后段 block，已剔除被污染的运行）。
+
+**(1) 分片前（单一全局 cursor）→ 分片后（G=4）。**
+
+| blocks | cores | 单 cursor us/task | G=4 us/task | 改善 |
+|--------|-------|-------------------|-------------|------|
+| 1  | 3  | 1.05 | 0.99 | −6% |
+| 4  | 12 | 1.36 | 1.29 | −5% |
+| 8  | 24 | 1.92 | 1.68 | −12% |
+| 10 | 30 | 1.97 | 1.83 | −7% |
+| 12 | 36 | 2.23 | 2.10 | −6% |
+| 13 | 39 | 2.33 | 2.20 | −6% |
+
+全程 `us/task` 一致小幅下降，**中高 block 段（8–13）改善约 6–12%**，曲线仍干净单调。方向正确——把单热点
+cursor 的 CAS 竞争摊到 4 条 cache line 确实压低了 §6.5 所述的访存争用。
+
+**(2) G=4 vs G=8：单 NUMA 内 G=4 是甜点。** 把 `G` 加倍到 8 重测（同口径）：
+
+| blocks | cores | G=4 us/task | G=8 us/task | 差异 |
+|--------|-------|-------------|-------------|------|
+| 1  | 3  | 0.99 | 1.01 | +2% |
+| 7  | 21 | 1.65 | 1.75 | +6% |
+| 8  | 24 | 1.68 | 1.81 | +8% |
+| 9  | 27 | 1.74 | 1.94 | +11% |
+| 10 | 30 | 1.83 | 2.05 | +12% |
+| 11 | 33 | 2.00 | 2.26 | +13% |
+| 13 | 39 | 2.20 | 2.29 | +4% |
+
+**G=8 不升反降**（中高 block 段慢 8–13%）。原因（单 NUMA、≤39 核区间）：
+
+1. **G=4 已摊够竞争。** `block=13` 也才 13 个 AIC 核 / 26 个 AIV 核；G=4 下每 shard 平均仅 ~3 个同类型核竞争，
+   已逼近"每条 line 竞争核数不再是瓶颈"（§6.6 对 `G` 的取值分析），再加倍几乎没有进一步降竞争的空间。
+2. **G=8 反而增大 cursor 的 cache footprint**（每类型 8×64B=512B，更多 cache line 同时在核间弹跳），总相干
+   流量与局部性变差，得不偿失。
+3. 分片越多、单核能赢的任务越稀疏（只拿 `≡ s (mod G)` 的 id），动态窃取式负载均衡的交织略变差。
+
+**结论。** 在评估约束的**单 NUMA、核数 ≤ 一个节点（≤39 核）**区间内，**`G=4` 为最优**，故保持默认 `G=4`。
+更大的 `G` 要等到**跨 NUMA / 更高核数**(`block ≥ 14`)、单条 line 上竞争核数显著上升时才可能回正——但那已属
+跨 NUMA 区间（§6.3 说明其数字是平台伪影，不在本评估范围）。
+
+归档：G=4 干净扫描 `build/sweep_singlenuma_shardG4_node2.txt`；G=8 对照 `build/sweep_singlenuma_shardG8_node2.txt`。
+
+## 7. 终止
+
+一个核在其编排不再产生任务**且**私有环为空（所有拥有的任务都已执行）时结束。对 follower
+（AIV）还有一条额外条件：它必须等到**其 block 的 anchor 编排也结束**且 `block.won` 中再无
+针对它的待抽取投递——否则可能有尚未推送的多核子任务漏执行。这就是 §3.1 提到的**尾部空转**：
+当某 block 的 anchor 严重落后时，它的 follower 做完自身其余全部工作后，会在终止前空转等待
+anchor 推送最后的多核子任务。这不是 per-task 串行阻塞，只发生在收尾，且 cube 领先时不出现。
+
+所有核都结束时达到全局完成；最终的图输出位置被发布以供 host 拷回（见 §8 的
+`graph_output_ptr`）。一个全局“所有核完成”屏障替代了旧的单一 `orchestrator_done` 标志。
+
+---
+
+# 第二部分 — 数据结构与共享特性
+
+## 8. 共享模型
+
+每个结构被归为以下之一：
+
+| 类别 | 含义 |
+| ---- | ---- |
+| **全局共享** | 唯一权威实例；多个核读/写；需要显式访问机制 |
+| **block-共享** | 仅在一个固定 block（1 AIC + 2 AIV）的核之间共享；用于 MIX 共同所有权（§3.1） |
+| **每核私有** | 由单个核拥有；无跨核可见性 |
+| **每核复制** | 每核复制一份；内容相同、各自独立重建（或只读副本） |
+
+### 8.1 新引入的结构
+
+| 结构 | 类别 | 作用 | 访问机制 |
+| ---- | ---- | ---- | -------- |
+| `cursor[T]`：`cube_cursor` / `vector_cursor` | **全局共享** | 每个类型的 claim 高水位线；到达 `N` 时 `old < N` 即胜出并拥有该任务（§2、§3.1） | 单条 `atomic_fetch_max(cursor[T], N)`（无则 CAS 回路），acq-rel；无跳过性证明见 §11.1 |
+| `task_completed_flag` 连续完成前沿 `F` / 回收前沿 `R` | **全局共享** | `F` = 全已完成前缀；`R = F − H` 决定堆/标志环回收（§9.5、§11.3、§11.4） | `F` 协作式 CAS 推进；`R` 派生；单调 |
+| `local_current_task_index` | **每核私有** | 编排进度游标；每次 submit `++` | 普通标量 |
+| **私有任务环**（`PRIVATE_TASK_SLOT_NUM`，默认小，如 4） | **每核私有** | 保存已拥有的（子）任务：descriptor + payload + 本地状态 + fan-in producer id；故意取小（OoO 窗口 = 核数 × 槽数，§6.1） | 无（单一 owner，无锁） |
+| `task_completed_flag` 环 | **全局共享** | 每任务 id 一个一次性置位布尔；唯一共享的 per-task 状态 | 最后一个（子）任务 owner 做 release 存储；消费者做 acquire 加载（轮询） |
+| **`block.won[N]` —— 以 id 为键的子任务投递表** | **block-共享** | anchor → follower 的**推送**通道，以任务 id 为键：`{active_mask M, 各激活槽 kernels/args, 已解析 fan-in, 剩余计数}`。anchor 胜出时把其余激活槽子任务投递进来；follower **异步抽取**属于自己槽的项（不阻塞、不按走位等待）。承载每任务剩余计数，互不串扰（§3.1）。填满时 anchor 暂缓认领新多核任务（反压） | anchor 插入（release）；follower 抽取（acquire）；`remaining` 原子递减；最后一个子任务完成时释放条目 |
+
+### 8.2 TensorMap
+
+| 结构 | 类别 | 作用 | 访问机制 |
+| ---- | ---- | ---- | -------- |
+| `PTO2TensorMap` / `PTO2TensorMapEntry` | **每核复制（全量）** | tensor 区域 → producer 任务 id；在每个核上相同地构建（§4） | 无跨核锁；通过重放确定性 submit 流重建。有效性由 `task_completed_flag` 环开窗 |
+
+### 8.3 全局共享，超出 per-task 状态之外
+
+| 结构 | 类别 | 作用 | 访问机制 |
+| ---- | ---- | ---- | -------- |
+| GM 输出堆（打包的输出缓冲） | **全局共享（物理）** | 任务输出/中间结果的后备存储，可被任意核作为下游输入读取 | 一块全局物理区域；分配记账（堆顶、scope arena 基址）是**每核复制、确定性**的（§9），写入由 owner 完成。完整策略见 §9 |
+| `heap_top` / scope arena 基址栈 | **每核复制（确定性，非全局）** | 在确定性 submit 重放中无条件推进，使任务 N 的输出地址成为 id 的纯函数（§9） | 无原子、无跨核通信；与 TensorMap 同理（§4） |
+| `heap_reclaim_frontier`（全局回收水位线） | **全局共享** | 全局最旧“仍可能被读”的任务 id；据此在 id 顺序上回收堆（§9） | 由完成标志环 + 各核进度最小值推导；单调 |
+| `func_id_to_addr_`（kernel id → GM 地址） | **全局共享，只读** | 把 `kernel_id` 解析为要调用的 incore 函数 | init 时一次性设置，之后只读 |
+| `graph_output_ptr` / `graph_output_size` | **全局共享** | 供 host 拷回的最终输出位置 | 产出核做原子发布 |
+| 全局错误字（原 `orch_error_code`） | **全局共享** | 任意核的致命错误 → 所有核 + host | 原子；首个写者胜出 |
+| “所有核完成”屏障（原 `orchestrator_done`） | **全局共享** | 全局终止检测（§7） | 原子计数器 / 屏障 |
+
+### 8.4 每核私有的编排状态
+
+| 结构 | 类别 | 作用 | 访问机制 |
+| ---- | ---- | ---- | -------- |
+| Scope 栈（`scope_stack_top` + 各层 arena 基址） | **每核复制（确定性）** | `PTO2_SCOPE` 生命周期跟踪；同时界定 GM 输出堆的 arena 栈（§9）。各核结构相同、进度不同 | 无锁；由确定性重放重建。注意：原 `scope_tasks[]`/`scope_begins[]` 用于 fanout 引用记账，新模型已不需要（§9、§10） |
+| Fan-in producer-id 列表（每个环槽一份） | **每核私有** | 构建时解析出的 producer 任务 id，执行时轮询 | 无 |
+| 本地致命标志 | **每核私有** | 快路径致命错误；升级到全局错误字 | 本地标志 + 原子发布 |
+| 核数常量（`total_cluster_count`、`total_aiv_count`） | **每核复制（只读）** | 资格 / 合理性检查 | init 时一次性设置 |
+
+## 9. 动态内存管理（全局输出堆）
+
+任务的输出/中间缓冲分配在一块 GM 堆上。由于**一个核产出的 output 可能被另一个核作为输入读取**，
+这块堆必须是**全局可寻址**的。本节给出分布式 runtime 下的内存管理策略与数据结构，并说明它相对
+当前 AICPU 模型的“stack of ring + scope”实现需要如何更新。
+
+### 9.1 当前（AICPU 集中式）模型回顾
+
+- **统一分配器 `PTO2TaskAllocator`**：把**任务槽环**与**堆环（heap ring）**合并分配。单一
+  orchestrator 单线程推进，用普通 store 写 `heap_top`（bump），无需 CAS。
+- **回收**：调度器把“最旧已 CONSUMED 任务”推进 `last_task_alive`；分配器据该任务的
+  `packed_buffer_end` 反推 `heap_tail`，环形回收（分配从 `top` bump，到尾部则在 `tail` 足够时
+  绕回，缓冲不跨越绕回边界）。
+- **stack of ring**：按 scope 深度复制成 `PTO2_MAX_RING_DEPTH`(=4) 套 {TaskRing, HeapRing,
+  DepPool}，使内层 scope 可独立于外层回收。
+- **scope（`PTO2_SCOPE`）**：用 `scope_tasks[]`/`scope_begins[]` 记录本 scope 的任务；每个任务
+  持有一个 +1 的 fanout 引用，`scope_end` 才释放——从而保证输出缓冲的生命周期 =（真实消费者
+  全部完成）**且**（scope_end）。`TaskOutputTensors` 的引用只在其 `PTO2_SCOPE` 内有效。
+
+### 9.2 哪些前提失效、需要更新
+
+新模型（§2–§7）取消了集中 orchestrator 与 scheduler，因此上面多数机制的前提不再成立：
+
+| 旧机制 | 在新模型中的处置 |
+| ------ | ---------------- |
+| 单 orchestrator 普通-store bump | **失效**：现在每个核都为自己拥有的任务分配输出。多写者下 `heap_top` 不能再用普通 store。 |
+| `last_task_alive`/CONSUMED 驱动回收 | **失效**：无 scheduler、无 CONSUMED 状态。回收改由全局完成前沿（§9.5）驱动。 |
+| 每 scope 深度的 TaskRing / DepPool / FaninPool | **移除**（§10）：任务槽改为每核私有环（§5），无依赖列表。 |
+| fanout 引用 + scope_end 释放 | **失效**：无 fanout/refcount。生命周期改由“窗口/前沿 + scope arena 折叠”界定（§9.4、§9.5）。 |
+| “stack of ring” | **收敛**为“**每核私有任务环**（§5） + **scope arena 栈**（§9.4）”，后者只管 GM 输出堆。 |
+
+结论：**stack-ring 需要更新**——任务环部分整体移除，堆部分保留但分配方式与回收方式都要改；
+**scope 需要保留但语义简化**（不再做 fanout 引用记账，改为 arena 栈 + 确定性重放）。
+
+### 9.3 分配：确定性、每核复制的布局（无原子、无通信）
+
+核心思想与 §4 的“每核全量复制 TensorMap”一致：**因为 submit 序列与每个任务的输出大小在各核上
+完全确定且相同，输出缓冲的布局也可以被每个核确定性地复算。**
+
+- 每个核在确定性 submit 重放中，对**每一个**任务（无论自己是否拥有——胜者、败者、follower 一视同仁）
+  **无条件**推进一份**每核复制**的堆顶 `heap_top`。任务 `N` 的输出偏移 = 其所在 arena 基址 +
+  该 arena 内 `N` 之前所有任务输出大小的前缀和。
+- 因此 `addr(N)` 是 submit 序列（及确定性大小）的**纯函数**：每个核为任务 `N` 算出**完全相同**的
+  地址。owner 负责写数据；任何核都能**不经通信**算出任意任务的输出地址。
+
+这取代了旧的“单 orchestrator bump”（多核下不可行），也**优于全局原子 bump**：原子 `fetch_add`
+会让地址依赖跨核的 bump 顺序而**非确定**，消费者便无法自行算出 producer 地址，必须额外发布地址 +
+读地址，引入跨核通信。确定性复制方案两者皆免。
+
+> **TensorMap 与地址的关系。** TensorMap 把 tensor 区域映射到 producer 任务 id（§4）。消费者拿到
+> producer id 后，用上面同一套确定性布局即可算出其输出地址（或在 TensorMap 条目里直接缓存这个
+> 确定性地址，因为它在每个核上都相同）。无需 producer 主动发布地址。
+
+### 9.4 Scope = 确定性复制的 arena 栈
+
+`PTO2_SCOPE` 在新模型里仍然是确定性编排程序的一部分（每个核执行相同的嵌套结构），因此 scope 栈
+是**每核复制且各核相同**的（与 TensorMap 同理）。它现在的职责是界定 GM 输出堆的 **arena 栈**：
+
+- **scope begin**：把当前 `heap_top` 记为新 arena 的基址，压栈（这是旧“stack of ring”里
+  per-depth 独立回收的分布式对应物）。
+- scope 内任务：在该 arena 内确定性 bump 分配（§9.3）。
+- **scope end**：把堆顶折叠回该 arena 基址，**一次性回收**该 scope 内所有“不外逃”的输出（LIFO
+  栈式回收，干净且 O(1)）。**外逃输出**（被该 scope 之外的任务消费的 tensor）必须分配在/提升到
+  **父 arena**，以便在折叠后存活。
+- 对**长 scope**（任务很多、不能等到 scope_end 才回收），在 arena 内部用 §9.5 的窗口/前沿机制做
+  环形回收，先行回收已不再被读的缓冲。
+
+`TaskOutputTensors` 的**单 scope 有效**规则保持不变：它返回的引用指向 owner 私有环槽中的 tensor
+存储，不得逃出其 `PTO2_SCOPE`；跨 scope 的数据流一律通过 TensorMap 按 id 查 producer + 上述确定性
+地址完成，而非通过 `TaskOutputTensors` 句柄。
+
+### 9.5 回收：窗口/前沿，取代 `last_task_alive`/CONSUMED
+
+由于布局在 id 顺序上确定地 bump，回收也自然按 id 顺序进行（任务 `N` 的缓冲位于 `N+1` 之前）。
+难点在于判断“`N` 的缓冲何时不再被读”。新模型用**全局完成前沿**而非 fanout 精确计数：
+
+- 维护一个**全局回收水位线** `heap_reclaim_frontier`，由 `task_completed_flag` 环加上**各核进度
+  最小值**（最慢的核/最旧未完成任务）推导。它表示“所有 id ≤ 该值的任务都已完成且其消费者也已完成”。
+- 给定**有界依赖跨度** `H`（保证任务 `N` 的所有消费者 id ≤ `N + H`），当全局完成前沿越过 `F` 时，
+  所有 id ≤ `F − H` 的输出可安全回收——把堆尾推进，腾出位置给后续（确定性布局中绕回到该位置的）
+  更晚任务。
+- 这与 §11 的 “`task_completed_flag` 环开窗”使用**同一个窗口**：该窗口同时裁剪复制的 TensorMap
+  与 GM 堆。
+- **scope_end** 对“不外逃”输出提供额外的、更早的粗粒度回收边界（§9.4）。
+- **反压**：堆（或当前 arena）满时，想为新拥有任务分配的核**暂缓认领**并自旋等待前沿推进——与
+  私有环填满的反压（§6）同一性质，方向一致（不让快核无限超前于回收）。
+
+> **正确性要点。** 一个缓冲只有在其**全部消费者执行完毕**后才能回收。窗口法用有界跨度 `H` +
+> 全局完成前沿保证这一点；若某图的依赖跨度可能超过 `H`，必须把 `H`/堆容量调大，否则属配置错误
+> （类比旧模型的 heap/window 死锁诊断）。精确的“按 tensor 最后消费者”回收（利用 TensorMap 中
+> 同一区域被新 producer 覆盖这一确定性事件）是更省内存的改进方向，列入 §11。
+
+### 9.6 数据结构小结
+
+| 结构 | 类别 | 作用 |
+| ---- | ---- | ---- |
+| GM 输出堆（物理区域） | **全局共享（物理）** | 唯一一块全局可寻址的输出后备存储 |
+| `heap_top` | **每核复制（确定性）** | 确定性 bump 堆顶；每核相同，无原子 |
+| scope arena 基址栈 + `scope_stack_top` | **每核复制（确定性）** | scope→arena 映射；scope_end 折叠回收 |
+| `heap_reclaim_frontier` | **全局共享** | 回收水位线，由完成前沿推导 |
+| `graph_output_ptr` / `graph_output_size` | **全局共享** | 最终图输出位置，供 host 拷回 |
+
+被移除：`PTO2TaskAllocator` 的任务环部分、`last_task_alive`/`heap_tail`(基于 CONSUMED)、per-depth
+`DepListPool`/`FaninPool`、`scope_tasks[]`/`scope_begins[]` 的 fanout 记账（§10）。
+
+## 10. 被移除的结构（相对 AICPU 的 `tensormap_and_ringbuffer`）
+
+统一的 worker-scheduler 模型删除了整个子系统：
+
+| 被移除 | 为什么消失 |
+| ------ | ---------- |
+| `PTO2SchedulerState`、`RingSchedState` | 无调度器实体——每个核调度自己的环 |
+| `PTO2ReadyQueue`、`dummy_ready_queue`、`early_dispatch_queue` | owner 执行自己的就绪任务；无分派队列 |
+| `PTO2SpscQueue` + `WiringState` | 无独立连线权威；无 fanout 可连 |
+| `fanout_lock`、`fanout_head`、`PTO2DepListPool`、`PTO2FaninPool` 溢出 | 无 fanout 列表——依赖经标志环拉取 |
+| `fanin_refcount`、`fanout_refcount`、`completed_subtasks` | 被完成标志轮询替代 |
+| `Handshake` 门铃、`Runtime::workers[]`、`AICoreCompletionMailbox` | 无调度器→worker 分派握手 |
+| SM 中的全局 `PTO2TaskDescriptor` / `PTO2TaskPayload` / `PTO2TaskSlotState` 环 | 被每核私有任务环替代 |
+| `current_task_index`（环头）/ `last_task_alive`（环尾）流控 | 被 claim 计数器 + 每核环空槽替代 |
+| `task_state`（PENDING/COMPLETED/CONSUMED）、每线程 `sched_error_*` | 被单一全局 `task_completed_flag` 与单一错误字替代 |
+| `PTO2TaskAllocator` 的**任务环**部分、`heap_tail`(基于 CONSUMED 反推) | 堆分配改为每核复制的确定性 bump；回收改为全局完成前沿（§9） |
+| per-depth “stack of ring” 的 TaskRing | 收敛为每核私有环（§5）+ scope arena 栈（§9）；堆 arena 仍按 scope 分层 |
+| `scope_tasks[]` / `scope_begins[]` 的 fanout 引用记账 | scope 不再持有 +1 fanout 引用；生命周期由窗口/前沿 + arena 折叠界定（§9） |
+
+编排 API 表面（`PTO2RuntimeOps`、`rt_submit_*`）**保留**；只有 `submit_task` 背后的实现改变
+（认领 → 无条件 TensorMap 更新 → 有条件的私有环构建 → 稍后执行）。
+
+## 11. 实现规范（原开放问题的决议）
+
+本节把先前列为开放的问题逐一定为具体方案。先约定全局常量：
+
+| 常量 | 含义 | 默认 |
+| ---- | ---- | ---- |
+| `W` | 全局窗口（`task_completed_flag` 环、复制 TensorMap、GM 堆共用），2 的幂 | ≥ `Δ + H` |
+| `Δ` | 任一核相对全局完成前沿可向前跑的最大 id 跨度（由反压封顶） | 由 `PRIVATE_TASK_SLOT_NUM`、堆容量决定 |
+| `H` | 依赖跨度上界：任一 producer 的最后消费者 id ≤ producer id + `H`。**由 SCOPE 决定**（PC 退出 scope 即终结其内变量可见性，故 `H` = 最大 scope 任务跨度，详见 §6.6） | 真实 PYPTO 随 scope 动态定界；a2a3 原型用保守常数 `kHDefault=64`（`PTO_DIST_H` 覆盖）近似 |
+| `F` | 全局连续完成前沿：使所有 id ≤ `F` 的任务都已完成的最大前缀 | 运行期推进 |
+| `R` | 回收前沿 `= F − H`：id ≤ `R` 的输出可安全回收 | 由 `F` 推导 |
+| `BLOCK_WON_SLOTS` | 每 block 的 `block.won` 投递环容量 | `PRIVATE_TASK_SLOT_NUM`(=8) |
+
+### 11.1 Claim 原子性 + 两条流的无跳过（原“Claim 原子性”“每 anchor 类型 claim 计数器”）
+
+**原语：单条 `atomic_fetch_max`。** 一个类型为 `T` 的核到达任务 `N` 时执行
+`old = atomic_fetch_max(cursor[T], N)`（`cursor[T]` 为 GM 上一个 64 位字），**`old < N` 即胜出**，
+否则 `N` 已被认领。单原子、无循环。若硬件无 `fetch_max`，等价 CAS 回路：
+`do { c = load(cursor[T]); if (N <= c) return LOST; } while (!CAS(cursor[T], c, N)); return WON;`
+内存序取 **acq-rel**（release 发布胜利，acquire 观察既有认领）。所有权判定只依赖 cursor 本身；
+真正的产出数据另由完成标志同步（§11.5）。
+
+**恰一胜者且无跳过（取代“claim 计数器”）。** 每个 `T` 核按 id 递增顺序遇到 `T` 任务，`cursor[T]`
+只会取到真实的 `T` 任务 id 值。在任何核尝试第 `k` 个 `T` 任务 `t_k` 之前，它必先尝试过 `t_{k-1}`
+（于是其时 `cursor[T] ≥ t_{k-1}`）；而 `cursor[T]` 的相邻取值之间没有别的 `T` id，故它只能从
+`t_{k-1}` 跃到 `t_k`——**不跳过任何 `T` id，且每个恰被一个核置位（fetch_max 的单调性保证）**。
+`cube_cursor` 与 `vector_cursor` 各自对自己的子序列单调推进、互不干扰，全局任务 id 仍是单一确定
+序列。两个 cursor 的存在与必要性见 §2、§3.1。
+
+### 11.2 `block.won` 容量与反压（原“`block.won` 投递表大小与偏移”）
+
+- **容量**：每 block 一个小定长环，`BLOCK_WON_SLOTS`（默认 = `PRIVATE_TASK_SLOT_NUM`）个条目，
+  每条目 = 一个多核任务推送给本 block 的子任务集 + 剩余计数。界限依据：anchor 的超前量本就被其
+  自身私有环（很小，§5/§6.1）封顶，每赢一个多核任务至多占 anchor 1 个环槽 + 1 个 `block.won`
+  条目，故与私有环同样大小即足够（可更小）。
+- **反压（已落入 §6 伪代码）**：anchor 在**认领之前**（步骤 2）检查 `block.won` 是否有空位；满则
+  **本轮不认领**（不执行 `fetch_max`），下一轮回到步骤 1 执行就绪任务（从而让 follower 抽取、腾空
+  `block.won`）。被让出的多核任务由**另一个有空闲的 block 的 anchor 认领**（天然负载均衡）或本核
+  稍后重试。
+- **无死锁**：根任务无依赖恒就绪；执行持续腾空私有环与 `block.won`；DAG 无环 → 前向进展恒成立。
+  唯一残留是 §8 的尾部空转。
+
+### 11.3 完成标志环大小与回绕（原“`task_completed_flag` 环大小与回绕”）
+
+- `task_completed_flag` 是 `W` 个一次性置位布尔的环，`flag(N)` 位于 `N & (W−1)`。
+- **`W` 取 2 的幂且 ≥ `Δ + H`**：`Δ` 是最快核相对完成前沿的最大超前（由私有环 + 堆反压封顶），
+  `H` 是依赖跨度上界（§11.4）。同一个 `W` 同时给复制 TensorMap 与 GM 堆开窗。
+- **回绕/ABA**：当回收前沿 `R`（§11.4）越过 `N` 时，把 `flag(N)` 复位为 false，槽位让给 `N+W`。
+  不变式：消费者只在构建了依赖 `N` 的任务**之后**（即走位已过 `N`）才轮询 `flag(N)`，而 `W ≥ Δ+H`
+  保证 `N` 的标志仍被需要时 `N+W` 尚未被认领 → 不会别名。**更稳健的可选做法**：在槽内连同 true 写入
+  producer 的 `N`（消费者校验 `slot.id == N`），用代/epoch 戳彻底杜绝 ABA，与 `W` 大小无关。
+
+### 11.4 GM 堆细化：`H`、容量、前沿推导、外逃输出（原“GM 输出堆的细化”）
+
+- **`H`（依赖跨度上界）**：**由 SCOPE 决定，不是固化常数**（详见 §6.6）。tensor 的可见域就是其所在
+  `PTO2_SCOPE`；orchestrator 的 PC 退出该 scope 后，scope 内变量不再可见、不会被后续任务引用，故依赖
+  跨度天然被"所在 scope 的任务跨度"封顶，`H` ≈ 最大 scope 任务数（+ 并发 scope 余量）。真实 PYPTO 据此
+  随 scope 进出动态定界（按 scope 深度分环，内层 scope 完成即独立回收，见 a5 `MULTI_RING.md`）。
+  **本 a2a3 原型**（`dist_scope_begin/end` 为空 stub）用保守常数 `kHDefault=64`（`PTO_DIST_H` 覆盖）作为
+  "最大 scope 跨度"的静态上界近似。运行期校验：若某消费者的 producer id < (当前 − `H`)，或某分配将覆盖
+  尚不可回收的区域，即判为容量/配置错误（类比旧模型的 heap-deadlock 诊断）→ 调大 `H`/堆，或细化 scope。
+- **堆/arena 容量** ≥ 工作集 = 窗口 `(R, top]` 内各任务输出大小之和；超出则报诊断。
+- **`F`（连续完成前沿）**：全局原子、单调。**协作式推进**——任一核置位 `flag(N)` 后，
+  `while flag(F+1) == true: CAS(F, F, F+1)`。无锁、任意核可推进、开销摊薄。
+- **`R = F − H`（回收前沿）**：全局派生量。某 arena 的 `heap_tail` = 任务 `R` 在该 arena 内的确定性
+  偏移；因布局确定，每个核都算出相同的 `heap_tail`。核要在确定性偏移 `X` 上分配任务 `M` 时，须等
+  `X` 处上一占用者的任务 id ≤ `R`（即回收已到位）——这就是堆侧反压。
+- **外逃输出（promotion 的处置）**：**默认不做运行期提升**。堆按单一全局确定性 bump + 前沿回收
+  （§9.5），它对任意依赖（含跨 scope）都正确，无需前向信息。**scope-arena 折叠**（scope_end 处
+  LIFO 即时回收）只作为**可选优化**，仅施加于**静态可证/标注为“无外逃”**的 scope；含外逃输出的
+  scope 退回前沿回收。如此既无需在产出时预知外逃，也保证正确。
+- **“按 tensor 最后消费者”的精确回收**：**降级为可选优化，正确性不依赖它**。精确的最后消费者需要
+  前向信息/两遍扫描/引用计数（已移除），故以 `H`-窗口为已定的主用机制；精确回收作为省内存改进
+  留作未来工作（不阻塞）。
+
+### 11.5 跨核标志可见性（原“跨核标志可见性”）
+
+- **producer 次序**：写输出到 GM → 把输出区域 writeback/flush 到所有核读取的一致性点（GM/L2）→
+  **release-store** `flag(N) = true`。
+- **consumer 次序**：**acquire-load** `flag(N)`；见 true 后（acquire 栅栏）再读 producer 的输出区域；
+  非一致缓存平台上对该区域做 invalidate 或旁路缓存读。
+- **一致缓存平台**：标志字上的 release/acquire 即足够。**非一致平台**：在标志发布/观察前后，对**数据
+  区域**显式 writeback（producer）/ invalidate（consumer）。
+- `cursor[T]`、`F`、`R` 等原子量统一取 acq-rel（§11.1）。
+
+### 11.6 异步 / SDMA kernel（原“异步/SDMA kernel”）
+
+- **句柄记在私有环槽里，不是 `block.won`。** 异步算子是 owner 在执行自己**私有任务环**中的某个
+  （子）任务时发起的，故异步句柄/事件记入**该私有环槽**，槽因任务尚未真正完成而**暂不释放**。
+  异步本身与 `block.won` 没有直接关系——它只是把“完成动作”从*发起时刻*推迟到 *DMA 真正完成时刻*。
+- Phase B 在检查依赖就绪之外，**额外轮询在飞私有环槽的句柄**；异步完成时，按 §11.5 的次序
+  （先 flush）执行该（子）任务的**完成动作**，再释放槽。完成动作具体是什么取决于任务种类
+  （与异步无关，沿用 §6 的完成逻辑）：
+  - **单核任务（1C/1V）**：直接置 `flag(N)`。
+  - **多核任务（MIX/2V）的子任务**：`atomic_dec(block.won[N].remaining)`，由把 `remaining` 减到 0
+    的那个子任务最后置 `flag(N)`。**仅在此情形下，被推迟的完成动作才触及 `block.won`**——即“在
+    mixed/2V 子任务内部发起异步 DMA”时。
+- 消费者侧不变：仍只轮询标志，而标志只在算子（及其所属多核任务的全部子任务）**真正完成后**才被置。
+- **反压**：在飞异步算子数量被私有环容量天然封顶。
+
+**这一步轮询由谁做：每个核自己做，不专设 AICPU。**
+
+- **决策**：在飞句柄由**发起该算子的 owner 核**在自己的 Phase B 中轮询，**不**引入一个专职轮询的
+  AICPU。理由：
+  1. **不违背全局目标**——本设计的根本目的就是把编排/调度从 AICPU 移除、SPMD 分布到 AI 核；专设
+     AICPU 轮询器等于请回集中式部件，并制造单点。
+  2. **保持单一 owner、无锁不变式**——置 `flag(N)`、释放私有环槽、递减 `block.won[N].remaining`
+     都是 owner 的本地动作（owner = builder = executor = completer）。让 AICPU 代劳就要写别人“单一
+     owner、无锁”的私有环与 block-共享计数，反而需要加锁/协调。
+  3. **边际成本近零**——Phase B 本就逐槽遍历私有环查依赖就绪，顺带读一次在飞槽的句柄状态仅多一次
+     状态读；在飞数被私有环容量（`PRIVATE_TASK_SLOT_NUM`）封顶。
+  4. **异步算子本就并行**——SDMA 跑在 DMA 引擎上，核在此期间继续编排/执行其它任务，只在 Phase B
+     间隙轮询，不占算力。
+- **可选硬件辅助（不改变上述归属）**：若异步引擎能在完成时**自行写一个内存位**或**发事件**，则
+  - 让引擎按 §11.5 的次序直接置 `flag(N)`：消费者照常轮询标志，**无核需要为“发布完成”而忙等**；
+    owner 只需在下次访问该槽时**惰性**释放槽并递减 `remaining`（届时已见标志置位）。
+  - 或：尾部空转的 owner（§7/§8，已无其它就绪工作）**等待该完成事件**而非忙轮询。
+
+  两种辅助都仍由 owner 收尾，不引入集中式 AICPU 轮询器。
+
+### 11.7 仍然开放
+
+- **MIX 配对 —— 动态替代方案：** §3.1 规定*固定* block 配对（AIC_c + AIV0_c + AIV1_c）。
+  **平台依据：在 A5 平台上，block 由硬件把 1 个 AIC + 2 个 AIV 固定绑定**，因此面向 A5（及当前
+  目标核）开发时，**采用固定配对、不做动态 co-owner 匹配是合理且既定的选择**——它与硬件 block
+  边界天然对齐，省去跨 block 的认领协调与正确性论证负担（§3.2）。
+  动态配对方案（跨 block 均衡 MIX 工作；亦即 §3.2 讨论并暂不采用的“block 内先到先得代发布”等
+  思路的归宿）**仅在未来核解除该硬件绑定时**才需要，届时再行设计，**本节不予裁定**。
+
+## 12. 相关文档
+
+| 文档 | 关联性 |
+| ---- | ------ |
+| [chip-level-arch.md](chip-level-arch.md) | 当前 L2 host / AICPU / AICore 划分（本设计所替代的模型） |
+| [scheduler.md](scheduler.md) | 当前 AICPU 侧调度器（此处移除） |
+| [orchestrator.md](orchestrator.md) | Host/L3 Orchestrator DAG 构建器（不同层；仅命名重叠） |
+| [simt-launch.md](simt-launch.md) | 设备上的 SPMD / 多 block 启动 |
+| [tensormap_and_ringbuffer RUNTIME_LOGIC.md](../src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md) | 此处移除/修改结构的权威来源 |
diff --git a/docs/fully_distributed_within_core/swimlane_bgemm_fullcore.json b/docs/fully_distributed_within_core/swimlane_bgemm_fullcore.json
new file mode 100644
index 000000000..18c795db7
--- /dev/null
+++ b/docs/fully_distributed_within_core/swimlane_bgemm_fullcore.json
@@ -0,0 +1,8429 @@
+{
+  "displayTimeUnit": "ns",
+  "traceEvents": [
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 0,
+      "args": {
+        "name": "block0"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 0,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core0)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 1,
+      "args": {
+        "name": "block1"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 1,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core1)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 2,
+      "args": {
+        "name": "block2"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 2,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core2)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 3,
+      "args": {
+        "name": "block3"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 3,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core3)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 4,
+      "args": {
+        "name": "block4"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 4,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core4)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 5,
+      "args": {
+        "name": "block5"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 5,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core5)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 6,
+      "args": {
+        "name": "block6"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 6,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core6)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 7,
+      "args": {
+        "name": "block7"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 7,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core7)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 8,
+      "args": {
+        "name": "block8"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 8,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core8)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 9,
+      "args": {
+        "name": "block9"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 9,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core9)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 10,
+      "args": {
+        "name": "block10"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 10,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core10)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 11,
+      "args": {
+        "name": "block11"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 11,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core11)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 12,
+      "args": {
+        "name": "block12"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 12,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core12)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 13,
+      "args": {
+        "name": "block13"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 13,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core13)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 14,
+      "args": {
+        "name": "block14"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 14,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core14)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 15,
+      "args": {
+        "name": "block15"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 15,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core15)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 16,
+      "args": {
+        "name": "block16"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 16,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core16)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 17,
+      "args": {
+        "name": "block17"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 17,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core17)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 18,
+      "args": {
+        "name": "block18"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 18,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core18)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 19,
+      "args": {
+        "name": "block19"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 19,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core19)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 20,
+      "args": {
+        "name": "block20"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 20,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core20)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 21,
+      "args": {
+        "name": "block21"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 21,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core21)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 22,
+      "args": {
+        "name": "block22"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 22,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core22)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 23,
+      "args": {
+        "name": "block23"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 23,
+      "tid": 0,
+      "args": {
+        "name": "AIC (core23)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 0,
+      "args": {
+        "name": "block0"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 0,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core24)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 0,
+      "args": {
+        "name": "block0"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 0,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core25)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 1,
+      "args": {
+        "name": "block1"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 1,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core26)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 1,
+      "args": {
+        "name": "block1"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 1,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core27)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 2,
+      "args": {
+        "name": "block2"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 2,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core28)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 2,
+      "args": {
+        "name": "block2"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 2,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core29)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 3,
+      "args": {
+        "name": "block3"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 3,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core30)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 3,
+      "args": {
+        "name": "block3"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 3,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core31)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 4,
+      "args": {
+        "name": "block4"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 4,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core32)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 4,
+      "args": {
+        "name": "block4"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 4,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core33)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 5,
+      "args": {
+        "name": "block5"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 5,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core34)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 5,
+      "args": {
+        "name": "block5"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 5,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core35)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 6,
+      "args": {
+        "name": "block6"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 6,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core36)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 6,
+      "args": {
+        "name": "block6"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 6,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core37)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 7,
+      "args": {
+        "name": "block7"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 7,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core38)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 7,
+      "args": {
+        "name": "block7"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 7,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core39)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 8,
+      "args": {
+        "name": "block8"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 8,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core40)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 8,
+      "args": {
+        "name": "block8"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 8,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core41)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 9,
+      "args": {
+        "name": "block9"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 9,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core42)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 9,
+      "args": {
+        "name": "block9"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 9,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core43)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 10,
+      "args": {
+        "name": "block10"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 10,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core44)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 10,
+      "args": {
+        "name": "block10"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 10,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core45)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 11,
+      "args": {
+        "name": "block11"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 11,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core46)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 11,
+      "args": {
+        "name": "block11"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 11,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core47)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 12,
+      "args": {
+        "name": "block12"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 12,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core48)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 12,
+      "args": {
+        "name": "block12"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 12,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core49)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 13,
+      "args": {
+        "name": "block13"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 13,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core50)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 13,
+      "args": {
+        "name": "block13"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 13,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core51)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 14,
+      "args": {
+        "name": "block14"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 14,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core52)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 14,
+      "args": {
+        "name": "block14"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 14,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core53)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 15,
+      "args": {
+        "name": "block15"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 15,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core54)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 15,
+      "args": {
+        "name": "block15"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 15,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core55)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 16,
+      "args": {
+        "name": "block16"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 16,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core56)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 16,
+      "args": {
+        "name": "block16"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 16,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core57)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 17,
+      "args": {
+        "name": "block17"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 17,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core58)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 17,
+      "args": {
+        "name": "block17"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 17,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core59)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 18,
+      "args": {
+        "name": "block18"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 18,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core60)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 18,
+      "args": {
+        "name": "block18"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 18,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core61)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 19,
+      "args": {
+        "name": "block19"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 19,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core62)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 19,
+      "args": {
+        "name": "block19"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 19,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core63)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 20,
+      "args": {
+        "name": "block20"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 20,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core64)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 20,
+      "args": {
+        "name": "block20"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 20,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core65)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 21,
+      "args": {
+        "name": "block21"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 21,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core66)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 21,
+      "args": {
+        "name": "block21"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 21,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core67)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 22,
+      "args": {
+        "name": "block22"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 22,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core68)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 22,
+      "args": {
+        "name": "block22"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 22,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core69)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 23,
+      "args": {
+        "name": "block23"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 23,
+      "tid": 1,
+      "args": {
+        "name": "AIV0 (core70)"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "process_name",
+      "pid": 23,
+      "args": {
+        "name": "block23"
+      }
+    },
+    {
+      "ph": "M",
+      "name": "thread_name",
+      "pid": 23,
+      "tid": 2,
+      "args": {
+        "name": "AIV1 (core71)"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#24",
+      "pid": 0,
+      "tid": 0,
+      "ts": 5030.041,
+      "dur": 24249.959,
+      "args": {
+        "task_id": 24,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#54",
+      "pid": 0,
+      "tid": 0,
+      "ts": 29297.125,
+      "dur": 32959.5,
+      "args": {
+        "task_id": 54,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#82",
+      "pid": 0,
+      "tid": 0,
+      "ts": 62270.833,
+      "dur": 60093.417,
+      "args": {
+        "task_id": 82,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#148",
+      "pid": 0,
+      "tid": 0,
+      "ts": 122401.958,
+      "dur": 34399.083,
+      "args": {
+        "task_id": 148,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#180",
+      "pid": 0,
+      "tid": 0,
+      "ts": 156811.333,
+      "dur": 46998.208,
+      "args": {
+        "task_id": 180,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#222",
+      "pid": 0,
+      "tid": 0,
+      "ts": 203826.583,
+      "dur": 49256.833,
+      "args": {
+        "task_id": 222,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#274",
+      "pid": 0,
+      "tid": 0,
+      "ts": 253111.083,
+      "dur": 30790.833,
+      "args": {
+        "task_id": 274,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#304",
+      "pid": 0,
+      "tid": 0,
+      "ts": 283934.208,
+      "dur": 105604.792,
+      "args": {
+        "task_id": 304,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#408",
+      "pid": 0,
+      "tid": 0,
+      "ts": 389595.25,
+      "dur": 43741.083,
+      "args": {
+        "task_id": 408,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#456",
+      "pid": 0,
+      "tid": 0,
+      "ts": 433367.958,
+      "dur": 21803.458,
+      "args": {
+        "task_id": 456,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#466",
+      "pid": 0,
+      "tid": 0,
+      "ts": 455179.0,
+      "dur": 35348.458,
+      "args": {
+        "task_id": 466,
+        "func_id": 0,
+        "core": 0,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#14",
+      "pid": 1,
+      "tid": 0,
+      "ts": 3561.625,
+      "dur": 29861.416,
+      "args": {
+        "task_id": 14,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#60",
+      "pid": 1,
+      "tid": 0,
+      "ts": 33432.041,
+      "dur": 67548.292,
+      "args": {
+        "task_id": 60,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#118",
+      "pid": 1,
+      "tid": 0,
+      "ts": 100994.333,
+      "dur": 75356.167,
+      "args": {
+        "task_id": 118,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#196",
+      "pid": 1,
+      "tid": 0,
+      "ts": 176378.75,
+      "dur": 32058.333,
+      "args": {
+        "task_id": 196,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#228",
+      "pid": 1,
+      "tid": 0,
+      "ts": 208452.25,
+      "dur": 69108.583,
+      "args": {
+        "task_id": 228,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#296",
+      "pid": 1,
+      "tid": 0,
+      "ts": 277592.75,
+      "dur": 66368.833,
+      "args": {
+        "task_id": 296,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#362",
+      "pid": 1,
+      "tid": 0,
+      "ts": 343995.666,
+      "dur": 15728.584,
+      "args": {
+        "task_id": 362,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#380",
+      "pid": 1,
+      "tid": 0,
+      "ts": 359735.291,
+      "dur": 56199.709,
+      "args": {
+        "task_id": 380,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#436",
+      "pid": 1,
+      "tid": 0,
+      "ts": 415969.291,
+      "dur": 65775.459,
+      "args": {
+        "task_id": 436,
+        "func_id": 0,
+        "core": 1,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#26",
+      "pid": 2,
+      "tid": 0,
+      "ts": 5108.0,
+      "dur": 65956.291,
+      "args": {
+        "task_id": 26,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#90",
+      "pid": 2,
+      "tid": 0,
+      "ts": 71111.875,
+      "dur": 40014.708,
+      "args": {
+        "task_id": 90,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#132",
+      "pid": 2,
+      "tid": 0,
+      "ts": 111137.625,
+      "dur": 40683.0,
+      "args": {
+        "task_id": 132,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#172",
+      "pid": 2,
+      "tid": 0,
+      "ts": 151836.166,
+      "dur": 65193.542,
+      "args": {
+        "task_id": 172,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#238",
+      "pid": 2,
+      "tid": 0,
+      "ts": 217053.875,
+      "dur": 49605.666,
+      "args": {
+        "task_id": 238,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#290",
+      "pid": 2,
+      "tid": 0,
+      "ts": 266706.958,
+      "dur": 61557.458,
+      "args": {
+        "task_id": 290,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#346",
+      "pid": 2,
+      "tid": 0,
+      "ts": 328315.708,
+      "dur": 50557.5,
+      "args": {
+        "task_id": 346,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#392",
+      "pid": 2,
+      "tid": 0,
+      "ts": 378899.166,
+      "dur": 32673.209,
+      "args": {
+        "task_id": 392,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#434",
+      "pid": 2,
+      "tid": 0,
+      "ts": 411601.708,
+      "dur": 44578.083,
+      "args": {
+        "task_id": 434,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#468",
+      "pid": 2,
+      "tid": 0,
+      "ts": 456204.541,
+      "dur": 31525.375,
+      "args": {
+        "task_id": 468,
+        "func_id": 0,
+        "core": 2,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#42",
+      "pid": 3,
+      "tid": 0,
+      "ts": 14318.291,
+      "dur": 59195.25,
+      "args": {
+        "task_id": 42,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#94",
+      "pid": 3,
+      "tid": 0,
+      "ts": 73525.875,
+      "dur": 34565.833,
+      "args": {
+        "task_id": 94,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#136",
+      "pid": 3,
+      "tid": 0,
+      "ts": 115117.625,
+      "dur": 60943.333,
+      "args": {
+        "task_id": 136,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#194",
+      "pid": 3,
+      "tid": 0,
+      "ts": 176082.458,
+      "dur": 31112.292,
+      "args": {
+        "task_id": 194,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#226",
+      "pid": 3,
+      "tid": 0,
+      "ts": 207207.708,
+      "dur": 39909.708,
+      "args": {
+        "task_id": 226,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#266",
+      "pid": 3,
+      "tid": 0,
+      "ts": 247136.291,
+      "dur": 41294.459,
+      "args": {
+        "task_id": 266,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#310",
+      "pid": 3,
+      "tid": 0,
+      "ts": 288451.833,
+      "dur": 54002.708,
+      "args": {
+        "task_id": 310,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#358",
+      "pid": 3,
+      "tid": 0,
+      "ts": 342512.25,
+      "dur": 37543.458,
+      "args": {
+        "task_id": 358,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#396",
+      "pid": 3,
+      "tid": 0,
+      "ts": 380082.041,
+      "dur": 22583.125,
+      "args": {
+        "task_id": 396,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#422",
+      "pid": 3,
+      "tid": 0,
+      "ts": 402682.416,
+      "dur": 84861.334,
+      "args": {
+        "task_id": 422,
+        "func_id": 0,
+        "core": 3,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#28",
+      "pid": 4,
+      "tid": 0,
+      "ts": 7495.291,
+      "dur": 70940.5,
+      "args": {
+        "task_id": 28,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#102",
+      "pid": 4,
+      "tid": 0,
+      "ts": 78451.833,
+      "dur": 58783.708,
+      "args": {
+        "task_id": 102,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#158",
+      "pid": 4,
+      "tid": 0,
+      "ts": 137251.583,
+      "dur": 75440.75,
+      "args": {
+        "task_id": 158,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#232",
+      "pid": 4,
+      "tid": 0,
+      "ts": 212744.916,
+      "dur": 46413.875,
+      "args": {
+        "task_id": 232,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#280",
+      "pid": 4,
+      "tid": 0,
+      "ts": 259182.583,
+      "dur": 51298.042,
+      "args": {
+        "task_id": 280,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#330",
+      "pid": 4,
+      "tid": 0,
+      "ts": 310509.375,
+      "dur": 24049.25,
+      "args": {
+        "task_id": 330,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#352",
+      "pid": 4,
+      "tid": 0,
+      "ts": 334570.75,
+      "dur": 54480.041,
+      "args": {
+        "task_id": 352,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#406",
+      "pid": 4,
+      "tid": 0,
+      "ts": 389110.458,
+      "dur": 73649.833,
+      "args": {
+        "task_id": 406,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#476",
+      "pid": 4,
+      "tid": 0,
+      "ts": 462840.416,
+      "dur": 23486.959,
+      "args": {
+        "task_id": 476,
+        "func_id": 0,
+        "core": 4,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#46",
+      "pid": 5,
+      "tid": 0,
+      "ts": 22077.25,
+      "dur": 50477.833,
+      "args": {
+        "task_id": 46,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#92",
+      "pid": 5,
+      "tid": 0,
+      "ts": 72564.458,
+      "dur": 39186.458,
+      "args": {
+        "task_id": 92,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#134",
+      "pid": 5,
+      "tid": 0,
+      "ts": 111764.583,
+      "dur": 63130.5,
+      "args": {
+        "task_id": 134,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#192",
+      "pid": 5,
+      "tid": 0,
+      "ts": 174936.708,
+      "dur": 12266.333,
+      "args": {
+        "task_id": 192,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#204",
+      "pid": 5,
+      "tid": 0,
+      "ts": 187209.333,
+      "dur": 68895.583,
+      "args": {
+        "task_id": 204,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#278",
+      "pid": 5,
+      "tid": 0,
+      "ts": 256139.0,
+      "dur": 67520.375,
+      "args": {
+        "task_id": 278,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#340",
+      "pid": 5,
+      "tid": 0,
+      "ts": 323690.333,
+      "dur": 44183.167,
+      "args": {
+        "task_id": 340,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#390",
+      "pid": 5,
+      "tid": 0,
+      "ts": 367901.416,
+      "dur": 63459.625,
+      "args": {
+        "task_id": 390,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#454",
+      "pid": 5,
+      "tid": 0,
+      "ts": 431401.541,
+      "dur": 44316.667,
+      "args": {
+        "task_id": 454,
+        "func_id": 0,
+        "core": 5,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#38",
+      "pid": 6,
+      "tid": 0,
+      "ts": 9350.708,
+      "dur": 20394.083,
+      "args": {
+        "task_id": 38,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#56",
+      "pid": 6,
+      "tid": 0,
+      "ts": 29754.291,
+      "dur": 13406.084,
+      "args": {
+        "task_id": 56,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#66",
+      "pid": 6,
+      "tid": 0,
+      "ts": 43163.875,
+      "dur": 10266.833,
+      "args": {
+        "task_id": 66,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#74",
+      "pid": 6,
+      "tid": 0,
+      "ts": 53433.166,
+      "dur": 10372.125,
+      "args": {
+        "task_id": 74,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#84",
+      "pid": 6,
+      "tid": 0,
+      "ts": 63808.25,
+      "dur": 10258.916,
+      "args": {
+        "task_id": 84,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#96",
+      "pid": 6,
+      "tid": 0,
+      "ts": 74070.875,
+      "dur": 18425.916,
+      "args": {
+        "task_id": 96,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#112",
+      "pid": 6,
+      "tid": 0,
+      "ts": 92506.875,
+      "dur": 12162.666,
+      "args": {
+        "task_id": 112,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#126",
+      "pid": 6,
+      "tid": 0,
+      "ts": 104674.083,
+      "dur": 16791.042,
+      "args": {
+        "task_id": 126,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#144",
+      "pid": 6,
+      "tid": 0,
+      "ts": 121470.958,
+      "dur": 13621.417,
+      "args": {
+        "task_id": 144,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#156",
+      "pid": 6,
+      "tid": 0,
+      "ts": 135097.416,
+      "dur": 16374.792,
+      "args": {
+        "task_id": 156,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#170",
+      "pid": 6,
+      "tid": 0,
+      "ts": 151477.458,
+      "dur": 17647.708,
+      "args": {
+        "task_id": 170,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#188",
+      "pid": 6,
+      "tid": 0,
+      "ts": 169132.208,
+      "dur": 19298.75,
+      "args": {
+        "task_id": 188,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#206",
+      "pid": 6,
+      "tid": 0,
+      "ts": 188454.958,
+      "dur": 49302.792,
+      "args": {
+        "task_id": 206,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#254",
+      "pid": 6,
+      "tid": 0,
+      "ts": 237798.333,
+      "dur": 60942.625,
+      "args": {
+        "task_id": 254,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#318",
+      "pid": 6,
+      "tid": 0,
+      "ts": 298771.583,
+      "dur": 47959.625,
+      "args": {
+        "task_id": 318,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#368",
+      "pid": 6,
+      "tid": 0,
+      "ts": 346757.958,
+      "dur": 64675.167,
+      "args": {
+        "task_id": 368,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#430",
+      "pid": 6,
+      "tid": 0,
+      "ts": 411470.916,
+      "dur": 18057.542,
+      "args": {
+        "task_id": 430,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#450",
+      "pid": 6,
+      "tid": 0,
+      "ts": 429552.833,
+      "dur": 50991.833,
+      "args": {
+        "task_id": 450,
+        "func_id": 0,
+        "core": 6,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#32",
+      "pid": 7,
+      "tid": 0,
+      "ts": 7822.208,
+      "dur": 47579.875,
+      "args": {
+        "task_id": 32,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#78",
+      "pid": 7,
+      "tid": 0,
+      "ts": 55411.916,
+      "dur": 73540.417,
+      "args": {
+        "task_id": 78,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#152",
+      "pid": 7,
+      "tid": 0,
+      "ts": 128974.0,
+      "dur": 18682.25,
+      "args": {
+        "task_id": 152,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#164",
+      "pid": 7,
+      "tid": 0,
+      "ts": 147661.958,
+      "dur": 47817.0,
+      "args": {
+        "task_id": 164,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#214",
+      "pid": 7,
+      "tid": 0,
+      "ts": 195497.083,
+      "dur": 52703.333,
+      "args": {
+        "task_id": 214,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#268",
+      "pid": 7,
+      "tid": 0,
+      "ts": 248246.208,
+      "dur": 52186.5,
+      "args": {
+        "task_id": 268,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#320",
+      "pid": 7,
+      "tid": 0,
+      "ts": 300480.208,
+      "dur": 45463.5,
+      "args": {
+        "task_id": 320,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#366",
+      "pid": 7,
+      "tid": 0,
+      "ts": 345968.791,
+      "dur": 38700.709,
+      "args": {
+        "task_id": 366,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#400",
+      "pid": 7,
+      "tid": 0,
+      "ts": 384692.208,
+      "dur": 42090.917,
+      "args": {
+        "task_id": 400,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#442",
+      "pid": 7,
+      "tid": 0,
+      "ts": 426811.75,
+      "dur": 27488.625,
+      "args": {
+        "task_id": 442,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#464",
+      "pid": 7,
+      "tid": 0,
+      "ts": 454316.875,
+      "dur": 42386.5,
+      "args": {
+        "task_id": 464,
+        "func_id": 0,
+        "core": 7,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#62",
+      "pid": 8,
+      "tid": 0,
+      "ts": 41052.083,
+      "dur": 35623.375,
+      "args": {
+        "task_id": 62,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#100",
+      "pid": 8,
+      "tid": 0,
+      "ts": 76687.041,
+      "dur": 87119.209,
+      "args": {
+        "task_id": 100,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#186",
+      "pid": 8,
+      "tid": 0,
+      "ts": 163831.958,
+      "dur": 56073.292,
+      "args": {
+        "task_id": 186,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#244",
+      "pid": 8,
+      "tid": 0,
+      "ts": 219961.166,
+      "dur": 22755.084,
+      "args": {
+        "task_id": 244,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#264",
+      "pid": 8,
+      "tid": 0,
+      "ts": 242726.541,
+      "dur": 55964.5,
+      "args": {
+        "task_id": 264,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#316",
+      "pid": 8,
+      "tid": 0,
+      "ts": 298717.0,
+      "dur": 55541.666,
+      "args": {
+        "task_id": 316,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#374",
+      "pid": 8,
+      "tid": 0,
+      "ts": 354294.125,
+      "dur": 39730.958,
+      "args": {
+        "task_id": 374,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#410",
+      "pid": 8,
+      "tid": 0,
+      "ts": 394072.458,
+      "dur": 77961.208,
+      "args": {
+        "task_id": 410,
+        "func_id": 0,
+        "core": 8,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#10",
+      "pid": 9,
+      "tid": 0,
+      "ts": 721.833,
+      "dur": 28289.625,
+      "args": {
+        "task_id": 10,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#52",
+      "pid": 9,
+      "tid": 0,
+      "ts": 29060.958,
+      "dur": 75849.125,
+      "args": {
+        "task_id": 52,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#128",
+      "pid": 9,
+      "tid": 0,
+      "ts": 104927.041,
+      "dur": 54917.667,
+      "args": {
+        "task_id": 128,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#182",
+      "pid": 9,
+      "tid": 0,
+      "ts": 159881.041,
+      "dur": 60415.125,
+      "args": {
+        "task_id": 182,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#246",
+      "pid": 9,
+      "tid": 0,
+      "ts": 220325.5,
+      "dur": 45066.833,
+      "args": {
+        "task_id": 246,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#288",
+      "pid": 9,
+      "tid": 0,
+      "ts": 265411.875,
+      "dur": 28484.375,
+      "args": {
+        "task_id": 288,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#312",
+      "pid": 9,
+      "tid": 0,
+      "ts": 293914.0,
+      "dur": 48583.958,
+      "args": {
+        "task_id": 312,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#360",
+      "pid": 9,
+      "tid": 0,
+      "ts": 342524.166,
+      "dur": 43929.75,
+      "args": {
+        "task_id": 360,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#402",
+      "pid": 9,
+      "tid": 0,
+      "ts": 386480.875,
+      "dur": 37530.0,
+      "args": {
+        "task_id": 402,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#440",
+      "pid": 9,
+      "tid": 0,
+      "ts": 424035.291,
+      "dur": 42066.792,
+      "args": {
+        "task_id": 440,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#478",
+      "pid": 9,
+      "tid": 0,
+      "ts": 466129.708,
+      "dur": 22346.75,
+      "args": {
+        "task_id": 478,
+        "func_id": 0,
+        "core": 9,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#20",
+      "pid": 10,
+      "tid": 0,
+      "ts": 4529.25,
+      "dur": 47050.625,
+      "args": {
+        "task_id": 20,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#70",
+      "pid": 10,
+      "tid": 0,
+      "ts": 51588.916,
+      "dur": 68993.042,
+      "args": {
+        "task_id": 70,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#140",
+      "pid": 10,
+      "tid": 0,
+      "ts": 120600.375,
+      "dur": 78769.875,
+      "args": {
+        "task_id": 140,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#216",
+      "pid": 10,
+      "tid": 0,
+      "ts": 199395.791,
+      "dur": 42119.209,
+      "args": {
+        "task_id": 216,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#262",
+      "pid": 10,
+      "tid": 0,
+      "ts": 241561.583,
+      "dur": 46683.417,
+      "args": {
+        "task_id": 262,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#308",
+      "pid": 10,
+      "tid": 0,
+      "ts": 288268.666,
+      "dur": 57239.542,
+      "args": {
+        "task_id": 308,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#364",
+      "pid": 10,
+      "tid": 0,
+      "ts": 345562.083,
+      "dur": 55157.542,
+      "args": {
+        "task_id": 364,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#418",
+      "pid": 10,
+      "tid": 0,
+      "ts": 400751.833,
+      "dur": 70627.125,
+      "args": {
+        "task_id": 418,
+        "func_id": 0,
+        "core": 10,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#6",
+      "pid": 11,
+      "tid": 0,
+      "ts": 321.416,
+      "dur": 31966.584,
+      "args": {
+        "task_id": 6,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#58",
+      "pid": 11,
+      "tid": 0,
+      "ts": 32295.458,
+      "dur": 49952.625,
+      "args": {
+        "task_id": 58,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#108",
+      "pid": 11,
+      "tid": 0,
+      "ts": 82260.125,
+      "dur": 66053.458,
+      "args": {
+        "task_id": 108,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#166",
+      "pid": 11,
+      "tid": 0,
+      "ts": 148348.541,
+      "dur": 84617.625,
+      "args": {
+        "task_id": 166,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#252",
+      "pid": 11,
+      "tid": 0,
+      "ts": 233002.416,
+      "dur": 21218.417,
+      "args": {
+        "task_id": 252,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#276",
+      "pid": 11,
+      "tid": 0,
+      "ts": 254232.875,
+      "dur": 25404.875,
+      "args": {
+        "task_id": 276,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#298",
+      "pid": 11,
+      "tid": 0,
+      "ts": 279649.375,
+      "dur": 73999.875,
+      "args": {
+        "task_id": 298,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#372",
+      "pid": 11,
+      "tid": 0,
+      "ts": 353719.375,
+      "dur": 43358.75,
+      "args": {
+        "task_id": 372,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#412",
+      "pid": 11,
+      "tid": 0,
+      "ts": 397120.916,
+      "dur": 72366.334,
+      "args": {
+        "task_id": 412,
+        "func_id": 0,
+        "core": 11,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#12",
+      "pid": 12,
+      "tid": 0,
+      "ts": 2463.166,
+      "dur": 88729.917,
+      "args": {
+        "task_id": 12,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#110",
+      "pid": 12,
+      "tid": 0,
+      "ts": 91236.208,
+      "dur": 25139.375,
+      "args": {
+        "task_id": 110,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#138",
+      "pid": 12,
+      "tid": 0,
+      "ts": 116383.75,
+      "dur": 76343.958,
+      "args": {
+        "task_id": 138,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#212",
+      "pid": 12,
+      "tid": 0,
+      "ts": 192755.083,
+      "dur": 16312.708,
+      "args": {
+        "task_id": 212,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#230",
+      "pid": 12,
+      "tid": 0,
+      "ts": 209075.625,
+      "dur": 53180.541,
+      "args": {
+        "task_id": 230,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#286",
+      "pid": 12,
+      "tid": 0,
+      "ts": 262282.0,
+      "dur": 47731.708,
+      "args": {
+        "task_id": 286,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#328",
+      "pid": 12,
+      "tid": 0,
+      "ts": 310035.291,
+      "dur": 19459.875,
+      "args": {
+        "task_id": 328,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#348",
+      "pid": 12,
+      "tid": 0,
+      "ts": 329506.583,
+      "dur": 75584.208,
+      "args": {
+        "task_id": 348,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#424",
+      "pid": 12,
+      "tid": 0,
+      "ts": 405138.375,
+      "dur": 22554.333,
+      "args": {
+        "task_id": 424,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#446",
+      "pid": 12,
+      "tid": 0,
+      "ts": 427723.458,
+      "dur": 57875.208,
+      "args": {
+        "task_id": 446,
+        "func_id": 0,
+        "core": 12,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#4",
+      "pid": 13,
+      "tid": 0,
+      "ts": 278.458,
+      "dur": 19697.083,
+      "args": {
+        "task_id": 4,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#44",
+      "pid": 13,
+      "tid": 0,
+      "ts": 19984.083,
+      "dur": 50777.208,
+      "args": {
+        "task_id": 44,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#88",
+      "pid": 13,
+      "tid": 0,
+      "ts": 70771.333,
+      "dur": 32713.708,
+      "args": {
+        "task_id": 88,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#122",
+      "pid": 13,
+      "tid": 0,
+      "ts": 103498.875,
+      "dur": 110731.083,
+      "args": {
+        "task_id": 122,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#236",
+      "pid": 13,
+      "tid": 0,
+      "ts": 214267.5,
+      "dur": 37709.583,
+      "args": {
+        "task_id": 236,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#272",
+      "pid": 13,
+      "tid": 0,
+      "ts": 252027.125,
+      "dur": 59647.541,
+      "args": {
+        "task_id": 272,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#332",
+      "pid": 13,
+      "tid": 0,
+      "ts": 311729.416,
+      "dur": 27345.0,
+      "args": {
+        "task_id": 332,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#356",
+      "pid": 13,
+      "tid": 0,
+      "ts": 339090.958,
+      "dur": 43838.833,
+      "args": {
+        "task_id": 356,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#398",
+      "pid": 13,
+      "tid": 0,
+      "ts": 382956.958,
+      "dur": 77372.708,
+      "args": {
+        "task_id": 398,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#474",
+      "pid": 13,
+      "tid": 0,
+      "ts": 460381.625,
+      "dur": 20160.666,
+      "args": {
+        "task_id": 474,
+        "func_id": 0,
+        "core": 13,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#34",
+      "pid": 14,
+      "tid": 0,
+      "ts": 8091.0,
+      "dur": 44697.333,
+      "args": {
+        "task_id": 34,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#72",
+      "pid": 14,
+      "tid": 0,
+      "ts": 52800.208,
+      "dur": 67940.542,
+      "args": {
+        "task_id": 72,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#142",
+      "pid": 14,
+      "tid": 0,
+      "ts": 120760.958,
+      "dur": 49544.375,
+      "args": {
+        "task_id": 142,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#190",
+      "pid": 14,
+      "tid": 0,
+      "ts": 170322.125,
+      "dur": 33391.208,
+      "args": {
+        "task_id": 190,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#220",
+      "pid": 14,
+      "tid": 0,
+      "ts": 203725.041,
+      "dur": 46147.375,
+      "args": {
+        "task_id": 220,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#270",
+      "pid": 14,
+      "tid": 0,
+      "ts": 249898.833,
+      "dur": 47645.667,
+      "args": {
+        "task_id": 270,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#314",
+      "pid": 14,
+      "tid": 0,
+      "ts": 297595.958,
+      "dur": 67813.917,
+      "args": {
+        "task_id": 314,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#388",
+      "pid": 14,
+      "tid": 0,
+      "ts": 365449.666,
+      "dur": 32143.292,
+      "args": {
+        "task_id": 388,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#414",
+      "pid": 14,
+      "tid": 0,
+      "ts": 397609.541,
+      "dur": 75845.375,
+      "args": {
+        "task_id": 414,
+        "func_id": 0,
+        "core": 14,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#36",
+      "pid": 15,
+      "tid": 0,
+      "ts": 8435.333,
+      "dur": 46579.292,
+      "args": {
+        "task_id": 36,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#76",
+      "pid": 15,
+      "tid": 0,
+      "ts": 55024.208,
+      "dur": 67307.25,
+      "args": {
+        "task_id": 76,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#146",
+      "pid": 15,
+      "tid": 0,
+      "ts": 122352.041,
+      "dur": 32488.375,
+      "args": {
+        "task_id": 146,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#174",
+      "pid": 15,
+      "tid": 0,
+      "ts": 154850.125,
+      "dur": 63847.0,
+      "args": {
+        "task_id": 174,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#242",
+      "pid": 15,
+      "tid": 0,
+      "ts": 218722.791,
+      "dur": 42984.542,
+      "args": {
+        "task_id": 242,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#284",
+      "pid": 15,
+      "tid": 0,
+      "ts": 261752.333,
+      "dur": 43674.542,
+      "args": {
+        "task_id": 284,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#326",
+      "pid": 15,
+      "tid": 0,
+      "ts": 305468.5,
+      "dur": 58419.375,
+      "args": {
+        "task_id": 326,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#386",
+      "pid": 15,
+      "tid": 0,
+      "ts": 363921.166,
+      "dur": 46999.917,
+      "args": {
+        "task_id": 386,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#428",
+      "pid": 15,
+      "tid": 0,
+      "ts": 410947.0,
+      "dur": 19617.041,
+      "args": {
+        "task_id": 428,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#452",
+      "pid": 15,
+      "tid": 0,
+      "ts": 430580.333,
+      "dur": 49961.542,
+      "args": {
+        "task_id": 452,
+        "func_id": 0,
+        "core": 15,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#0",
+      "pid": 16,
+      "tid": 0,
+      "ts": 211.041,
+      "dur": 23758.375,
+      "args": {
+        "task_id": 0,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#48",
+      "pid": 16,
+      "tid": 0,
+      "ts": 23978.041,
+      "dur": 80379.584,
+      "args": {
+        "task_id": 48,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#124",
+      "pid": 16,
+      "tid": 0,
+      "ts": 104404.041,
+      "dur": 39957.334,
+      "args": {
+        "task_id": 124,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#162",
+      "pid": 16,
+      "tid": 0,
+      "ts": 144374.75,
+      "dur": 45530.541,
+      "args": {
+        "task_id": 162,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#208",
+      "pid": 16,
+      "tid": 0,
+      "ts": 189924.125,
+      "dur": 27883.25,
+      "args": {
+        "task_id": 208,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#240",
+      "pid": 16,
+      "tid": 0,
+      "ts": 217832.916,
+      "dur": 102498.709,
+      "args": {
+        "task_id": 240,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#338",
+      "pid": 16,
+      "tid": 0,
+      "ts": 320378.625,
+      "dur": 41655.208,
+      "args": {
+        "task_id": 338,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#384",
+      "pid": 16,
+      "tid": 0,
+      "ts": 362060.208,
+      "dur": 65543.75,
+      "args": {
+        "task_id": 384,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#444",
+      "pid": 16,
+      "tid": 0,
+      "ts": 427640.958,
+      "dur": 44199.917,
+      "args": {
+        "task_id": 444,
+        "func_id": 0,
+        "core": 16,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#30",
+      "pid": 17,
+      "tid": 0,
+      "ts": 7518.833,
+      "dur": 68115.583,
+      "args": {
+        "task_id": 30,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#98",
+      "pid": 17,
+      "tid": 0,
+      "ts": 75647.791,
+      "dur": 51076.542,
+      "args": {
+        "task_id": 98,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#150",
+      "pid": 17,
+      "tid": 0,
+      "ts": 126763.083,
+      "dur": 23869.917,
+      "args": {
+        "task_id": 150,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#168",
+      "pid": 17,
+      "tid": 0,
+      "ts": 150649.875,
+      "dur": 55761.0,
+      "args": {
+        "task_id": 168,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#224",
+      "pid": 17,
+      "tid": 0,
+      "ts": 206432.041,
+      "dur": 53413.459,
+      "args": {
+        "task_id": 224,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#282",
+      "pid": 17,
+      "tid": 0,
+      "ts": 259871.791,
+      "dur": 43916.459,
+      "args": {
+        "task_id": 282,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#324",
+      "pid": 17,
+      "tid": 0,
+      "ts": 303809.416,
+      "dur": 56398.584,
+      "args": {
+        "task_id": 324,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#382",
+      "pid": 17,
+      "tid": 0,
+      "ts": 360264.666,
+      "dur": 19591.959,
+      "args": {
+        "task_id": 382,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#394",
+      "pid": 17,
+      "tid": 0,
+      "ts": 379864.541,
+      "dur": 77727.625,
+      "args": {
+        "task_id": 394,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#472",
+      "pid": 17,
+      "tid": 0,
+      "ts": 457698.708,
+      "dur": 14770.875,
+      "args": {
+        "task_id": 472,
+        "func_id": 0,
+        "core": 17,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#50",
+      "pid": 18,
+      "tid": 0,
+      "ts": 24296.083,
+      "dur": 44538.417,
+      "args": {
+        "task_id": 50,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#86",
+      "pid": 18,
+      "tid": 0,
+      "ts": 68843.5,
+      "dur": 41773.25,
+      "args": {
+        "task_id": 86,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#130",
+      "pid": 18,
+      "tid": 0,
+      "ts": 110645.458,
+      "dur": 45140.25,
+      "args": {
+        "task_id": 130,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#178",
+      "pid": 18,
+      "tid": 0,
+      "ts": 155800.958,
+      "dur": 27484.208,
+      "args": {
+        "task_id": 178,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#200",
+      "pid": 18,
+      "tid": 0,
+      "ts": 183294.541,
+      "dur": 56959.667,
+      "args": {
+        "task_id": 200,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#260",
+      "pid": 18,
+      "tid": 0,
+      "ts": 240278.625,
+      "dur": 31319.625,
+      "args": {
+        "task_id": 260,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#294",
+      "pid": 18,
+      "tid": 0,
+      "ts": 271613.958,
+      "dur": 54718.167,
+      "args": {
+        "task_id": 294,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#342",
+      "pid": 18,
+      "tid": 0,
+      "ts": 326356.875,
+      "dur": 30769.25,
+      "args": {
+        "task_id": 342,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#376",
+      "pid": 18,
+      "tid": 0,
+      "ts": 357150.041,
+      "dur": 64067.75,
+      "args": {
+        "task_id": 376,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#438",
+      "pid": 18,
+      "tid": 0,
+      "ts": 421259.291,
+      "dur": 30580.084,
+      "args": {
+        "task_id": 438,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#462",
+      "pid": 18,
+      "tid": 0,
+      "ts": 451857.666,
+      "dur": 41557.084,
+      "args": {
+        "task_id": 462,
+        "func_id": 0,
+        "core": 18,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#16",
+      "pid": 19,
+      "tid": 0,
+      "ts": 3581.291,
+      "dur": 56266.375,
+      "args": {
+        "task_id": 16,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#80",
+      "pid": 19,
+      "tid": 0,
+      "ts": 59865.166,
+      "dur": 41697.125,
+      "args": {
+        "task_id": 80,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#120",
+      "pid": 19,
+      "tid": 0,
+      "ts": 101586.125,
+      "dur": 84889.541,
+      "args": {
+        "task_id": 120,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#202",
+      "pid": 19,
+      "tid": 0,
+      "ts": 186504.833,
+      "dur": 27330.0,
+      "args": {
+        "task_id": 202,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#234",
+      "pid": 19,
+      "tid": 0,
+      "ts": 213847.625,
+      "dur": 53531.5,
+      "args": {
+        "task_id": 234,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#292",
+      "pid": 19,
+      "tid": 0,
+      "ts": 267409.291,
+      "dur": 44948.584,
+      "args": {
+        "task_id": 292,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#334",
+      "pid": 19,
+      "tid": 0,
+      "ts": 312381.375,
+      "dur": 21283.416,
+      "args": {
+        "task_id": 334,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#350",
+      "pid": 19,
+      "tid": 0,
+      "ts": 333674.958,
+      "dur": 66841.417,
+      "args": {
+        "task_id": 350,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#416",
+      "pid": 19,
+      "tid": 0,
+      "ts": 400554.541,
+      "dur": 27569.042,
+      "args": {
+        "task_id": 416,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#448",
+      "pid": 19,
+      "tid": 0,
+      "ts": 428145.125,
+      "dur": 55158.958,
+      "args": {
+        "task_id": 448,
+        "func_id": 0,
+        "core": 19,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#8",
+      "pid": 20,
+      "tid": 0,
+      "ts": 465.416,
+      "dur": 13702.542,
+      "args": {
+        "task_id": 8,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#40",
+      "pid": 20,
+      "tid": 0,
+      "ts": 14176.0,
+      "dur": 37223.958,
+      "args": {
+        "task_id": 40,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#68",
+      "pid": 20,
+      "tid": 0,
+      "ts": 51413.291,
+      "dur": 90006.667,
+      "args": {
+        "task_id": 68,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#160",
+      "pid": 20,
+      "tid": 0,
+      "ts": 141484.041,
+      "dur": 40144.834,
+      "args": {
+        "task_id": 160,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#198",
+      "pid": 20,
+      "tid": 0,
+      "ts": 181641.625,
+      "dur": 56219.666,
+      "args": {
+        "task_id": 198,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#256",
+      "pid": 20,
+      "tid": 0,
+      "ts": 237919.041,
+      "dur": 63216.042,
+      "args": {
+        "task_id": 256,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#322",
+      "pid": 20,
+      "tid": 0,
+      "ts": 301167.833,
+      "dur": 25547.458,
+      "args": {
+        "task_id": 322,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#344",
+      "pid": 20,
+      "tid": 0,
+      "ts": 326741.541,
+      "dur": 32550.584,
+      "args": {
+        "task_id": 344,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#378",
+      "pid": 20,
+      "tid": 0,
+      "ts": 359330.25,
+      "dur": 49324.25,
+      "args": {
+        "task_id": 378,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#426",
+      "pid": 20,
+      "tid": 0,
+      "ts": 408685.208,
+      "dur": 65099.917,
+      "args": {
+        "task_id": 426,
+        "func_id": 0,
+        "core": 20,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#22",
+      "pid": 21,
+      "tid": 0,
+      "ts": 4797.25,
+      "dur": 76004.583,
+      "args": {
+        "task_id": 22,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#106",
+      "pid": 21,
+      "tid": 0,
+      "ts": 80818.208,
+      "dur": 49925.542,
+      "args": {
+        "task_id": 106,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#154",
+      "pid": 21,
+      "tid": 0,
+      "ts": 130770.916,
+      "dur": 61550.0,
+      "args": {
+        "task_id": 154,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#210",
+      "pid": 21,
+      "tid": 0,
+      "ts": 192342.541,
+      "dur": 33872.5,
+      "args": {
+        "task_id": 210,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#248",
+      "pid": 21,
+      "tid": 0,
+      "ts": 226231.375,
+      "dur": 57675.458,
+      "args": {
+        "task_id": 248,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#306",
+      "pid": 21,
+      "tid": 0,
+      "ts": 283934.75,
+      "dur": 65088.833,
+      "args": {
+        "task_id": 306,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#370",
+      "pid": 21,
+      "tid": 0,
+      "ts": 349058.666,
+      "dur": 38469.625,
+      "args": {
+        "task_id": 370,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#404",
+      "pid": 21,
+      "tid": 0,
+      "ts": 387548.583,
+      "dur": 48652.375,
+      "args": {
+        "task_id": 404,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#460",
+      "pid": 21,
+      "tid": 0,
+      "ts": 436242.416,
+      "dur": 41012.542,
+      "args": {
+        "task_id": 460,
+        "func_id": 0,
+        "core": 21,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#2",
+      "pid": 22,
+      "tid": 0,
+      "ts": 243.041,
+      "dur": 42271.042,
+      "args": {
+        "task_id": 2,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#64",
+      "pid": 22,
+      "tid": 0,
+      "ts": 42526.208,
+      "dur": 52081.75,
+      "args": {
+        "task_id": 64,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#114",
+      "pid": 22,
+      "tid": 0,
+      "ts": 94619.625,
+      "dur": 60903.666,
+      "args": {
+        "task_id": 114,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#176",
+      "pid": 22,
+      "tid": 0,
+      "ts": 155564.375,
+      "dur": 47788.875,
+      "args": {
+        "task_id": 176,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#218",
+      "pid": 22,
+      "tid": 0,
+      "ts": 203369.75,
+      "dur": 35371.583,
+      "args": {
+        "task_id": 218,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#258",
+      "pid": 22,
+      "tid": 0,
+      "ts": 238759.041,
+      "dur": 44863.792,
+      "args": {
+        "task_id": 258,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#302",
+      "pid": 22,
+      "tid": 0,
+      "ts": 283645.0,
+      "dur": 33383.041,
+      "args": {
+        "task_id": 302,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#336",
+      "pid": 22,
+      "tid": 0,
+      "ts": 317045.583,
+      "dur": 83948.833,
+      "args": {
+        "task_id": 336,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#420",
+      "pid": 22,
+      "tid": 0,
+      "ts": 401085.958,
+      "dur": 56362.833,
+      "args": {
+        "task_id": 420,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#470",
+      "pid": 22,
+      "tid": 0,
+      "ts": 457514.75,
+      "dur": 27396.041,
+      "args": {
+        "task_id": 470,
+        "func_id": 0,
+        "core": 22,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#18",
+      "pid": 23,
+      "tid": 0,
+      "ts": 4074.041,
+      "dur": 74989.334,
+      "args": {
+        "task_id": 18,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#104",
+      "pid": 23,
+      "tid": 0,
+      "ts": 79107.541,
+      "dur": 20165.25,
+      "args": {
+        "task_id": 104,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#116",
+      "pid": 23,
+      "tid": 0,
+      "ts": 99277.125,
+      "dur": 63374.166,
+      "args": {
+        "task_id": 116,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#184",
+      "pid": 23,
+      "tid": 0,
+      "ts": 162673.083,
+      "dur": 68465.917,
+      "args": {
+        "task_id": 184,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#250",
+      "pid": 23,
+      "tid": 0,
+      "ts": 231165.5,
+      "dur": 48580.5,
+      "args": {
+        "task_id": 250,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#300",
+      "pid": 23,
+      "tid": 0,
+      "ts": 279769.583,
+      "dur": 58075.625,
+      "args": {
+        "task_id": 300,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#354",
+      "pid": 23,
+      "tid": 0,
+      "ts": 337874.916,
+      "dur": 73526.0,
+      "args": {
+        "task_id": 354,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#432",
+      "pid": 23,
+      "tid": 0,
+      "ts": 411488.541,
+      "dur": 23106.0,
+      "args": {
+        "task_id": 432,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "GEMM#458",
+      "pid": 23,
+      "tid": 0,
+      "ts": 434612.333,
+      "dur": 53234.667,
+      "args": {
+        "task_id": 458,
+        "func_id": 0,
+        "core": 23,
+        "mc": 0,
+        "name": "GEMM"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#67",
+      "pid": 0,
+      "tid": 1,
+      "ts": 108424.833,
+      "dur": 661.958,
+      "args": {
+        "task_id": 67,
+        "func_id": 1,
+        "core": 24,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#69",
+      "pid": 0,
+      "tid": 1,
+      "ts": 141488.541,
+      "dur": 335.709,
+      "args": {
+        "task_id": 69,
+        "func_id": 1,
+        "core": 24,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#71",
+      "pid": 0,
+      "tid": 1,
+      "ts": 141827.375,
+      "dur": 215.291,
+      "args": {
+        "task_id": 71,
+        "func_id": 1,
+        "core": 24,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#357",
+      "pid": 0,
+      "tid": 1,
+      "ts": 384519.25,
+      "dur": 254.125,
+      "args": {
+        "task_id": 357,
+        "func_id": 1,
+        "core": 24,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#393",
+      "pid": 0,
+      "tid": 1,
+      "ts": 419629.5,
+      "dur": 272.916,
+      "args": {
+        "task_id": 393,
+        "func_id": 1,
+        "core": 24,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#391",
+      "pid": 0,
+      "tid": 1,
+      "ts": 435066.958,
+      "dur": 132.583,
+      "args": {
+        "task_id": 391,
+        "func_id": 1,
+        "core": 24,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#133",
+      "pid": 0,
+      "tid": 2,
+      "ts": 153331.208,
+      "dur": 479.167,
+      "args": {
+        "task_id": 133,
+        "func_id": 1,
+        "core": 25,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#137",
+      "pid": 0,
+      "tid": 2,
+      "ts": 184318.958,
+      "dur": 238.333,
+      "args": {
+        "task_id": 137,
+        "func_id": 1,
+        "core": 25,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#135",
+      "pid": 0,
+      "tid": 2,
+      "ts": 184558.166,
+      "dur": 213.459,
+      "args": {
+        "task_id": 135,
+        "func_id": 1,
+        "core": 25,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#397",
+      "pid": 0,
+      "tid": 2,
+      "ts": 408414.333,
+      "dur": 136.458,
+      "args": {
+        "task_id": 397,
+        "func_id": 1,
+        "core": 25,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#429",
+      "pid": 0,
+      "tid": 2,
+      "ts": 435059.0,
+      "dur": 126.625,
+      "args": {
+        "task_id": 429,
+        "func_id": 1,
+        "core": 25,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#431",
+      "pid": 0,
+      "tid": 2,
+      "ts": 435186.583,
+      "dur": 99.583,
+      "args": {
+        "task_id": 431,
+        "func_id": 1,
+        "core": 25,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#229",
+      "pid": 1,
+      "tid": 1,
+      "ts": 285148.5,
+      "dur": 240.458,
+      "args": {
+        "task_id": 229,
+        "func_id": 1,
+        "core": 26,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#231",
+      "pid": 1,
+      "tid": 1,
+      "ts": 285389.458,
+      "dur": 96.792,
+      "args": {
+        "task_id": 231,
+        "func_id": 1,
+        "core": 26,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#233",
+      "pid": 1,
+      "tid": 1,
+      "ts": 285487.125,
+      "dur": 110.541,
+      "args": {
+        "task_id": 233,
+        "func_id": 1,
+        "core": 26,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#57",
+      "pid": 1,
+      "tid": 2,
+      "ts": 50585.041,
+      "dur": 404.042,
+      "args": {
+        "task_id": 57,
+        "func_id": 1,
+        "core": 27,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#59",
+      "pid": 1,
+      "tid": 2,
+      "ts": 86413.5,
+      "dur": 220.916,
+      "args": {
+        "task_id": 59,
+        "func_id": 1,
+        "core": 27,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#55",
+      "pid": 1,
+      "tid": 2,
+      "ts": 109363.875,
+      "dur": 97.25,
+      "args": {
+        "task_id": 55,
+        "func_id": 1,
+        "core": 27,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#329",
+      "pid": 1,
+      "tid": 2,
+      "ts": 336349.166,
+      "dur": 2710.459,
+      "args": {
+        "task_id": 329,
+        "func_id": 1,
+        "core": 27,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#301",
+      "pid": 1,
+      "tid": 2,
+      "ts": 339060.083,
+      "dur": 232.5,
+      "args": {
+        "task_id": 301,
+        "func_id": 1,
+        "core": 27,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#365",
+      "pid": 1,
+      "tid": 2,
+      "ts": 401726.583,
+      "dur": 134.125,
+      "args": {
+        "task_id": 365,
+        "func_id": 1,
+        "core": 27,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#49",
+      "pid": 2,
+      "tid": 1,
+      "ts": 108367.75,
+      "dur": 757.458,
+      "args": {
+        "task_id": 49,
+        "func_id": 1,
+        "core": 28,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#51",
+      "pid": 2,
+      "tid": 1,
+      "ts": 109125.666,
+      "dur": 96.834,
+      "args": {
+        "task_id": 51,
+        "func_id": 1,
+        "core": 28,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#53",
+      "pid": 2,
+      "tid": 1,
+      "ts": 109223.833,
+      "dur": 111.042,
+      "args": {
+        "task_id": 53,
+        "func_id": 1,
+        "core": 28,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#359",
+      "pid": 2,
+      "tid": 1,
+      "ts": 391132.708,
+      "dur": 131.792,
+      "args": {
+        "task_id": 359,
+        "func_id": 1,
+        "core": 28,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#361",
+      "pid": 2,
+      "tid": 1,
+      "ts": 391264.791,
+      "dur": 129.417,
+      "args": {
+        "task_id": 361,
+        "func_id": 1,
+        "core": 28,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#363",
+      "pid": 2,
+      "tid": 1,
+      "ts": 391397.833,
+      "dur": 98.5,
+      "args": {
+        "task_id": 363,
+        "func_id": 1,
+        "core": 28,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#261",
+      "pid": 2,
+      "tid": 2,
+      "ts": 274247.25,
+      "dur": 515.625,
+      "args": {
+        "task_id": 261,
+        "func_id": 1,
+        "core": 29,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#263",
+      "pid": 2,
+      "tid": 2,
+      "ts": 291234.625,
+      "dur": 303.375,
+      "args": {
+        "task_id": 263,
+        "func_id": 1,
+        "core": 29,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#259",
+      "pid": 2,
+      "tid": 2,
+      "ts": 307523.458,
+      "dur": 102.333,
+      "args": {
+        "task_id": 259,
+        "func_id": 1,
+        "core": 29,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#273",
+      "pid": 3,
+      "tid": 1,
+      "ts": 320994.583,
+      "dur": 472.875,
+      "args": {
+        "task_id": 273,
+        "func_id": 1,
+        "core": 30,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#275",
+      "pid": 3,
+      "tid": 1,
+      "ts": 321468.0,
+      "dur": 218.041,
+      "args": {
+        "task_id": 275,
+        "func_id": 1,
+        "core": 30,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#277",
+      "pid": 3,
+      "tid": 1,
+      "ts": 321688.041,
+      "dur": 236.5,
+      "args": {
+        "task_id": 277,
+        "func_id": 1,
+        "core": 30,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#139",
+      "pid": 3,
+      "tid": 2,
+      "ts": 202315.708,
+      "dur": 399.625,
+      "args": {
+        "task_id": 139,
+        "func_id": 1,
+        "core": 31,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#141",
+      "pid": 3,
+      "tid": 2,
+      "ts": 202715.75,
+      "dur": 233.833,
+      "args": {
+        "task_id": 141,
+        "func_id": 1,
+        "core": 31,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#143",
+      "pid": 3,
+      "tid": 2,
+      "ts": 202953.541,
+      "dur": 284.709,
+      "args": {
+        "task_id": 143,
+        "func_id": 1,
+        "core": 31,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#449",
+      "pid": 3,
+      "tid": 2,
+      "ts": 484212.416,
+      "dur": 450.584,
+      "args": {
+        "task_id": 449,
+        "func_id": 1,
+        "core": 31,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#451",
+      "pid": 3,
+      "tid": 2,
+      "ts": 484663.458,
+      "dur": 215.75,
+      "args": {
+        "task_id": 451,
+        "func_id": 1,
+        "core": 31,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#453",
+      "pid": 3,
+      "tid": 2,
+      "ts": 484882.166,
+      "dur": 236.417,
+      "args": {
+        "task_id": 453,
+        "func_id": 1,
+        "core": 31,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#285",
+      "pid": 4,
+      "tid": 1,
+      "ts": 307561.833,
+      "dur": 404.708,
+      "args": {
+        "task_id": 285,
+        "func_id": 1,
+        "core": 32,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#287",
+      "pid": 4,
+      "tid": 1,
+      "ts": 310037.791,
+      "dur": 100.167,
+      "args": {
+        "task_id": 287,
+        "func_id": 1,
+        "core": 32,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#283",
+      "pid": 4,
+      "tid": 1,
+      "ts": 320624.166,
+      "dur": 329.334,
+      "args": {
+        "task_id": 283,
+        "func_id": 1,
+        "core": 32,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#115",
+      "pid": 4,
+      "tid": 2,
+      "ts": 163293.25,
+      "dur": 405.416,
+      "args": {
+        "task_id": 115,
+        "func_id": 1,
+        "core": 33,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#117",
+      "pid": 4,
+      "tid": 2,
+      "ts": 163699.291,
+      "dur": 239.709,
+      "args": {
+        "task_id": 117,
+        "func_id": 1,
+        "core": 33,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#119",
+      "pid": 4,
+      "tid": 2,
+      "ts": 184303.25,
+      "dur": 217.125,
+      "args": {
+        "task_id": 119,
+        "func_id": 1,
+        "core": 33,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#413",
+      "pid": 4,
+      "tid": 2,
+      "ts": 473828.083,
+      "dur": 122.708,
+      "args": {
+        "task_id": 413,
+        "func_id": 1,
+        "core": 33,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#411",
+      "pid": 4,
+      "tid": 2,
+      "ts": 473951.0,
+      "dur": 96.375,
+      "args": {
+        "task_id": 411,
+        "func_id": 1,
+        "core": 33,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#427",
+      "pid": 4,
+      "tid": 2,
+      "ts": 474048.833,
+      "dur": 96.333,
+      "args": {
+        "task_id": 427,
+        "func_id": 1,
+        "core": 33,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#213",
+      "pid": 5,
+      "tid": 1,
+      "ts": 209078.25,
+      "dur": 226.25,
+      "args": {
+        "task_id": 213,
+        "func_id": 1,
+        "core": 34,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#215",
+      "pid": 5,
+      "tid": 1,
+      "ts": 255441.625,
+      "dur": 219.291,
+      "args": {
+        "task_id": 215,
+        "func_id": 1,
+        "core": 34,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#211",
+      "pid": 5,
+      "tid": 1,
+      "ts": 258939.958,
+      "dur": 109.958,
+      "args": {
+        "task_id": 211,
+        "func_id": 1,
+        "core": 34,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#471",
+      "pid": 5,
+      "tid": 1,
+      "ts": 488975.291,
+      "dur": 99.709,
+      "args": {
+        "task_id": 471,
+        "func_id": 1,
+        "core": 34,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#73",
+      "pid": 5,
+      "tid": 2,
+      "ts": 121273.875,
+      "dur": 415.791,
+      "args": {
+        "task_id": 73,
+        "func_id": 1,
+        "core": 35,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#75",
+      "pid": 5,
+      "tid": 2,
+      "ts": 121690.416,
+      "dur": 213.584,
+      "args": {
+        "task_id": 75,
+        "func_id": 1,
+        "core": 35,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#77",
+      "pid": 5,
+      "tid": 2,
+      "ts": 124383.0,
+      "dur": 359.458,
+      "args": {
+        "task_id": 77,
+        "func_id": 1,
+        "core": 35,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#373",
+      "pid": 5,
+      "tid": 2,
+      "ts": 401703.875,
+      "dur": 264.666,
+      "args": {
+        "task_id": 373,
+        "func_id": 1,
+        "core": 35,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#371",
+      "pid": 5,
+      "tid": 2,
+      "ts": 424346.25,
+      "dur": 244.75,
+      "args": {
+        "task_id": 371,
+        "func_id": 1,
+        "core": 35,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#379",
+      "pid": 5,
+      "tid": 2,
+      "ts": 424796.583,
+      "dur": 283.583,
+      "args": {
+        "task_id": 379,
+        "func_id": 1,
+        "core": 35,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#295",
+      "pid": 6,
+      "tid": 1,
+      "ts": 326371.583,
+      "dur": 234.917,
+      "args": {
+        "task_id": 295,
+        "func_id": 1,
+        "core": 36,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#297",
+      "pid": 6,
+      "tid": 1,
+      "ts": 352397.708,
+      "dur": 245.292,
+      "args": {
+        "task_id": 297,
+        "func_id": 1,
+        "core": 36,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#299",
+      "pid": 6,
+      "tid": 1,
+      "ts": 362772.708,
+      "dur": 213.292,
+      "args": {
+        "task_id": 299,
+        "func_id": 1,
+        "core": 36,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#163",
+      "pid": 6,
+      "tid": 2,
+      "ts": 191363.625,
+      "dur": 405.625,
+      "args": {
+        "task_id": 163,
+        "func_id": 1,
+        "core": 37,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#165",
+      "pid": 6,
+      "tid": 2,
+      "ts": 202234.583,
+      "dur": 253.375,
+      "args": {
+        "task_id": 165,
+        "func_id": 1,
+        "core": 37,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#167",
+      "pid": 6,
+      "tid": 2,
+      "ts": 236709.958,
+      "dur": 357.542,
+      "args": {
+        "task_id": 167,
+        "func_id": 1,
+        "core": 37,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#435",
+      "pid": 6,
+      "tid": 2,
+      "ts": 457045.25,
+      "dur": 99.75,
+      "args": {
+        "task_id": 435,
+        "func_id": 1,
+        "core": 37,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#445",
+      "pid": 6,
+      "tid": 2,
+      "ts": 474227.125,
+      "dur": 117.583,
+      "args": {
+        "task_id": 445,
+        "func_id": 1,
+        "core": 37,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#97",
+      "pid": 7,
+      "tid": 1,
+      "ts": 102170.333,
+      "dur": 454.375,
+      "args": {
+        "task_id": 97,
+        "func_id": 1,
+        "core": 38,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#99",
+      "pid": 7,
+      "tid": 1,
+      "ts": 126762.5,
+      "dur": 100.166,
+      "args": {
+        "task_id": 99,
+        "func_id": 1,
+        "core": 38,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#101",
+      "pid": 7,
+      "tid": 1,
+      "ts": 173283.166,
+      "dur": 256.375,
+      "args": {
+        "task_id": 101,
+        "func_id": 1,
+        "core": 38,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#343",
+      "pid": 7,
+      "tid": 1,
+      "ts": 374145.75,
+      "dur": 101.708,
+      "args": {
+        "task_id": 343,
+        "func_id": 1,
+        "core": 38,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#383",
+      "pid": 7,
+      "tid": 1,
+      "ts": 424452.583,
+      "dur": 104.25,
+      "args": {
+        "task_id": 383,
+        "func_id": 1,
+        "core": 38,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#421",
+      "pid": 7,
+      "tid": 1,
+      "ts": 457527.333,
+      "dur": 118.458,
+      "args": {
+        "task_id": 421,
+        "func_id": 1,
+        "core": 38,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#145",
+      "pid": 7,
+      "tid": 2,
+      "ts": 137317.375,
+      "dur": 502.333,
+      "args": {
+        "task_id": 145,
+        "func_id": 1,
+        "core": 39,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#149",
+      "pid": 7,
+      "tid": 2,
+      "ts": 157669.625,
+      "dur": 188.041,
+      "args": {
+        "task_id": 149,
+        "func_id": 1,
+        "core": 39,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#147",
+      "pid": 7,
+      "tid": 2,
+      "ts": 157858.583,
+      "dur": 98.167,
+      "args": {
+        "task_id": 147,
+        "func_id": 1,
+        "core": 39,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#389",
+      "pid": 7,
+      "tid": 2,
+      "ts": 401801.5,
+      "dur": 135.125,
+      "args": {
+        "task_id": 389,
+        "func_id": 1,
+        "core": 39,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#405",
+      "pid": 7,
+      "tid": 2,
+      "ts": 440873.083,
+      "dur": 123.875,
+      "args": {
+        "task_id": 405,
+        "func_id": 1,
+        "core": 39,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#407",
+      "pid": 7,
+      "tid": 2,
+      "ts": 462774.25,
+      "dur": 100.541,
+      "args": {
+        "task_id": 407,
+        "func_id": 1,
+        "core": 39,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#129",
+      "pid": 8,
+      "tid": 1,
+      "ts": 163317.583,
+      "dur": 497.375,
+      "args": {
+        "task_id": 129,
+        "func_id": 1,
+        "core": 40,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#131",
+      "pid": 8,
+      "tid": 1,
+      "ts": 163816.875,
+      "dur": 216.0,
+      "args": {
+        "task_id": 131,
+        "func_id": 1,
+        "core": 40,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#127",
+      "pid": 8,
+      "tid": 1,
+      "ts": 191908.458,
+      "dur": 118.0,
+      "args": {
+        "task_id": 127,
+        "func_id": 1,
+        "core": 40,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#417",
+      "pid": 8,
+      "tid": 1,
+      "ts": 435231.916,
+      "dur": 119.334,
+      "args": {
+        "task_id": 417,
+        "func_id": 1,
+        "core": 40,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#415",
+      "pid": 8,
+      "tid": 1,
+      "ts": 474183.041,
+      "dur": 102.542,
+      "args": {
+        "task_id": 415,
+        "func_id": 1,
+        "core": 40,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#437",
+      "pid": 8,
+      "tid": 1,
+      "ts": 484540.333,
+      "dur": 242.875,
+      "args": {
+        "task_id": 437,
+        "func_id": 1,
+        "core": 40,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#21",
+      "pid": 8,
+      "tid": 2,
+      "ts": 62493.25,
+      "dur": 402.416,
+      "args": {
+        "task_id": 21,
+        "func_id": 1,
+        "core": 41,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#23",
+      "pid": 8,
+      "tid": 2,
+      "ts": 86414.541,
+      "dur": 223.834,
+      "args": {
+        "task_id": 23,
+        "func_id": 1,
+        "core": 41,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#19",
+      "pid": 8,
+      "tid": 2,
+      "ts": 91656.416,
+      "dur": 99.042,
+      "args": {
+        "task_id": 19,
+        "func_id": 1,
+        "core": 41,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#327",
+      "pid": 8,
+      "tid": 2,
+      "ts": 373356.0,
+      "dur": 265.291,
+      "args": {
+        "task_id": 327,
+        "func_id": 1,
+        "core": 41,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#315",
+      "pid": 8,
+      "tid": 2,
+      "ts": 373621.708,
+      "dur": 213.333,
+      "args": {
+        "task_id": 315,
+        "func_id": 1,
+        "core": 41,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#341",
+      "pid": 8,
+      "tid": 2,
+      "ts": 373842.5,
+      "dur": 237.708,
+      "args": {
+        "task_id": 341,
+        "func_id": 1,
+        "core": 41,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#1",
+      "pid": 9,
+      "tid": 1,
+      "ts": 24333.541,
+      "dur": 227.875,
+      "args": {
+        "task_id": 1,
+        "func_id": 1,
+        "core": 42,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#5",
+      "pid": 9,
+      "tid": 1,
+      "ts": 24562.208,
+      "dur": 113.167,
+      "args": {
+        "task_id": 5,
+        "func_id": 1,
+        "core": 42,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#3",
+      "pid": 9,
+      "tid": 1,
+      "ts": 51072.291,
+      "dur": 232.125,
+      "args": {
+        "task_id": 3,
+        "func_id": 1,
+        "core": 42,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#281",
+      "pid": 9,
+      "tid": 1,
+      "ts": 310506.958,
+      "dur": 113.125,
+      "args": {
+        "task_id": 281,
+        "func_id": 1,
+        "core": 42,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#279",
+      "pid": 9,
+      "tid": 1,
+      "ts": 324567.041,
+      "dur": 201.5,
+      "args": {
+        "task_id": 279,
+        "func_id": 1,
+        "core": 42,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#303",
+      "pid": 9,
+      "tid": 1,
+      "ts": 340543.208,
+      "dur": 262.5,
+      "args": {
+        "task_id": 303,
+        "func_id": 1,
+        "core": 42,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#85",
+      "pid": 9,
+      "tid": 2,
+      "ts": 74452.25,
+      "dur": 452.291,
+      "args": {
+        "task_id": 85,
+        "func_id": 1,
+        "core": 43,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#89",
+      "pid": 9,
+      "tid": 2,
+      "ts": 108444.833,
+      "dur": 262.833,
+      "args": {
+        "task_id": 89,
+        "func_id": 1,
+        "core": 43,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#87",
+      "pid": 9,
+      "tid": 2,
+      "ts": 110636.333,
+      "dur": 100.708,
+      "args": {
+        "task_id": 87,
+        "func_id": 1,
+        "core": 43,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#325",
+      "pid": 9,
+      "tid": 2,
+      "ts": 362394.833,
+      "dur": 251.958,
+      "args": {
+        "task_id": 325,
+        "func_id": 1,
+        "core": 43,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#367",
+      "pid": 9,
+      "tid": 2,
+      "ts": 408417.166,
+      "dur": 103.375,
+      "args": {
+        "task_id": 367,
+        "func_id": 1,
+        "core": 43,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#355",
+      "pid": 9,
+      "tid": 2,
+      "ts": 419526.416,
+      "dur": 241.0,
+      "args": {
+        "task_id": 355,
+        "func_id": 1,
+        "core": 43,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#237",
+      "pid": 10,
+      "tid": 1,
+      "ts": 255385.0,
+      "dur": 445.166,
+      "args": {
+        "task_id": 237,
+        "func_id": 1,
+        "core": 44,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#239",
+      "pid": 10,
+      "tid": 1,
+      "ts": 270738.833,
+      "dur": 244.833,
+      "args": {
+        "task_id": 239,
+        "func_id": 1,
+        "core": 44,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#235",
+      "pid": 10,
+      "tid": 1,
+      "ts": 291130.333,
+      "dur": 100.292,
+      "args": {
+        "task_id": 235,
+        "func_id": 1,
+        "core": 44,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#169",
+      "pid": 10,
+      "tid": 2,
+      "ts": 206709.375,
+      "dur": 217.75,
+      "args": {
+        "task_id": 169,
+        "func_id": 1,
+        "core": 45,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#171",
+      "pid": 10,
+      "tid": 2,
+      "ts": 206927.25,
+      "dur": 96.958,
+      "args": {
+        "task_id": 171,
+        "func_id": 1,
+        "core": 45,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#173",
+      "pid": 10,
+      "tid": 2,
+      "ts": 217176.75,
+      "dur": 256.333,
+      "args": {
+        "task_id": 173,
+        "func_id": 1,
+        "core": 45,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#461",
+      "pid": 10,
+      "tid": 2,
+      "ts": 484195.083,
+      "dur": 353.083,
+      "args": {
+        "task_id": 461,
+        "func_id": 1,
+        "core": 45,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#473",
+      "pid": 10,
+      "tid": 2,
+      "ts": 484550.5,
+      "dur": 234.166,
+      "args": {
+        "task_id": 473,
+        "func_id": 1,
+        "core": 45,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#459",
+      "pid": 10,
+      "tid": 2,
+      "ts": 488296.958,
+      "dur": 250.375,
+      "args": {
+        "task_id": 459,
+        "func_id": 1,
+        "core": 45,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#13",
+      "pid": 11,
+      "tid": 1,
+      "ts": 91204.333,
+      "dur": 236.875,
+      "args": {
+        "task_id": 13,
+        "func_id": 1,
+        "core": 46,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#15",
+      "pid": 11,
+      "tid": 1,
+      "ts": 91441.5,
+      "dur": 96.166,
+      "args": {
+        "task_id": 15,
+        "func_id": 1,
+        "core": 46,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#17",
+      "pid": 11,
+      "tid": 1,
+      "ts": 91539.083,
+      "dur": 115.0,
+      "args": {
+        "task_id": 17,
+        "func_id": 1,
+        "core": 46,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#335",
+      "pid": 11,
+      "tid": 1,
+      "ts": 362462.916,
+      "dur": 221.834,
+      "args": {
+        "task_id": 335,
+        "func_id": 1,
+        "core": 46,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#337",
+      "pid": 11,
+      "tid": 1,
+      "ts": 401813.416,
+      "dur": 124.459,
+      "args": {
+        "task_id": 337,
+        "func_id": 1,
+        "core": 46,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#339",
+      "pid": 11,
+      "tid": 1,
+      "ts": 401940.916,
+      "dur": 98.25,
+      "args": {
+        "task_id": 339,
+        "func_id": 1,
+        "core": 46,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#189",
+      "pid": 11,
+      "tid": 2,
+      "ts": 191017.5,
+      "dur": 1002.791,
+      "args": {
+        "task_id": 189,
+        "func_id": 1,
+        "core": 47,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#191",
+      "pid": 11,
+      "tid": 2,
+      "ts": 207548.458,
+      "dur": 106.708,
+      "args": {
+        "task_id": 191,
+        "func_id": 1,
+        "core": 47,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#187",
+      "pid": 11,
+      "tid": 2,
+      "ts": 241400.333,
+      "dur": 104.458,
+      "args": {
+        "task_id": 187,
+        "func_id": 1,
+        "core": 47,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#443",
+      "pid": 11,
+      "tid": 2,
+      "ts": 484804.458,
+      "dur": 231.958,
+      "args": {
+        "task_id": 443,
+        "func_id": 1,
+        "core": 47,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#465",
+      "pid": 11,
+      "tid": 2,
+      "ts": 496858.958,
+      "dur": 119.375,
+      "args": {
+        "task_id": 465,
+        "func_id": 1,
+        "core": 47,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#81",
+      "pid": 12,
+      "tid": 1,
+      "ts": 102164.166,
+      "dur": 474.417,
+      "args": {
+        "task_id": 81,
+        "func_id": 1,
+        "core": 48,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#83",
+      "pid": 12,
+      "tid": 1,
+      "ts": 124371.083,
+      "dur": 603.0,
+      "args": {
+        "task_id": 83,
+        "func_id": 1,
+        "core": 48,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#79",
+      "pid": 12,
+      "tid": 1,
+      "ts": 137344.208,
+      "dur": 234.458,
+      "args": {
+        "task_id": 79,
+        "func_id": 1,
+        "core": 48,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#345",
+      "pid": 12,
+      "tid": 1,
+      "ts": 362372.25,
+      "dur": 255.041,
+      "args": {
+        "task_id": 345,
+        "func_id": 1,
+        "core": 48,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#381",
+      "pid": 12,
+      "tid": 1,
+      "ts": 419599.125,
+      "dur": 262.0,
+      "args": {
+        "task_id": 381,
+        "func_id": 1,
+        "core": 48,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#387",
+      "pid": 12,
+      "tid": 1,
+      "ts": 440859.958,
+      "dur": 106.5,
+      "args": {
+        "task_id": 387,
+        "func_id": 1,
+        "core": 48,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#153",
+      "pid": 12,
+      "tid": 2,
+      "ts": 153300.166,
+      "dur": 473.0,
+      "args": {
+        "task_id": 153,
+        "func_id": 1,
+        "core": 49,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#151",
+      "pid": 12,
+      "tid": 2,
+      "ts": 164116.416,
+      "dur": 270.0,
+      "args": {
+        "task_id": 151,
+        "func_id": 1,
+        "core": 49,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#155",
+      "pid": 12,
+      "tid": 2,
+      "ts": 202287.375,
+      "dur": 246.458,
+      "args": {
+        "task_id": 155,
+        "func_id": 1,
+        "core": 49,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#395",
+      "pid": 12,
+      "tid": 2,
+      "ts": 457688.833,
+      "dur": 326.417,
+      "args": {
+        "task_id": 395,
+        "func_id": 1,
+        "core": 49,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#419",
+      "pid": 12,
+      "tid": 2,
+      "ts": 474325.083,
+      "dur": 177.333,
+      "args": {
+        "task_id": 419,
+        "func_id": 1,
+        "core": 49,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#447",
+      "pid": 12,
+      "tid": 2,
+      "ts": 487007.916,
+      "dur": 221.709,
+      "args": {
+        "task_id": 447,
+        "func_id": 1,
+        "core": 49,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#37",
+      "pid": 13,
+      "tid": 1,
+      "ts": 60552.833,
+      "dur": 402.708,
+      "args": {
+        "task_id": 37,
+        "func_id": 1,
+        "core": 50,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#39",
+      "pid": 13,
+      "tid": 1,
+      "ts": 60956.208,
+      "dur": 211.208,
+      "args": {
+        "task_id": 39,
+        "func_id": 1,
+        "core": 50,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#41",
+      "pid": 13,
+      "tid": 1,
+      "ts": 61170.75,
+      "dur": 231.916,
+      "args": {
+        "task_id": 41,
+        "func_id": 1,
+        "core": 50,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#305",
+      "pid": 13,
+      "tid": 1,
+      "ts": 391086.125,
+      "dur": 254.375,
+      "args": {
+        "task_id": 305,
+        "func_id": 1,
+        "core": 50,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#307",
+      "pid": 13,
+      "tid": 1,
+      "ts": 391341.0,
+      "dur": 217.583,
+      "args": {
+        "task_id": 307,
+        "func_id": 1,
+        "core": 50,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#309",
+      "pid": 13,
+      "tid": 1,
+      "ts": 391564.375,
+      "dur": 235.916,
+      "args": {
+        "task_id": 309,
+        "func_id": 1,
+        "core": 50,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#91",
+      "pid": 13,
+      "tid": 2,
+      "ts": 111174.25,
+      "dur": 232.5,
+      "args": {
+        "task_id": 91,
+        "func_id": 1,
+        "core": 51,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#93",
+      "pid": 13,
+      "tid": 2,
+      "ts": 121870.416,
+      "dur": 234.584,
+      "args": {
+        "task_id": 93,
+        "func_id": 1,
+        "core": 51,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#95",
+      "pid": 13,
+      "tid": 2,
+      "ts": 122106.791,
+      "dur": 210.875,
+      "args": {
+        "task_id": 95,
+        "func_id": 1,
+        "core": 51,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#375",
+      "pid": 13,
+      "tid": 2,
+      "ts": 408348.583,
+      "dur": 274.417,
+      "args": {
+        "task_id": 375,
+        "func_id": 1,
+        "core": 51,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#369",
+      "pid": 13,
+      "tid": 2,
+      "ts": 420571.291,
+      "dur": 272.625,
+      "args": {
+        "task_id": 369,
+        "func_id": 1,
+        "core": 51,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#377",
+      "pid": 13,
+      "tid": 2,
+      "ts": 424629.25,
+      "dur": 120.125,
+      "args": {
+        "task_id": 377,
+        "func_id": 1,
+        "core": 51,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#253",
+      "pid": 14,
+      "tid": 1,
+      "ts": 255423.75,
+      "dur": 487.5,
+      "args": {
+        "task_id": 253,
+        "func_id": 1,
+        "core": 52,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#257",
+      "pid": 14,
+      "tid": 1,
+      "ts": 301745.375,
+      "dur": 264.458,
+      "args": {
+        "task_id": 257,
+        "func_id": 1,
+        "core": 52,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#255",
+      "pid": 14,
+      "tid": 1,
+      "ts": 302012.166,
+      "dur": 215.0,
+      "args": {
+        "task_id": 255,
+        "func_id": 1,
+        "core": 52,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#45",
+      "pid": 14,
+      "tid": 2,
+      "ts": 73111.375,
+      "dur": 428.791,
+      "args": {
+        "task_id": 45,
+        "func_id": 1,
+        "core": 53,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#43",
+      "pid": 14,
+      "tid": 2,
+      "ts": 73542.0,
+      "dur": 215.25,
+      "args": {
+        "task_id": 43,
+        "func_id": 1,
+        "core": 53,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#47",
+      "pid": 14,
+      "tid": 2,
+      "ts": 73757.833,
+      "dur": 210.417,
+      "args": {
+        "task_id": 47,
+        "func_id": 1,
+        "core": 53,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#321",
+      "pid": 14,
+      "tid": 2,
+      "ts": 353278.666,
+      "dur": 286.084,
+      "args": {
+        "task_id": 321,
+        "func_id": 1,
+        "core": 53,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#323",
+      "pid": 14,
+      "tid": 2,
+      "ts": 353566.75,
+      "dur": 212.583,
+      "args": {
+        "task_id": 323,
+        "func_id": 1,
+        "core": 53,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#319",
+      "pid": 14,
+      "tid": 2,
+      "ts": 363970.208,
+      "dur": 214.208,
+      "args": {
+        "task_id": 319,
+        "func_id": 1,
+        "core": 53,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#33",
+      "pid": 15,
+      "tid": 1,
+      "ts": 61581.083,
+      "dur": 410.75,
+      "args": {
+        "task_id": 33,
+        "func_id": 1,
+        "core": 54,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#35",
+      "pid": 15,
+      "tid": 1,
+      "ts": 61993.291,
+      "dur": 212.0,
+      "args": {
+        "task_id": 35,
+        "func_id": 1,
+        "core": 54,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#31",
+      "pid": 15,
+      "tid": 1,
+      "ts": 88686.625,
+      "dur": 210.875,
+      "args": {
+        "task_id": 31,
+        "func_id": 1,
+        "core": 54,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#313",
+      "pid": 15,
+      "tid": 1,
+      "ts": 352370.041,
+      "dur": 270.75,
+      "args": {
+        "task_id": 313,
+        "func_id": 1,
+        "core": 54,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#333",
+      "pid": 15,
+      "tid": 1,
+      "ts": 352642.625,
+      "dur": 230.541,
+      "args": {
+        "task_id": 333,
+        "func_id": 1,
+        "core": 54,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#311",
+      "pid": 15,
+      "tid": 1,
+      "ts": 401789.5,
+      "dur": 111.125,
+      "args": {
+        "task_id": 311,
+        "func_id": 1,
+        "core": 54,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#181",
+      "pid": 15,
+      "tid": 2,
+      "ts": 207172.5,
+      "dur": 252.458,
+      "args": {
+        "task_id": 181,
+        "func_id": 1,
+        "core": 55,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#183",
+      "pid": 15,
+      "tid": 2,
+      "ts": 224178.166,
+      "dur": 109.417,
+      "args": {
+        "task_id": 183,
+        "func_id": 1,
+        "core": 55,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#185",
+      "pid": 15,
+      "tid": 2,
+      "ts": 236762.541,
+      "dur": 250.959,
+      "args": {
+        "task_id": 185,
+        "func_id": 1,
+        "core": 55,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#467",
+      "pid": 15,
+      "tid": 2,
+      "ts": 496994.208,
+      "dur": 98.75,
+      "args": {
+        "task_id": 467,
+        "func_id": 1,
+        "core": 55,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#269",
+      "pid": 16,
+      "tid": 1,
+      "ts": 301790.583,
+      "dur": 492.417,
+      "args": {
+        "task_id": 269,
+        "func_id": 1,
+        "core": 56,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#271",
+      "pid": 16,
+      "tid": 1,
+      "ts": 302285.291,
+      "dur": 214.0,
+      "args": {
+        "task_id": 271,
+        "func_id": 1,
+        "core": 56,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#267",
+      "pid": 16,
+      "tid": 1,
+      "ts": 302502.583,
+      "dur": 210.542,
+      "args": {
+        "task_id": 267,
+        "func_id": 1,
+        "core": 56,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#241",
+      "pid": 16,
+      "tid": 2,
+      "ts": 320677.666,
+      "dur": 578.084,
+      "args": {
+        "task_id": 241,
+        "func_id": 1,
+        "core": 57,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#243",
+      "pid": 16,
+      "tid": 2,
+      "ts": 321256.375,
+      "dur": 284.125,
+      "args": {
+        "task_id": 243,
+        "func_id": 1,
+        "core": 57,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#245",
+      "pid": 16,
+      "tid": 2,
+      "ts": 321544.541,
+      "dur": 236.542,
+      "args": {
+        "task_id": 245,
+        "func_id": 1,
+        "core": 57,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#25",
+      "pid": 17,
+      "tid": 1,
+      "ts": 39923.625,
+      "dur": 408.458,
+      "args": {
+        "task_id": 25,
+        "func_id": 1,
+        "core": 58,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#27",
+      "pid": 17,
+      "tid": 1,
+      "ts": 73201.25,
+      "dur": 217.958,
+      "args": {
+        "task_id": 27,
+        "func_id": 1,
+        "core": 58,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#29",
+      "pid": 17,
+      "tid": 1,
+      "ts": 88406.75,
+      "dur": 241.083,
+      "args": {
+        "task_id": 29,
+        "func_id": 1,
+        "core": 58,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#289",
+      "pid": 17,
+      "tid": 1,
+      "ts": 301673.75,
+      "dur": 276.791,
+      "args": {
+        "task_id": 289,
+        "func_id": 1,
+        "core": 58,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#331",
+      "pid": 17,
+      "tid": 1,
+      "ts": 340308.208,
+      "dur": 103.625,
+      "args": {
+        "task_id": 331,
+        "func_id": 1,
+        "core": 58,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#317",
+      "pid": 17,
+      "tid": 1,
+      "ts": 362431.541,
+      "dur": 286.042,
+      "args": {
+        "task_id": 317,
+        "func_id": 1,
+        "core": 58,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#121",
+      "pid": 17,
+      "tid": 2,
+      "ts": 191161.333,
+      "dur": 464.958,
+      "args": {
+        "task_id": 121,
+        "func_id": 1,
+        "core": 59,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#125",
+      "pid": 17,
+      "tid": 2,
+      "ts": 191630.5,
+      "dur": 244.791,
+      "args": {
+        "task_id": 125,
+        "func_id": 1,
+        "core": 59,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#123",
+      "pid": 17,
+      "tid": 2,
+      "ts": 220213.25,
+      "dur": 222.416,
+      "args": {
+        "task_id": 123,
+        "func_id": 1,
+        "core": 59,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#441",
+      "pid": 17,
+      "tid": 2,
+      "ts": 474178.416,
+      "dur": 434.709,
+      "args": {
+        "task_id": 441,
+        "func_id": 1,
+        "core": 59,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#439",
+      "pid": 17,
+      "tid": 2,
+      "ts": 484958.208,
+      "dur": 101.208,
+      "args": {
+        "task_id": 439,
+        "func_id": 1,
+        "core": 59,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#477",
+      "pid": 17,
+      "tid": 2,
+      "ts": 488271.916,
+      "dur": 258.0,
+      "args": {
+        "task_id": 477,
+        "func_id": 1,
+        "core": 59,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#61",
+      "pid": 18,
+      "tid": 1,
+      "ts": 102153.041,
+      "dur": 486.292,
+      "args": {
+        "task_id": 61,
+        "func_id": 1,
+        "core": 60,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#63",
+      "pid": 18,
+      "tid": 1,
+      "ts": 102639.625,
+      "dur": 211.291,
+      "args": {
+        "task_id": 63,
+        "func_id": 1,
+        "core": 60,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#65",
+      "pid": 18,
+      "tid": 1,
+      "ts": 102853.0,
+      "dur": 245.708,
+      "args": {
+        "task_id": 65,
+        "func_id": 1,
+        "core": 60,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#347",
+      "pid": 18,
+      "tid": 1,
+      "ts": 384625.916,
+      "dur": 233.667,
+      "args": {
+        "task_id": 347,
+        "func_id": 1,
+        "core": 60,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#349",
+      "pid": 18,
+      "tid": 1,
+      "ts": 408263.458,
+      "dur": 121.833,
+      "args": {
+        "task_id": 349,
+        "func_id": 1,
+        "core": 60,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#351",
+      "pid": 18,
+      "tid": 1,
+      "ts": 408555.5,
+      "dur": 225.458,
+      "args": {
+        "task_id": 351,
+        "func_id": 1,
+        "core": 60,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#177",
+      "pid": 18,
+      "tid": 2,
+      "ts": 206621.583,
+      "dur": 216.333,
+      "args": {
+        "task_id": 177,
+        "func_id": 1,
+        "core": 61,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#179",
+      "pid": 18,
+      "tid": 2,
+      "ts": 206838.5,
+      "dur": 98.458,
+      "args": {
+        "task_id": 179,
+        "func_id": 1,
+        "core": 61,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#175",
+      "pid": 18,
+      "tid": 2,
+      "ts": 224058.375,
+      "dur": 100.291,
+      "args": {
+        "task_id": 175,
+        "func_id": 1,
+        "core": 61,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#457",
+      "pid": 18,
+      "tid": 2,
+      "ts": 455992.125,
+      "dur": 112.708,
+      "args": {
+        "task_id": 457,
+        "func_id": 1,
+        "core": 61,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#455",
+      "pid": 18,
+      "tid": 2,
+      "ts": 485126.083,
+      "dur": 99.958,
+      "args": {
+        "task_id": 455,
+        "func_id": 1,
+        "core": 61,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#479",
+      "pid": 18,
+      "tid": 2,
+      "ts": 488664.0,
+      "dur": 101.625,
+      "args": {
+        "task_id": 479,
+        "func_id": 1,
+        "core": 61,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#105",
+      "pid": 19,
+      "tid": 1,
+      "ts": 102812.875,
+      "dur": 510.5,
+      "args": {
+        "task_id": 105,
+        "func_id": 1,
+        "core": 62,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#107",
+      "pid": 19,
+      "tid": 1,
+      "ts": 137159.25,
+      "dur": 300.25,
+      "args": {
+        "task_id": 107,
+        "func_id": 1,
+        "core": 62,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#103",
+      "pid": 19,
+      "tid": 1,
+      "ts": 183261.791,
+      "dur": 245.875,
+      "args": {
+        "task_id": 103,
+        "func_id": 1,
+        "core": 62,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#353",
+      "pid": 19,
+      "tid": 1,
+      "ts": 391015.75,
+      "dur": 293.166,
+      "args": {
+        "task_id": 353,
+        "func_id": 1,
+        "core": 62,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#385",
+      "pid": 19,
+      "tid": 1,
+      "ts": 434999.458,
+      "dur": 141.0,
+      "args": {
+        "task_id": 385,
+        "func_id": 1,
+        "core": 62,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#425",
+      "pid": 19,
+      "tid": 1,
+      "ts": 435141.458,
+      "dur": 117.625,
+      "args": {
+        "task_id": 425,
+        "func_id": 1,
+        "core": 62,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#199",
+      "pid": 19,
+      "tid": 2,
+      "ts": 241295.25,
+      "dur": 481.208,
+      "args": {
+        "task_id": 199,
+        "func_id": 1,
+        "core": 63,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#201",
+      "pid": 19,
+      "tid": 2,
+      "ts": 241777.916,
+      "dur": 244.0,
+      "args": {
+        "task_id": 201,
+        "func_id": 1,
+        "core": 63,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#203",
+      "pid": 19,
+      "tid": 2,
+      "ts": 242025.375,
+      "dur": 219.083,
+      "args": {
+        "task_id": 203,
+        "func_id": 1,
+        "core": 63,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#157",
+      "pid": 20,
+      "tid": 1,
+      "ts": 161478.916,
+      "dur": 413.709,
+      "args": {
+        "task_id": 157,
+        "func_id": 1,
+        "core": 64,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#161",
+      "pid": 20,
+      "tid": 1,
+      "ts": 182115.625,
+      "dur": 122.708,
+      "args": {
+        "task_id": 161,
+        "func_id": 1,
+        "core": 64,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#159",
+      "pid": 20,
+      "tid": 1,
+      "ts": 220199.125,
+      "dur": 219.166,
+      "args": {
+        "task_id": 159,
+        "func_id": 1,
+        "core": 64,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#409",
+      "pid": 20,
+      "tid": 1,
+      "ts": 434980.833,
+      "dur": 129.375,
+      "args": {
+        "task_id": 409,
+        "func_id": 1,
+        "core": 64,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#475",
+      "pid": 20,
+      "tid": 1,
+      "ts": 484818.75,
+      "dur": 217.75,
+      "args": {
+        "task_id": 475,
+        "func_id": 1,
+        "core": 64,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#423",
+      "pid": 20,
+      "tid": 1,
+      "ts": 487892.375,
+      "dur": 222.25,
+      "args": {
+        "task_id": 423,
+        "func_id": 1,
+        "core": 64,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#9",
+      "pid": 20,
+      "tid": 2,
+      "ts": 21466.833,
+      "dur": 418.5,
+      "args": {
+        "task_id": 9,
+        "func_id": 1,
+        "core": 65,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#7",
+      "pid": 20,
+      "tid": 2,
+      "ts": 40521.208,
+      "dur": 212.875,
+      "args": {
+        "task_id": 7,
+        "func_id": 1,
+        "core": 65,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#11",
+      "pid": 20,
+      "tid": 2,
+      "ts": 40756.916,
+      "dur": 212.292,
+      "args": {
+        "task_id": 11,
+        "func_id": 1,
+        "core": 65,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#265",
+      "pid": 20,
+      "tid": 2,
+      "ts": 302213.333,
+      "dur": 249.167,
+      "args": {
+        "task_id": 265,
+        "func_id": 1,
+        "core": 65,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#293",
+      "pid": 20,
+      "tid": 2,
+      "ts": 320521.291,
+      "dur": 534.042,
+      "args": {
+        "task_id": 293,
+        "func_id": 1,
+        "core": 65,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#291",
+      "pid": 20,
+      "tid": 2,
+      "ts": 336365.5,
+      "dur": 349.0,
+      "args": {
+        "task_id": 291,
+        "func_id": 1,
+        "core": 65,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#223",
+      "pid": 21,
+      "tid": 1,
+      "ts": 257707.083,
+      "dur": 710.0,
+      "args": {
+        "task_id": 223,
+        "func_id": 1,
+        "core": 66,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#225",
+      "pid": 21,
+      "tid": 1,
+      "ts": 259856.916,
+      "dur": 122.084,
+      "args": {
+        "task_id": 225,
+        "func_id": 1,
+        "core": 66,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#227",
+      "pid": 21,
+      "tid": 1,
+      "ts": 259979.75,
+      "dur": 96.041,
+      "args": {
+        "task_id": 227,
+        "func_id": 1,
+        "core": 66,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#217",
+      "pid": 21,
+      "tid": 2,
+      "ts": 241570.791,
+      "dur": 339.125,
+      "args": {
+        "task_id": 217,
+        "func_id": 1,
+        "core": 67,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#219",
+      "pid": 21,
+      "tid": 2,
+      "ts": 241911.041,
+      "dur": 221.917,
+      "args": {
+        "task_id": 219,
+        "func_id": 1,
+        "core": 67,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#221",
+      "pid": 21,
+      "tid": 2,
+      "ts": 255476.333,
+      "dur": 300.708,
+      "args": {
+        "task_id": 221,
+        "func_id": 1,
+        "core": 67,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#249",
+      "pid": 22,
+      "tid": 1,
+      "ts": 285143.333,
+      "dur": 245.083,
+      "args": {
+        "task_id": 249,
+        "func_id": 1,
+        "core": 68,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#251",
+      "pid": 22,
+      "tid": 1,
+      "ts": 285389.166,
+      "dur": 96.667,
+      "args": {
+        "task_id": 251,
+        "func_id": 1,
+        "core": 68,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#247",
+      "pid": 22,
+      "tid": 1,
+      "ts": 324435.458,
+      "dur": 100.208,
+      "args": {
+        "task_id": 247,
+        "func_id": 1,
+        "core": 68,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#193",
+      "pid": 22,
+      "tid": 2,
+      "ts": 190983.875,
+      "dur": 656.5,
+      "args": {
+        "task_id": 193,
+        "func_id": 1,
+        "core": 69,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#195",
+      "pid": 22,
+      "tid": 2,
+      "ts": 207541.75,
+      "dur": 110.041,
+      "args": {
+        "task_id": 195,
+        "func_id": 1,
+        "core": 69,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#197",
+      "pid": 22,
+      "tid": 2,
+      "ts": 208488.666,
+      "dur": 118.959,
+      "args": {
+        "task_id": 197,
+        "func_id": 1,
+        "core": 69,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#433",
+      "pid": 22,
+      "tid": 2,
+      "ts": 435261.333,
+      "dur": 123.708,
+      "args": {
+        "task_id": 433,
+        "func_id": 1,
+        "core": 69,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#469",
+      "pid": 22,
+      "tid": 2,
+      "ts": 488566.708,
+      "dur": 237.083,
+      "args": {
+        "task_id": 469,
+        "func_id": 1,
+        "core": 69,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#463",
+      "pid": 22,
+      "tid": 2,
+      "ts": 493542.916,
+      "dur": 99.584,
+      "args": {
+        "task_id": 463,
+        "func_id": 1,
+        "core": 69,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#109",
+      "pid": 23,
+      "tid": 1,
+      "ts": 153282.25,
+      "dur": 493.0,
+      "args": {
+        "task_id": 109,
+        "func_id": 1,
+        "core": 70,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#111",
+      "pid": 23,
+      "tid": 1,
+      "ts": 153775.875,
+      "dur": 222.083,
+      "args": {
+        "task_id": 111,
+        "func_id": 1,
+        "core": 70,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#113",
+      "pid": 23,
+      "tid": 1,
+      "ts": 154000.208,
+      "dur": 237.458,
+      "args": {
+        "task_id": 113,
+        "func_id": 1,
+        "core": 70,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#401",
+      "pid": 23,
+      "tid": 1,
+      "ts": 435124.541,
+      "dur": 130.459,
+      "args": {
+        "task_id": 401,
+        "func_id": 1,
+        "core": 70,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#403",
+      "pid": 23,
+      "tid": 1,
+      "ts": 435255.666,
+      "dur": 101.5,
+      "args": {
+        "task_id": 403,
+        "func_id": 1,
+        "core": 70,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#399",
+      "pid": 23,
+      "tid": 1,
+      "ts": 460914.541,
+      "dur": 101.792,
+      "args": {
+        "task_id": 399,
+        "func_id": 1,
+        "core": 70,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#205",
+      "pid": 23,
+      "tid": 2,
+      "ts": 257757.791,
+      "dur": 565.167,
+      "args": {
+        "task_id": 205,
+        "func_id": 1,
+        "core": 71,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#207",
+      "pid": 23,
+      "tid": 2,
+      "ts": 258324.208,
+      "dur": 321.167,
+      "args": {
+        "task_id": 207,
+        "func_id": 1,
+        "core": 71,
+        "mc": 0,
+        "name": "ADD"
+      }
+    },
+    {
+      "ph": "X",
+      "name": "ADD#209",
+      "pid": 23,
+      "tid": 2,
+      "ts": 258647.541,
+      "dur": 121.584,
+      "args": {
+        "task_id": 209,
+        "func_id": 1,
+        "core": 71,
+        "mc": 0,
+        "name": "ADD"
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/docs/fully_distributed_within_core/swimlane_bgemm_fullcore.png b/docs/fully_distributed_within_core/swimlane_bgemm_fullcore.png
new file mode 100644
index 000000000..8a9e9a026
Binary files /dev/null and b/docs/fully_distributed_within_core/swimlane_bgemm_fullcore.png differ
diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp
new file mode 100644
index 000000000..55d69ce34
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "pto/common/pto_tile.hpp"
+
+#include "tensor.h"
+
+using namespace pto;
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *result_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ int32_t *notify_counter = reinterpret_cast<__gm__ int32_t *>(args[3]);
+
+    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+    __gm__ float *result = reinterpret_cast<__gm__ float *>(result_tensor->buffer.addr) + result_tensor->start_offset;
+
+    constexpr int kRows = 128;
+    constexpr int kCols = 128;
+    using DynShapeDim5 = Shape<1, 1, 1, kRows, kCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kCols, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kRows, kCols, BLayout::RowMajor, -1, -1>;
+
+    TileData src_tile(kRows, kCols);
+    TileData dst_tile(kRows, kCols);
+    TASSIGN(src_tile, 0x0);
+    TASSIGN(dst_tile, 0x10000);
+
+    GlobalData src_global(src);
+    GlobalData dst_global(result);
+    TLOAD(src_tile, src_global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADDS(dst_tile, src_tile, static_cast<float>(*notify_counter));
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dst_global, dst_tile);
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp
new file mode 100644
index 000000000..bc8f1cd86
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "pto_async_kernel_api.h"
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    uint64_t notify_counter_addr = static_cast<uint64_t>(args[1]);
+    uint32_t expected_value = static_cast<uint32_t>(args[2]);
+    AsyncCtx async_ctx = get_async_ctx(args);
+    save_expected_notification_counter(
+        async_ctx, reinterpret_cast<volatile __gm__ void *>(notify_counter_addr), expected_value
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp
new file mode 100644
index 000000000..1cd3fb7ec
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include <pto/pto-inst.hpp>
+
+#include "platform_comm/comm_context.h"
+#include "pto_async_kernel_api.h"
+#include "tensor.h"
+
+using namespace pto;
+
+template <typename T>
+static inline __aicore__ __gm__ T *comm_remote_ptr(__gm__ CommContext *ctx, __gm__ T *local_ptr, int peer_rank) {
+    uint64_t local_base = ctx->windowsIn[ctx->rankId];
+    uint64_t offset = reinterpret_cast<uint64_t>(local_ptr) - local_base;
+    return reinterpret_cast<__gm__ T *>(ctx->windowsIn[peer_rank] + offset);
+}
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *in_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ int32_t *local_counter = reinterpret_cast<__gm__ int32_t *>(args[2]);
+    __gm__ CommContext *comm_ctx = reinterpret_cast<__gm__ CommContext *>(args[3]);
+
+    __gm__ float *in_data = reinterpret_cast<__gm__ float *>(in_tensor->buffer.addr) + in_tensor->start_offset;
+    __gm__ float *out_data = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int my_rank = static_cast<int>(comm_ctx->rankId);
+    int peer_rank = 1 - my_rank;
+
+    constexpr int kRows = 128;
+    constexpr int kCols = 128;
+    using DynShapeDim5 = Shape<1, 1, 1, kRows, kCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kCols, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kRows, kCols, BLayout::RowMajor, -1, -1>;
+
+    TileData in_tile(kRows, kCols);
+    TileData out_tile(kRows, kCols);
+    TASSIGN(in_tile, 0x0);
+    TASSIGN(out_tile, 0x10000);
+
+    GlobalData in_global(in_data);
+    GlobalData out_global(out_data);
+    TLOAD(in_tile, in_global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(out_tile, in_tile, in_tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(out_global, out_tile);
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+
+    if (my_rank == 1) {
+        for (volatile int i = 0; i < 2000000; ++i) {}
+    }
+
+    __gm__ int32_t *remote_counter = comm_remote_ptr(comm_ctx, local_counter, peer_rank);
+    send_notification(remote_counter, 1, pto::comm::NotifyOp::AtomicAdd);
+    pipe_barrier(PIPE_ALL);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp
new file mode 100644
index 000000000..59e1cc23c
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+
+#include "platform_comm/comm_context.h"
+#include "pto_orchestration_api.h"
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+async_notify_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{.expected_arg_count = 5};
+}
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    return async_notify_orchestration_config(orch_args);
+}
+
+__attribute__((visibility("default"))) void async_notify_orchestration(const L2TaskArgs &orch_args) {
+    if (orch_args.tensor_count() + orch_args.scalar_count() != 5) {
+        LOG_ERROR("async_notify_demo: expected 5 args");
+        return;
+    }
+
+    const Tensor &input = orch_args.tensor(0).ref();
+    const Tensor &output = orch_args.tensor(1).ref();
+    const Tensor &result = orch_args.tensor(2).ref();
+    const Tensor &notify_counter = orch_args.tensor(3).ref();
+    auto *comm_ctx = reinterpret_cast<CommContext *>(static_cast<uintptr_t>(orch_args.scalar(0)));
+
+    L0TaskArgs params_producer;
+    params_producer.add_input(input);
+    params_producer.add_output(output);
+    params_producer.add_scalar(notify_counter.buffer.addr);
+    params_producer.add_scalar(reinterpret_cast<uint64_t>(comm_ctx));
+    rt_submit_aiv_task(0, params_producer);
+
+    uint32_t notify_token_shape[1] = {1};
+    TensorCreateInfo notify_token_info(notify_token_shape, 1, DataType::INT32);
+    L0TaskArgs params_notify;
+    params_notify.add_output(notify_token_info);
+    params_notify.add_scalar(notify_counter.buffer.addr);
+    params_notify.add_scalar(static_cast<uint64_t>(1));
+    TaskOutputTensors notify_outputs = rt_submit_aiv_task(2, params_notify);
+    Tensor notify_token = notify_outputs.get_ref(0);
+
+    L0TaskArgs params_consumer;
+    params_consumer.add_input(notify_token);
+    params_consumer.add_input(output);
+    params_consumer.add_output(result);
+    params_consumer.add_scalar(notify_counter.buffer.addr);
+    rt_submit_aiv_task(1, params_consumer);
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py b/examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py
new file mode 100644
index 000000000..df462249a
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Notification counter + deferred completion smoke test for onboard a2a3."""
+
+from __future__ import annotations
+
+import argparse
+import os
+
+import torch
+from simpler.task_interface import (
+    ArgDirection,
+    CallConfig,
+    ChipCallable,
+    CommBufferSpec,
+    CoreCallable,
+    DataType,
+    TaskArgs,
+    Tensor,
+    TensorArgType,
+)
+from simpler.worker import Worker
+
+from simpler_setup.elf_parser import extract_text_section
+from simpler_setup.kernel_compiler import KernelCompiler
+from simpler_setup.pto_isa import ensure_pto_isa_root
+from simpler_setup.torch_interop import make_tensor_arg
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+N = 128 * 128
+
+
+def parse_device_range(spec: str) -> list[int]:
+    if "," in spec:
+        return [int(x) for x in spec.split(",") if x]
+    if "-" in spec:
+        lo, hi = (int(x) for x in spec.split("-"))
+        return list(range(lo, hi + 1))
+    return [int(spec)]
+
+
+def build_chip_callable(platform: str, pto_isa_commit: str | None, clone_protocol: str) -> ChipCallable:
+    kc = KernelCompiler(platform=platform)
+    runtime = "fully_distributed_within_core"
+    pto_isa_root = ensure_pto_isa_root(commit=pto_isa_commit, clone_protocol=clone_protocol)
+    include_dirs = kc.get_orchestration_include_dirs(runtime)
+    extra_includes = list(include_dirs) + [str(kc.project_root / "src" / "common")]
+
+    children = []
+    for func_id, rel in [
+        (0, "kernels/aiv/kernel_producer_notify.cpp"),
+        (1, "kernels/aiv/kernel_consumer.cpp"),
+        (2, "kernels/aiv/kernel_notify_wait.cpp"),
+    ]:
+        kernel = kc.compile_incore(
+            source_path=os.path.join(HERE, rel),
+            core_type="aiv",
+            pto_isa_root=pto_isa_root,
+            extra_include_dirs=extra_includes,
+        )
+        if not platform.endswith("sim"):
+            kernel = extract_text_section(kernel)
+        children.append(
+            (
+                func_id,
+                CoreCallable.build(
+                    signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN],
+                    binary=kernel,
+                ),
+            )
+        )
+
+    orch = kc.compile_orchestration(
+        runtime_name=runtime,
+        source_path=os.path.join(HERE, "kernels/orchestration/async_notify_orchestration.cpp"),
+        extra_include_dirs=[str(kc.project_root / "src" / "common")],
+    )
+    return ChipCallable.build(
+        signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN],
+        func_name="async_notify_orchestration",
+        binary=orch,
+        children=children,
+    )
+
+
+def run(
+    platform: str = "a2a3",
+    device_ids: list[int] | None = None,
+    pto_isa_commit: str | None = None,
+) -> int:
+    if device_ids is None:
+        device_ids = [0, 1]
+    nranks = len(device_ids)
+    if nranks != 2:
+        raise ValueError(f"async_notify_demo needs exactly 2 devices, got {device_ids}")
+
+    inp = [
+        torch.tensor([float(i % 251) / 10.0 for i in range(N)], dtype=torch.float32).share_memory_()
+        for _ in range(nranks)
+    ]
+    out = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)]
+    result = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)]
+
+    chip_callable = build_chip_callable(platform, pto_isa_commit, "https")
+    worker = Worker(
+        level=3,
+        platform=platform,
+        runtime="fully_distributed_within_core",
+        device_ids=device_ids,
+        num_sub_workers=0,
+    )
+    chip_handle = worker.register(chip_callable)
+    try:
+        worker.init()
+
+        def orch_fn(orch, _args, cfg):
+            with orch.allocate_domain(
+                name="default",
+                workers=list(range(nranks)),
+                window_size=4 * 1024,
+                buffers=[CommBufferSpec(name="notify_counter", dtype="int32", count=1, nbytes=4)],
+            ) as handle:
+                for rank in range(nranks):
+                    domain = handle[rank]
+                    args = TaskArgs()
+                    args.add_tensor(make_tensor_arg(inp[rank]), TensorArgType.INPUT)
+                    args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING)
+                    args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING)
+                    args.add_tensor(
+                        Tensor.make(
+                            data=domain.buffer_ptrs["notify_counter"],
+                            shapes=(1,),
+                            dtype=DataType.INT32,
+                            child_memory=True,
+                        ),
+                        TensorArgType.INPUT,
+                    )
+                    args.add_scalar(domain.device_ctx)
+                    orch.submit_next_level(chip_handle, args, cfg, worker=rank)
+
+        worker.run(orch_fn, args=None, config=CallConfig())
+
+        ok = True
+        for rank in range(nranks):
+            expected_out = inp[rank] * 2.0
+            expected_result = expected_out + 1.0
+            max_out = float(torch.max(torch.abs(out[rank] - expected_out)))
+            max_result = float(torch.max(torch.abs(result[rank] - expected_result)))
+            print(f"[async_notify_demo] rank {rank}: max_out={max_out:.3e} max_result={max_result:.3e}")
+            ok = ok and max_out <= 1e-3 and max_result <= 1e-3
+        return 0 if ok else 1
+    finally:
+        worker.close()
+
+
+def test_async_notify_demo() -> None:
+    assert run("a2a3", [0, 1]) == 0
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-p", "--platform", default="a2a3")
+    parser.add_argument("-d", "--device", default="0-1")
+    parser.add_argument("--pto-isa-commit", default=None)
+    args = parser.parse_args()
+    return run(args.platform, parse_device_range(args.device), args.pto_isa_commit)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp
new file mode 100644
index 000000000..1f331d6e0
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Tile-based Matrix Multiplication Kernel (Cube Core)
+ *
+ * Computes: output = input_a @ input_b (tile_size x tile_size tile matmul)
+ * Uses TMATMUL instruction
+ *
+ * Tile size is determined by golden.py configuration and passed through
+ * tensor shapes from orchestration.
+ *
+ * Args (Tensor*):
+ *   args[0] = input_a (INPUT)
+ *   args[1] = input_b (INPUT)
+ *   args[2] = output  (OUTPUT)
+ *   args[3] = config  (INPUT) - int64_t[4]: [tile_size, grid_k, num_groups, incore_loop]
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+#include <pto/common/pto_tile.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2) {
+    if (num_2 == 0) {
+        return 0;
+    }
+    return (num_1 + num_2 - 1) / num_2 * num_2;
+}
+
+template <int TILE>
+static __aicore__ void gemm_tile_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) {
+    constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float);
+    constexpr int M = CeilAlign<int>(TILE, 16);
+    constexpr int K = CeilAlign<int>(TILE, blockAlign);
+    constexpr int N = CeilAlign<int>(TILE, blockAlign);
+
+    using GlobalDataA =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataB =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataC =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+
+    GlobalDataA src0Global(input_a);
+    GlobalDataB src1Global(input_b);
+    GlobalDataC dstGlobal(output);
+
+    using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<float, M, K, TILE, TILE>;
+    using RightTile = TileRight<float, K, N, TILE, TILE>;
+    using AccTile = TileAcc<float, M, N, TILE, TILE>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    TLOAD(aMatTile, src0Global);
+    TLOAD(bMatTile, src1Global);
+
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+    TMOV(aTile, aMatTile);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(dstGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[3]);
+
+    __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr);
+    uint64_t tile_size = static_cast<uint64_t>(cfg[0]);
+    uint64_t tile_elems = tile_size * tile_size;
+    int num_tiles = static_cast<uint64_t>(cfg[3]);
+
+    __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset;
+    __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset;
+    __gm__ float *base_c = reinterpret_cast<__gm__ float *>(output->buffer.addr) + output->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *a_ptr = base_a + (tile_idx * tile_elems);
+        __gm__ float *b_ptr = base_b + (tile_idx * tile_elems);
+        __gm__ float *c_ptr = base_c + (tile_idx * tile_elems);
+
+        switch (tile_size) {
+        case 16:
+            gemm_tile_impl<16>(a_ptr, b_ptr, c_ptr);
+            break;
+        case 32:
+            gemm_tile_impl<32>(a_ptr, b_ptr, c_ptr);
+            break;
+        case 64:
+            gemm_tile_impl<64>(a_ptr, b_ptr, c_ptr);
+            break;
+        case 128:
+            gemm_tile_impl<128>(a_ptr, b_ptr, c_ptr);
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp
new file mode 100644
index 000000000..c80e88244
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Tile-based Element-wise Addition Kernel (Vector Core) - INOUT Pattern
+ *
+ * Computes: C_tile = C_tile + P (tile_size x tile_size tile accumulation)
+ * Uses TADD instruction
+ *
+ * Tile size is determined by golden.py configuration and passed through
+ * tensor shapes from orchestration.
+ *
+ * Args (Tensor*):
+ *   args[0] = C_tile (INOUT: read + write accumulator)
+ *   args[1] = P      (INPUT: matmul result to accumulate)
+ *   args[2] = config (INPUT) - int64_t[4]: [tile_size, grid_k, num_groups, incore_loop]
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int TILE>
+static __aicore__ void tile_add_impl(__gm__ float *c_ptr, __gm__ float *p_ptr) {
+    using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>;
+    using DynStridDim5 = Stride<1, 1, 1, TILE, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
+
+    TileData cTile(TILE, TILE);
+    TileData pTile(TILE, TILE);
+    TileData outTile(TILE, TILE);
+    TASSIGN(cTile, 0x0);
+    TASSIGN(pTile, 0x10000);
+    TASSIGN(outTile, 0x20000);
+
+    GlobalData cGlobal(c_ptr);
+    GlobalData pGlobal(p_ptr);
+    GlobalData outGlobal(c_ptr);  // write back to same C location
+
+    TLOAD(cTile, cGlobal);
+    TLOAD(pTile, pGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(outTile, cTile, pTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(outGlobal, outTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr);
+    uint64_t tile_size = static_cast<uint64_t>(cfg[0]);
+    uint64_t tile_elems = tile_size * tile_size;
+    int num_tiles = static_cast<int>(cfg[3]);
+
+    __gm__ float *base_c = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset;
+    __gm__ float *base_p = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *c_ptr = base_c + (tile_idx * tile_elems);
+        __gm__ float *p_ptr = base_p + (tile_idx * tile_elems);
+
+        switch (tile_size) {
+        case 16:
+            tile_add_impl<16>(c_ptr, p_ptr);
+            break;
+        case 32:
+            tile_add_impl<32>(c_ptr, p_ptr);
+            break;
+        case 64:
+            tile_add_impl<64>(c_ptr, p_ptr);
+            break;
+        case 128:
+            tile_add_impl<128>(c_ptr, p_ptr);
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
new file mode 100644
index 000000000..dcfc11340
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * BGEMM Orchestration Function (tensormap_and_ringbuffer Runtime)
+ *
+ * Builds the task graph for tiled matrix multiplication: C = A @ B
+ *
+ * Configuration read from scalar args (set in golden.py):
+ *   - tile_size: tile dimension (tile_size x tile_size per tile)
+ *   - grid_k: number of K-dimension partitions
+ *   - num_groups: number of independent groups (= matmul_add_task_num / grid_k)
+ *   - incore_loop: number of tiles per group
+ *
+ * Memory layout (tile-first, flattened):
+ *   A: [num_groups, grid_k, incore_loop, tile_size, tile_size]
+ *   B: [num_groups, grid_k, incore_loop, tile_size, tile_size]
+ *   C: [incore_loop * num_groups, tile_size, tile_size]
+ *
+ * Arg layout: [A, B, C, config]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_GEMM_TILE 0
+#define FUNC_TILE_ADD 1
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 4,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // Tensor args
+    const Tensor &ext_A = orch_args.tensor(0).ref();
+    const Tensor &ext_B = orch_args.tensor(1).ref();
+    const Tensor &ext_C = orch_args.tensor(2).ref();
+    const Tensor &ext_config = orch_args.tensor(3).ref();
+
+    // Read config from tensor data: [tile_size, grid_k, num_groups, incore_loop]
+    int64_t *host_config = orch_args.tensor(3).ref().data_as<int64_t>();
+    int tile_size = static_cast<int>(host_config[0]);
+    int grid_k = static_cast<int>(host_config[1]);
+    int num_groups = static_cast<int>(host_config[2]);
+    int incore_loop = static_cast<int>(host_config[3]);
+    uint64_t tile_elems = static_cast<uint64_t>(tile_size) * tile_size;
+
+    int grid_m = 1;
+    int grid_n = 1;
+
+    LOG_INFO_V0(
+        "[bgemm_orch] tile_size: %d, grid_m: %d, grid_n: %d, grid_k: %d, num_groups: %d, incore_loop: %d", tile_size,
+        grid_m, grid_n, grid_k, num_groups, incore_loop
+    );
+
+    uint32_t tile_shapes[1] = {static_cast<uint32_t>(tile_elems)};
+    uint64_t group_tile_elems = static_cast<uint64_t>(incore_loop) * tile_elems;
+    uint32_t group_shapes[1] = {static_cast<uint32_t>(group_tile_elems)};
+    TensorCreateInfo group_ci(group_shapes, 1, DataType::FLOAT32);
+
+    int total_gemm = 0;
+    int total_add = 0;
+
+    // A/B layout: [num_groups, grid_k, incore_loop, tile_size, tile_size]
+    // C layout:   [incore_loop * num_groups, tile_size, tile_size]
+    for (int group_idx = 0; group_idx < num_groups; group_idx++) {
+        PTO2_SCOPE_GUARD();
+
+        uint32_t c_elem_offset = static_cast<uint32_t>(static_cast<uint64_t>(group_idx) * group_tile_elems);
+        uint32_t c_view_offsets[1] = {c_elem_offset};
+        Tensor C_view = ext_C.view(group_shapes, c_view_offsets);
+
+        for (int k_idx = 0; k_idx < grid_k; k_idx++) {
+            // In layout [num_groups, grid_k, incore_loop, tile_size, tile_size],
+            // offset = (group_idx * grid_k + k_idx) * incore_loop * tile_elems
+            uint64_t ab_offset =
+                (static_cast<uint64_t>(group_idx) * grid_k + static_cast<uint64_t>(k_idx)) * group_tile_elems;
+
+            uint32_t a_view_offsets[1] = {static_cast<uint32_t>(ab_offset)};
+            Tensor A_view = ext_A.view(group_shapes, a_view_offsets);
+            uint32_t b_view_offsets[1] = {static_cast<uint32_t>(ab_offset)};
+            Tensor B_view = ext_B.view(group_shapes, b_view_offsets);
+            L0TaskArgs params_gemm;
+            params_gemm.add_input(A_view);
+            params_gemm.add_input(B_view);
+            params_gemm.add_output(group_ci);
+            params_gemm.add_input(ext_config);
+            TaskOutputTensors gemm_outs = rt_submit_aic_task(FUNC_GEMM_TILE, params_gemm);
+            total_gemm++;
+
+            L0TaskArgs params_add;
+            params_add.add_inout(C_view);
+            params_add.add_input(gemm_outs.get_ref(0));
+            params_add.add_input(ext_config);
+            rt_submit_aiv_task(FUNC_TILE_ADD, params_add);
+            total_add++;
+        }
+    }
+
+    LOG_INFO_V0(
+        "[bgemm_orch] Submitted %d gemm tasks and %d add tasks (%d total)", total_gemm, total_add,
+        total_gemm + total_add
+    );
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py
new file mode 100644
index 000000000..a6c457171
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Benchmark BGEMM: runtime-configurable tiled matmul C = sum(k) A[k] @ B[k]."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestBenchmarkBgemm(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/bgemm_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.IN],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "GEMM",
+                "source": "kernels/aic/kernel_gemm_tile.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "ADD",
+                "source": "kernels/aiv/kernel_tile_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.INOUT, D.IN],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case0",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"matmul_add_task_num": 500, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2},
+        },
+        {
+            "name": "Case1",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2},
+        },
+        {
+            "name": "Case2",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"matmul_add_task_num": 256, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2},
+        },
+        {
+            "name": "Case3",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 16, "grid_k": 2},
+        },
+        {
+            "name": "Case4",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 4, "grid_k": 4},
+        },
+        {
+            "name": "Bgemm64",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {"matmul_add_task_num": 32, "incore_data_size": 64, "incore_loop": 1, "grid_k": 4},
+        },
+        {
+            # Full-core swimlane visualization: block_dim == a2a3sim capacity
+            # (PLATFORM_MAX_BLOCKDIM=24 → 24 AIC + 48 AIV = 72 cores). 240 GEMM
+            # (1C) + 240 ADD (1V) tasks so every one of the 24 AIC blocks gets
+            # ~10 GEMMs, filling all lanes. Manual (opt-in) so it does not slow
+            # the default suite. Capture a swimlane with:
+            #   PTO_DIST_SWIMLANE=$PWD/outputs/dist_swimlane/bgemm_fullcore.json \
+            #     python test_benchmark_bgemm.py -p a2a3sim --case FullCore24 --manual include
+            #   python -m simpler_setup.tools.dist_swimlane_render \
+            #     outputs/dist_swimlane/bgemm_fullcore.json --names 0=GEMM,1=ADD
+            "name": "FullCore24",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"matmul_add_task_num": 240, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2},
+        },
+    ]
+
+    def generate_args(self, params):
+        tile_size = params["incore_data_size"]
+        incore_loop = params["incore_loop"]
+        grid_k = params["grid_k"]
+        num_groups = params["matmul_add_task_num"] // grid_k
+        A = torch.randn(num_groups, grid_k, incore_loop, tile_size, tile_size, dtype=torch.float32) * 0.01
+        B = torch.randn(num_groups, grid_k, incore_loop, tile_size, tile_size, dtype=torch.float32) * 0.01
+        C = torch.zeros(incore_loop * num_groups, tile_size, tile_size, dtype=torch.float32)
+        config = torch.tensor([tile_size, grid_k, num_groups, incore_loop], dtype=torch.int64)
+        return TaskArgsBuilder(
+            Tensor("A", A.flatten()), Tensor("B", B.flatten()), Tensor("C", C.flatten()), Tensor("config", config)
+        )
+
+    def compute_golden(self, args, params):
+        tile_size = params["incore_data_size"]
+        incore_loop = params["incore_loop"]
+        grid_k = params["grid_k"]
+        num_groups = params["matmul_add_task_num"] // grid_k
+        A = args.A.reshape(num_groups, grid_k, incore_loop, tile_size, tile_size)
+        B = args.B.reshape(num_groups, grid_k, incore_loop, tile_size, tile_size)
+        C = args.C.reshape(incore_loop * num_groups, tile_size, tile_size)
+        C[:] = 0.0
+        for group in range(num_groups):
+            for k_idx in range(grid_k):
+                for i in range(incore_loop):
+                    C[group * incore_loop + i] += torch.matmul(A[group, k_idx, i], B[group, k_idx, i])
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp
new file mode 100644
index 000000000..b860b7223
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#include <pto/pto-inst.hpp>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include "tensor.h"
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mailbox_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *result_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *mailbox =
+        reinterpret_cast<__gm__ float *>(mailbox_tensor->buffer.addr) + mailbox_tensor->start_offset;
+    __gm__ float *result = reinterpret_cast<__gm__ float *>(result_tensor->buffer.addr) + result_tensor->start_offset;
+
+    uint32_t n = static_cast<uint32_t>(result_tensor->shapes[0]);
+    for (uint32_t i = 0; i < n; ++i) {
+        result[i] = mailbox[i];
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp
new file mode 100644
index 000000000..2a4d5cbf2
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#include <pto/pto-inst.hpp>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include "pto_async_kernel_api.h"
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    uint64_t counter_addr = static_cast<uint64_t>(args[1]);
+    uint32_t expected_value = static_cast<uint32_t>(args[2]);
+    AsyncCtx async_ctx = get_async_ctx(args);
+    save_expected_notification_counter(
+        async_ctx, reinterpret_cast<volatile __gm__ void *>(counter_addr), expected_value
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp
new file mode 100644
index 000000000..f846b313f
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#include <pto/pto-inst.hpp>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include "platform_comm/comm_context.h"
+#include "pto_async_kernel_api.h"
+#include "tensor.h"
+
+template <typename T>
+static inline __aicore__ __gm__ T *comm_remote_ptr(__gm__ CommContext *ctx, __gm__ T *local_ptr, int peer_rank) {
+    uint64_t local_base = ctx->windowsIn[ctx->rankId];
+    uint64_t offset = reinterpret_cast<uint64_t>(local_ptr) - local_base;
+    return reinterpret_cast<__gm__ T *>(ctx->windowsIn[peer_rank] + offset);
+}
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *partial_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *mailbox_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ int32_t *local_counter = reinterpret_cast<__gm__ int32_t *>(args[3]);
+    __gm__ CommContext *ctx = reinterpret_cast<__gm__ CommContext *>(args[4]);
+
+    __gm__ float *partial =
+        reinterpret_cast<__gm__ float *>(partial_tensor->buffer.addr) + partial_tensor->start_offset;
+    __gm__ float *mailbox =
+        reinterpret_cast<__gm__ float *>(mailbox_tensor->buffer.addr) + mailbox_tensor->start_offset;
+
+    int peer_rank = (static_cast<int>(ctx->rankId) + 1) % static_cast<int>(ctx->rankNum);
+    __gm__ float *peer_mailbox = comm_remote_ptr(ctx, mailbox, peer_rank);
+    uint32_t n = static_cast<uint32_t>(partial_tensor->shapes[0]);
+    for (uint32_t i = 0; i < n; ++i) {
+        peer_mailbox[i] = partial[i];
+    }
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    dcci((__gm__ int32_t *)peer_mailbox, ENTIRE_DATA_CACHE, CACHELINE_OUT);
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+    pipe_barrier(PIPE_ALL);
+#endif
+
+    __gm__ int32_t *peer_counter = comm_remote_ptr(ctx, local_counter, peer_rank);
+    send_notification(peer_counter, 1, pto::comm::NotifyOp::AtomicAdd);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp
new file mode 100644
index 000000000..7a5af06c8
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+
+#include "platform_comm/comm_context.h"
+#include "pto_orchestration_api.h"
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+deferred_notify_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{.expected_arg_count = 5};
+}
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    return deferred_notify_orchestration_config(orch_args);
+}
+
+__attribute__((visibility("default"))) void deferred_notify_orchestration(const L2TaskArgs &orch_args) {
+    if (orch_args.tensor_count() + orch_args.scalar_count() != 5) {
+        LOG_ERROR("deferred_notify_demo: expected 5 args");
+        return;
+    }
+
+    const Tensor &partial = orch_args.tensor(0).ref();
+    const Tensor &mailbox = orch_args.tensor(1).ref();
+    const Tensor &result = orch_args.tensor(2).ref();
+    const Tensor &notify_counter = orch_args.tensor(3).ref();
+    auto *comm_ctx = reinterpret_cast<CommContext *>(static_cast<uintptr_t>(orch_args.scalar(0)));
+
+    uint32_t shapes[1] = {128 * 128};
+    TensorCreateInfo producer_output_info(shapes, 1, DataType::FLOAT32);
+    L0TaskArgs params_producer;
+    params_producer.add_input(partial);
+    params_producer.add_inout(mailbox);
+    params_producer.add_output(producer_output_info);
+    params_producer.add_scalar(notify_counter.buffer.addr);
+    params_producer.add_scalar(reinterpret_cast<uint64_t>(comm_ctx));
+    rt_submit_aiv_task(0, params_producer);
+
+    uint32_t notify_token_shape[1] = {1};
+    TensorCreateInfo notify_token_info(notify_token_shape, 1, DataType::INT32);
+    L0TaskArgs params_notify;
+    params_notify.add_output(notify_token_info);
+    params_notify.add_scalar(notify_counter.buffer.addr);
+    params_notify.add_scalar(static_cast<uint64_t>(1));
+    TaskOutputTensors notify_outputs = rt_submit_aiv_task(2, params_notify);
+    Tensor notify_token = notify_outputs.get_ref(0);
+
+    L0TaskArgs params_consumer;
+    params_consumer.add_input(notify_token);
+    params_consumer.add_input(mailbox);
+    params_consumer.add_output(result);
+    params_consumer.add_scalar(notify_counter.buffer.addr);
+    rt_submit_aiv_task(1, params_consumer);
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py
new file mode 100644
index 000000000..873871776
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""L2 deferred completion + two-chip comm smoke test for a2a3sim."""
+
+from __future__ import annotations
+
+import argparse
+import os
+
+import torch
+from simpler.task_interface import (
+    ArgDirection,
+    CallConfig,
+    ChipCallable,
+    CommBufferSpec,
+    CoreCallable,
+    DataType,
+    TaskArgs,
+    Tensor,
+    TensorArgType,
+)
+from simpler.worker import Worker
+
+from simpler_setup.elf_parser import extract_text_section
+from simpler_setup.kernel_compiler import KernelCompiler
+from simpler_setup.pto_isa import ensure_pto_isa_root
+from simpler_setup.torch_interop import make_tensor_arg
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+N = 128 * 128
+DTYPE_NBYTES = 4
+
+
+def parse_device_range(spec: str) -> list[int]:
+    if "," in spec:
+        return [int(x) for x in spec.split(",") if x]
+    if "-" in spec:
+        lo, hi = (int(x) for x in spec.split("-"))
+        return list(range(lo, hi + 1))
+    return [int(spec)]
+
+
+def build_chip_callable(platform: str, pto_isa_commit: str | None, clone_protocol: str) -> ChipCallable:
+    kc = KernelCompiler(platform=platform)
+    runtime = "fully_distributed_within_core"
+    pto_isa_root = ensure_pto_isa_root(commit=pto_isa_commit, clone_protocol=clone_protocol)
+    include_dirs = kc.get_orchestration_include_dirs(runtime)
+    extra_includes = list(include_dirs) + [str(kc.project_root / "src" / "common")]
+
+    children = []
+    for func_id, rel in [
+        (0, "kernels/aiv/kernel_producer.cpp"),
+        (1, "kernels/aiv/kernel_consumer.cpp"),
+        (2, "kernels/aiv/kernel_notify_wait.cpp"),
+    ]:
+        kernel = kc.compile_incore(
+            source_path=os.path.join(HERE, rel),
+            core_type="aiv",
+            pto_isa_root=pto_isa_root,
+            extra_include_dirs=extra_includes,
+        )
+        if not platform.endswith("sim"):
+            kernel = extract_text_section(kernel)
+        children.append(
+            (
+                func_id,
+                CoreCallable.build(
+                    signature=[ArgDirection.IN, ArgDirection.INOUT, ArgDirection.OUT, ArgDirection.IN],
+                    binary=kernel,
+                ),
+            )
+        )
+
+    orch = kc.compile_orchestration(
+        runtime_name=runtime,
+        source_path=os.path.join(HERE, "kernels/orchestration/deferred_notify_orch.cpp"),
+        extra_include_dirs=[str(kc.project_root / "src" / "common")],
+    )
+    return ChipCallable.build(
+        signature=[ArgDirection.IN, ArgDirection.INOUT, ArgDirection.OUT, ArgDirection.IN],
+        func_name="deferred_notify_orchestration",
+        binary=orch,
+        children=children,
+    )
+
+
+def run(
+    platform: str = "a2a3sim",
+    device_ids: list[int] | None = None,
+    pto_isa_commit: str | None = None,
+) -> int:
+    if device_ids is None:
+        device_ids = [0, 1]
+    nranks = len(device_ids)
+    if nranks != 2:
+        raise ValueError(f"deferred_notify_demo needs exactly 2 devices, got {device_ids}")
+
+    mailbox_nbytes = N * DTYPE_NBYTES
+    counter_nbytes = 4
+    window_size = max(mailbox_nbytes + counter_nbytes, 4 * 1024)
+
+    partial = [torch.full((N,), float(rank + 1), dtype=torch.float32).share_memory_() for rank in range(nranks)]
+    result = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)]
+
+    chip_callable = build_chip_callable(platform, pto_isa_commit, "https")
+    worker = Worker(
+        level=3,
+        platform=platform,
+        runtime="fully_distributed_within_core",
+        device_ids=device_ids,
+        num_sub_workers=0,
+    )
+    chip_handle = worker.register(chip_callable)
+    try:
+        worker.init()
+
+        def orch_fn(orch, _args, cfg):
+            # `notify_counter` must start at 0; allocate_domain zero-initializes
+            # the whole window, so no explicit host seed is needed.
+            with orch.allocate_domain(
+                name="default",
+                workers=list(range(nranks)),
+                window_size=window_size,
+                buffers=[
+                    CommBufferSpec(name="mailbox", dtype="float32", count=N, nbytes=mailbox_nbytes),
+                    CommBufferSpec(name="notify_counter", dtype="int32", count=1, nbytes=counter_nbytes),
+                ],
+            ) as handle:
+                for rank in range(nranks):
+                    domain = handle[rank]
+                    args = TaskArgs()
+                    args.add_tensor(make_tensor_arg(partial[rank]), TensorArgType.INPUT)
+                    args.add_tensor(
+                        Tensor.make(
+                            data=domain.buffer_ptrs["mailbox"],
+                            shapes=(N,),
+                            dtype=DataType.FLOAT32,
+                            child_memory=True,
+                        ),
+                        TensorArgType.INOUT,
+                    )
+                    args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING)
+                    args.add_tensor(
+                        Tensor.make(
+                            data=domain.buffer_ptrs["notify_counter"],
+                            shapes=(1,),
+                            dtype=DataType.INT32,
+                            child_memory=True,
+                        ),
+                        TensorArgType.INPUT,
+                    )
+                    args.add_scalar(domain.device_ctx)
+                    orch.submit_next_level(chip_handle, args, cfg, worker=rank)
+
+        worker.run(orch_fn, args=None, config=CallConfig())
+
+        ok = True
+        for rank in range(nranks):
+            expected = partial[(rank + 1) % nranks]
+            max_diff = float(torch.max(torch.abs(result[rank] - expected)))
+            print(f"[deferred_notify_demo] rank {rank}: max_diff={max_diff:.3e}")
+            ok = ok and max_diff <= 1e-6
+        return 0 if ok else 1
+    finally:
+        worker.close()
+
+
+def test_deferred_notify_demo() -> None:
+    assert run("a2a3sim", [0, 1]) == 0
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-p", "--platform", default="a2a3sim")
+    parser.add_argument("-d", "--device", default="0-1")
+    parser.add_argument("--pto-isa-commit", default=None)
+    args = parser.parse_args()
+    return run(args.platform, parse_device_range(args.device), args.pto_isa_commit)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp
new file mode 100644
index 000000000..123c44f65
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * MIX co-ownership test — AIC (cube) subtask: Cmm = A @ B (single tile).
+ *
+ * This is the AIC lane of a 1C+2V MIX task. All three lanes share one argument
+ * list; each lane writes ITS OWN designated output by fixed index:
+ *
+ *   args[0] = A     (INPUT)
+ *   args[1] = B     (INPUT)
+ *   args[2] = Cmm   (INOUT, external)   <- this AIC lane writes here
+ *   args[3] = V0    (OUTPUT, heap)      <- AIV0 lane
+ *   args[4] = V1    (OUTPUT, heap)      <- AIV1 lane
+ *   args[5] = config (INPUT) int64_t[4]: [tile_size, grid_k, num_groups, num_tiles]
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+#include <pto/common/pto_tile.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2) {
+    if (num_2 == 0) {
+        return 0;
+    }
+    return (num_1 + num_2 - 1) / num_2 * num_2;
+}
+
+template <int TILE>
+static __aicore__ void mm_tile_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) {
+    constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float);
+    constexpr int M = CeilAlign<int>(TILE, 16);
+    constexpr int K = CeilAlign<int>(TILE, blockAlign);
+    constexpr int N = CeilAlign<int>(TILE, blockAlign);
+
+    using GlobalDataA =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataB =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataC =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+
+    GlobalDataA src0Global(input_a);
+    GlobalDataB src1Global(input_b);
+    GlobalDataC dstGlobal(output);
+
+    using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<float, M, K, TILE, TILE>;
+    using RightTile = TileRight<float, K, N, TILE, TILE>;
+    using AccTile = TileAcc<float, M, N, TILE, TILE>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    TLOAD(aMatTile, src0Global);
+    TLOAD(bMatTile, src1Global);
+
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+    TMOV(aTile, aMatTile);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(dstGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *cmm = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[5]);
+
+    __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr);
+    uint64_t tile_size = static_cast<uint64_t>(cfg[0]);
+    uint64_t tile_elems = tile_size * tile_size;
+    int num_tiles = static_cast<int>(cfg[3]);
+
+    __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset;
+    __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset;
+    __gm__ float *base_c = reinterpret_cast<__gm__ float *>(cmm->buffer.addr) + cmm->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *a_ptr = base_a + (tile_idx * tile_elems);
+        __gm__ float *b_ptr = base_b + (tile_idx * tile_elems);
+        __gm__ float *c_ptr = base_c + (tile_idx * tile_elems);
+
+        switch (tile_size) {
+        case 16:
+            mm_tile_impl<16>(a_ptr, b_ptr, c_ptr);
+            break;
+        case 32:
+            mm_tile_impl<32>(a_ptr, b_ptr, c_ptr);
+            break;
+        case 64:
+            mm_tile_impl<64>(a_ptr, b_ptr, c_ptr);
+            break;
+        case 128:
+            mm_tile_impl<128>(a_ptr, b_ptr, c_ptr);
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp
new file mode 100644
index 000000000..f27d6dd6c
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * MIX co-ownership test — AIV0 subtask: V0 = A + B (single tile, element-wise).
+ *
+ * AIV0 lane of a 1C+2V MIX task. Shared argument list (see kernel_mm.cpp);
+ * this lane writes the V0 output at args[3].
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int TILE>
+static __aicore__ void add_tile_impl(__gm__ float *a_ptr, __gm__ float *b_ptr, __gm__ float *dst_ptr) {
+    using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>;
+    using DynStridDim5 = Stride<1, 1, 1, TILE, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
+
+    TileData aTile(TILE, TILE);
+    TileData bTile(TILE, TILE);
+    TileData outTile(TILE, TILE);
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x10000);
+    TASSIGN(outTile, 0x20000);
+
+    GlobalData aGlobal(a_ptr);
+    GlobalData bGlobal(b_ptr);
+    GlobalData outGlobal(dst_ptr);
+
+    TLOAD(aTile, aGlobal);
+    TLOAD(bTile, bGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(outTile, aTile, bTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(outGlobal, outTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *a_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *b_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);  // V0
+    __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[5]);
+
+    __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr);
+    uint64_t tile_size = static_cast<uint64_t>(cfg[0]);
+    uint64_t tile_elems = tile_size * tile_size;
+    int num_tiles = static_cast<int>(cfg[3]);
+
+    __gm__ float *base_a = reinterpret_cast<__gm__ float *>(a_tensor->buffer.addr) + a_tensor->start_offset;
+    __gm__ float *base_b = reinterpret_cast<__gm__ float *>(b_tensor->buffer.addr) + b_tensor->start_offset;
+    __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *a = base_a + (tile_idx * tile_elems);
+        __gm__ float *b = base_b + (tile_idx * tile_elems);
+        __gm__ float *o = base_out + (tile_idx * tile_elems);
+        switch (tile_size) {
+        case 16:
+            add_tile_impl<16>(a, b, o);
+            break;
+        case 32:
+            add_tile_impl<32>(a, b, o);
+            break;
+        case 64:
+            add_tile_impl<64>(a, b, o);
+            break;
+        case 128:
+            add_tile_impl<128>(a, b, o);
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp
new file mode 100644
index 000000000..8c794f477
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * MIX co-ownership test — AIV1 subtask: V1 = A + B (single tile, element-wise).
+ *
+ * AIV1 lane of a 1C+2V MIX task. Shared argument list (see kernel_mm.cpp);
+ * this lane writes the V1 output at args[4].
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int TILE>
+static __aicore__ void add_tile_impl(__gm__ float *a_ptr, __gm__ float *b_ptr, __gm__ float *dst_ptr) {
+    using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>;
+    using DynStridDim5 = Stride<1, 1, 1, TILE, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
+
+    TileData aTile(TILE, TILE);
+    TileData bTile(TILE, TILE);
+    TileData outTile(TILE, TILE);
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x10000);
+    TASSIGN(outTile, 0x20000);
+
+    GlobalData aGlobal(a_ptr);
+    GlobalData bGlobal(b_ptr);
+    GlobalData outGlobal(dst_ptr);
+
+    TLOAD(aTile, aGlobal);
+    TLOAD(bTile, bGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(outTile, aTile, bTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(outGlobal, outTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *a_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *b_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]);  // V1
+    __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[5]);
+
+    __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr);
+    uint64_t tile_size = static_cast<uint64_t>(cfg[0]);
+    uint64_t tile_elems = tile_size * tile_size;
+    int num_tiles = static_cast<int>(cfg[3]);
+
+    __gm__ float *base_a = reinterpret_cast<__gm__ float *>(a_tensor->buffer.addr) + a_tensor->start_offset;
+    __gm__ float *base_b = reinterpret_cast<__gm__ float *>(b_tensor->buffer.addr) + b_tensor->start_offset;
+    __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *a = base_a + (tile_idx * tile_elems);
+        __gm__ float *b = base_b + (tile_idx * tile_elems);
+        __gm__ float *o = base_out + (tile_idx * tile_elems);
+        switch (tile_size) {
+        case 16:
+            add_tile_impl<16>(a, b, o);
+            break;
+        case 32:
+            add_tile_impl<32>(a, b, o);
+            break;
+        case 64:
+            add_tile_impl<64>(a, b, o);
+            break;
+        case 128:
+            add_tile_impl<128>(a, b, o);
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp
new file mode 100644
index 000000000..59bc2b6f2
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * MIX co-ownership test — consumer (AIV): Vfinal = V0 + V1.
+ *
+ * Reads the two heap outputs produced by the MIX task's AIV0/AIV1 lanes and
+ * writes the external Vfinal. Its fan-in is the single MIX task id, so it can
+ * only run once the joint completion flag is set (i.e. after BOTH co-owned
+ * AIV subtasks finished), validating the block.won remaining-counter logic.
+ *
+ *   args[0] = V0     (INPUT, heap)
+ *   args[1] = V1     (INPUT, heap)
+ *   args[2] = Vfinal (INOUT, external)
+ *   args[3] = config (INPUT) int64_t[4]: [tile_size, grid_k, num_groups, num_tiles]
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int TILE>
+static __aicore__ void sum_tile_impl(__gm__ float *v0_ptr, __gm__ float *v1_ptr, __gm__ float *dst_ptr) {
+    using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>;
+    using DynStridDim5 = Stride<1, 1, 1, TILE, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
+
+    TileData v0Tile(TILE, TILE);
+    TileData v1Tile(TILE, TILE);
+    TileData outTile(TILE, TILE);
+    TASSIGN(v0Tile, 0x0);
+    TASSIGN(v1Tile, 0x10000);
+    TASSIGN(outTile, 0x20000);
+
+    GlobalData v0Global(v0_ptr);
+    GlobalData v1Global(v1_ptr);
+    GlobalData outGlobal(dst_ptr);
+
+    TLOAD(v0Tile, v0Global);
+    TLOAD(v1Tile, v1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(outTile, v0Tile, v1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(outGlobal, outTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[3]);
+
+    __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr);
+    uint64_t tile_size = static_cast<uint64_t>(cfg[0]);
+    uint64_t tile_elems = tile_size * tile_size;
+    int num_tiles = static_cast<int>(cfg[3]);
+
+    __gm__ float *base_v0 = reinterpret_cast<__gm__ float *>(v0_tensor->buffer.addr) + v0_tensor->start_offset;
+    __gm__ float *base_v1 = reinterpret_cast<__gm__ float *>(v1_tensor->buffer.addr) + v1_tensor->start_offset;
+    __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *v0 = base_v0 + (tile_idx * tile_elems);
+        __gm__ float *v1 = base_v1 + (tile_idx * tile_elems);
+        __gm__ float *o = base_out + (tile_idx * tile_elems);
+        switch (tile_size) {
+        case 16:
+            sum_tile_impl<16>(v0, v1, o);
+            break;
+        case 32:
+            sum_tile_impl<32>(v0, v1, o);
+            break;
+        case 64:
+            sum_tile_impl<64>(v0, v1, o);
+            break;
+        case 128:
+            sum_tile_impl<128>(v0, v1, o);
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp
new file mode 100644
index 000000000..4d5b741e2
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * MIX co-ownership orchestration (fully_distributed_within_core).
+ *
+ * For each group g it submits a 1C+2V MIX task and a dependent consumer,
+ * exercising the block.won anchor->follower deposit/drain path (§3.1):
+ *
+ *   MIX[g] (1C+2V):  Cmm[g] = A[g] @ B[g]      (AIC lane, external out)
+ *                    V0     = A[g] + B[g]       (AIV0 lane, heap out)
+ *                    V1     = A[g] + B[g]       (AIV1 lane, heap out)
+ *   consumer[g] (1V): Vfinal[g] = V0 + V1       (depends on the single MIX
+ *                                                completion flag)
+ *
+ * Golden: Cmm[g] = A[g]@B[g]; Vfinal[g] = 2*(A[g]+B[g]).
+ *
+ * Arg layout (external): [A, B, Cmm, Vfinal, config]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_MM 0
+#define FUNC_ADD_V0 1
+#define FUNC_ADD_V1 2
+#define FUNC_SUM 3
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 5,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_A = orch_args.tensor(0).ref();
+    const Tensor &ext_B = orch_args.tensor(1).ref();
+    const Tensor &ext_Cmm = orch_args.tensor(2).ref();
+    const Tensor &ext_Vfinal = orch_args.tensor(3).ref();
+    const Tensor &ext_config = orch_args.tensor(4).ref();
+
+    int64_t *host_config = orch_args.tensor(4).ref().data_as<int64_t>();
+    int tile_size = static_cast<int>(host_config[0]);
+    int num_groups = static_cast<int>(host_config[2]);
+    int num_tiles = static_cast<int>(host_config[3]);
+    uint64_t tile_elems = static_cast<uint64_t>(tile_size) * tile_size;
+    uint64_t group_elems = static_cast<uint64_t>(num_tiles) * tile_elems;
+
+    LOG_INFO_V0(
+        "[mix_coown_orch] tile_size=%d num_groups=%d num_tiles=%d", tile_size, num_groups, num_tiles
+    );
+
+    uint32_t group_shapes[1] = {static_cast<uint32_t>(group_elems)};
+    TensorCreateInfo heap_ci(group_shapes, 1, DataType::FLOAT32);
+
+    for (int g = 0; g < num_groups; g++) {
+        PTO2_SCOPE_GUARD();
+
+        uint32_t off[1] = {static_cast<uint32_t>(static_cast<uint64_t>(g) * group_elems)};
+        Tensor A_view = ext_A.view(group_shapes, off);
+        Tensor B_view = ext_B.view(group_shapes, off);
+        Tensor Cmm_view = ext_Cmm.view(group_shapes, off);
+        Tensor Vfinal_view = ext_Vfinal.view(group_shapes, off);
+
+        // 1C + 2V MIX task. Shared arg list; each lane writes its own output.
+        L0TaskArgs mix;
+        mix.add_input(A_view);    // 0
+        mix.add_input(B_view);    // 1
+        mix.add_inout(Cmm_view);  // 2  (AIC writes Cmm)
+        mix.add_output(heap_ci);  // 3  V0 (AIV0 writes)
+        mix.add_output(heap_ci);  // 4  V1 (AIV1 writes)
+        mix.add_input(ext_config);  // 5
+        MixedKernels mk;
+        mk.aic_kernel_id = FUNC_MM;
+        mk.aiv0_kernel_id = FUNC_ADD_V0;
+        mk.aiv1_kernel_id = FUNC_ADD_V1;
+        TaskOutputTensors outs = rt_submit_task(mk, mix);
+
+        // Consumer (1V): Vfinal = V0 + V1 — depends on the single MIX flag.
+        L0TaskArgs cons;
+        cons.add_input(outs.get_ref(0));  // 0  V0
+        cons.add_input(outs.get_ref(1));  // 1  V1
+        cons.add_inout(Vfinal_view);      // 2  Vfinal
+        cons.add_input(ext_config);       // 3
+        rt_submit_aiv_task(FUNC_SUM, cons);
+    }
+
+    LOG_INFO_V0("[mix_coown_orch] submitted %d MIX + %d consumer tasks", num_groups, num_groups);
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py b/examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py
new file mode 100644
index 000000000..fadc4723c
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""MIX co-ownership test for fully_distributed_within_core.
+
+Each group submits a 1C+2V MIX task (Cmm=A@B on AIC, V0=A+B on AIV0, V1=A+B on
+AIV1) plus a consumer (Vfinal=V0+V1). This exercises the block.won anchor->
+follower deposit/drain path and the single joint completion flag.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestMixCoown(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/mix_coown_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.OUT, D.IN],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "MM",
+                "source": "kernels/aic/kernel_mm.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.IN],
+            },
+            {
+                "func_id": 1,
+                "name": "ADD_V0",
+                "source": "kernels/aiv/kernel_add_v0.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.IN],
+            },
+            {
+                "func_id": 2,
+                "name": "ADD_V1",
+                "source": "kernels/aiv/kernel_add_v1.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.IN],
+            },
+            {
+                "func_id": 3,
+                "name": "SUM",
+                "source": "kernels/aiv/kernel_sum.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.INOUT, D.IN],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Mix12",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {"num_groups": 12, "tile_size": 64},
+        },
+        {
+            "name": "Mix24",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"num_groups": 48, "tile_size": 64},
+        },
+    ]
+
+    def generate_args(self, params):
+        n = params["num_groups"]
+        t = params["tile_size"]
+        A = torch.randn(n, t, t, dtype=torch.float32) * 0.01
+        B = torch.randn(n, t, t, dtype=torch.float32) * 0.01
+        Cmm = torch.zeros(n, t, t, dtype=torch.float32)
+        Vfinal = torch.zeros(n, t, t, dtype=torch.float32)
+        # config: [tile_size, grid_k(unused), num_groups, num_tiles_per_group]
+        config = torch.tensor([t, 1, n, 1], dtype=torch.int64)
+        return TaskArgsBuilder(
+            Tensor("A", A.flatten()),
+            Tensor("B", B.flatten()),
+            Tensor("Cmm", Cmm.flatten()),
+            Tensor("Vfinal", Vfinal.flatten()),
+            Tensor("config", config),
+        )
+
+    def compute_golden(self, args, params):
+        n = params["num_groups"]
+        t = params["tile_size"]
+        A = args.A.reshape(n, t, t)
+        B = args.B.reshape(n, t, t)
+        Cmm = args.Cmm.reshape(n, t, t)
+        Vfinal = args.Vfinal.reshape(n, t, t)
+        for g in range(n):
+            Cmm[g] = torch.matmul(A[g], B[g])
+            Vfinal[g] = 2.0 * (A[g] + B[g])
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..0220a6bbb
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N)
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT).
+// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
+// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) {
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr);
+    __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+
+    // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    GlobalA pijGlobal(pij_addr + pij->start_offset);
+    GlobalB vjGlobal(vj_addr + vj->start_offset);
+    GlobalOut oiGlobal(oi_addr + oi->start_offset);
+
+    // L1 Mat tiles: standard ND pattern for both A and B
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    // L0 tiles
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Load pij and vj to L1 with separate events for pipeline overlap
+    TLOAD(aMatTile, pijGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
+    TLOAD(bMatTile, vjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
+
+    // Move A to L0A as soon as A load completes (B may still be loading)
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    // Single matmul: (M,K) x (K,N) -> (M,N)
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(oiGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    uint64_t q_tile_size = static_cast<uint64_t>(pij->shapes[0]);
+
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
+        pv_matmul_impl<16, 16, 16>(pij, vj, oi_new);
+    } else if (q_tile_size == 16) {
+        pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
+    } else {
+        pv_matmul_impl<64, 64, 128>(pij, vj, oi_new);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..efd423bd6
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N)
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// kj is stored as (N, K) = (block_size, head_dim) in row-major memory.
+// This is equivalent to (K, N) in column-major (DN) layout.
+// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) {
+    __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr);
+    __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr);
+    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
+
+    // qi (M, K) bf16 in ND (row-major) layout
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    // kj stored as (N, K) row-major = (K, N) column-major -> DN layout
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    GlobalA qiGlobal(qi_addr + qi->start_offset);
+    GlobalB kjGlobal(kj_addr + kj->start_offset);
+    GlobalOut sijGlobal(sij_addr + sij->start_offset);
+
+    // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor)
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    // L0 tiles
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Load A and B to L1 with separate events for pipeline overlap
+    TLOAD(aMatTile, qiGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
+    TLOAD(bMatTile, kjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
+
+    // Move A to L0A as soon as A load completes (B may still be loading)
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    // Matmul
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(sijGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
+
+    if (q_tile_size == 16 && qi->shapes[1] <= 16) {
+        qk_matmul_impl<16, 16, 16>(qi, kj, sij);
+    } else if (q_tile_size == 16) {
+        qk_matmul_impl<16, 128, 128>(qi, kj, sij);
+    } else {
+        qk_matmul_impl<64, 128, 64>(qi, kj, sij);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..ded4dcad8
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Online Softmax Update + Normalize Kernel (AIV)
+//
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+//
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_impl(
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
+) {
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
+    // Aligned rows for ColMajor DN tiles (32-byte alignment)
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    // --- GlobalTensor types ---
+
+    // Data (M, N) RowMajor
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- GlobalTensor instances ---
+
+    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
+    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
+
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
+
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
+    // --- Tile types ---
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+
+    // --- UB memory layout ---
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Data tiles
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
+
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+    if (is_first) {
+        // --- First block: copy inputs to accumulators ---
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to the same UB as DN tiles for storing as ND format
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(miGlobalND, mijND);    // mi = mij
+        TSTORE(liGlobalND, lijND);    // li = lij
+        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
+
+        if (is_last) {
+            // Single block: normalize dst = oi_new / lij
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TSTORE(dstGlobal, oiNewTile);
+        }
+    } else {
+        // --- Subsequent blocks: accumulate ---
+
+        // Load all inputs
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(oiTile, oiGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
+        pipe_barrier(PIPE_V);
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
+        pipe_barrier(PIPE_V);
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
+        pipe_barrier(PIPE_V);
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
+        pipe_barrier(PIPE_V);
+        TEXP(betaRow, betaRow);  // beta = exp(mij - mi_new)
+        pipe_barrier(PIPE_V);
+        TMUL(tmpRow, alphaRow, liRow);  // alpha * li
+        pipe_barrier(PIPE_V);
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
+        pipe_barrier(PIPE_V);
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
+        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
+        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
+        pipe_barrier(PIPE_V);
+        TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
+
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
+        if (is_last) {
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(dstGlobal, oiTile);
+        } else {
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(oiGlobal, oiTile);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+
+    if (q_tile_size == 16 && oi_new->shapes[1] <= 16) {
+        online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..8f0c41775
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Softmax Preparation Kernel (AIV) with partial block masking
+//
+// Operates on (M, N) tile where M=q_tile_size, N=block_size:
+//   Case1: sij is (16, 128)
+//   Case2: sij is (64, 64)
+//
+// For partial blocks (valid_len < N), positions [valid_len, N) in sij are
+// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0
+// so that invalid key positions contribute zero attention weight.
+//
+// Computes:
+//   sij_masked = TFILLPAD(sij, valid_len, pad=-inf)
+//   sij_scale = sij_masked * scale
+//   mij = row_max(sij_scale)        -> (M, 1)
+//   pij = exp(sij_scale - mij)      -> (M, N)
+//   lij = row_sum(pij)              -> (M, 1)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_impl(
+    __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij
+) {
+    uint64_t valid_len = static_cast<uint64_t>(sij->shapes[1]);
+    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    GlobalDataMxN sijGlobal(sij_addr + sij->start_offset);
+    GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset);
+    GlobalScalarDN mijGlobal(mij_addr + mij->start_offset);
+    GlobalScalarDN lijGlobal(lij_addr + lij->start_offset);
+
+    // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    TileVecMxN sijTile;
+    TileSijDyn sijDynTile(static_cast<size_t>(valid_len));
+    TileSijPad sijPadTile;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileScalarDN maxTile;
+    TileScalarDN sumTile;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    // All sij tiles share UB address 0x0 (in-place masking)
+    TASSIGN(sijTile, 0x0);
+    TASSIGN(sijDynTile, 0x0);
+    TASSIGN(sijPadTile, 0x0);
+    TASSIGN(pijTile, M * N * sizeof(float));
+    TASSIGN(tmpTile, 2 * M * N * sizeof(float));
+    TASSIGN(maxTile, 3 * M * N * sizeof(float));
+    TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
+    TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
+
+    // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks
+    // printf("sij addr incore %x\n", sij->buffer.addr);
+    TLOAD(sijTile, sijGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+    // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary,
+    // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N.
+    TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+    pipe_barrier(PIPE_V);
+
+    TMULS(sijTile, sijTile, scale_value);
+    pipe_barrier(PIPE_V);
+    TROWMAX(maxTile, sijTile, tmpTile);
+    pipe_barrier(PIPE_V);
+    TROWEXPANDSUB(pijTile, sijTile, maxTile);
+    pipe_barrier(PIPE_V);
+    TEXP(pijTile, pijTile);
+    // Truncate pij to bf16 first
+    pipe_barrier(PIPE_V);
+    TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);  // pij bf16 ready, can store early
+
+    // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel
+    pipe_barrier(PIPE_V);
+    TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+    pipe_barrier(PIPE_V);
+    TROWSUM(sumTile, pijTile, tmpTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);  // sum ready
+
+    // Store pij (overlaps with TCVT + TROWSUM above)
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(pijGlobal, pijBf16Tile);
+
+    // Store max and sum
+    TSTORE(mijGlobal, maxTile);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+    TSTORE(lijGlobal, sumTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t q_tile_size = static_cast<uint64_t>(sij->shapes[0]);
+
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
+        softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij);
+    } else if (q_tile_size == 16) {
+        softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij);
+    } else {
+        softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..2ed86cdf2
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Paged Attention Orchestration Function - 16x16 Version
+ *
+ * Simplified for 16x16 framework-generated matmul kernels.
+ * Each block processes a single 16x16 matmul operation.
+ *
+ * Memory Layout:
+ *   Query: (batch, 16, 16) - one 16x16 tile per batch
+ *   Key:   (total_blocks, 16, 16) - stored as K^T for direct matmul
+ *   Value: (total_blocks, 16, 16) - direct format
+ */
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
+
+inline double cycles_to_us(uint64_t cycles) {
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
+}
+
+inline uint64_t get_sys_cnt_aicpu() {
+#if defined(__aarch64__)
+    uint64_t ticks;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+    return ticks;
+#elif defined(__x86_64__)
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+#ifdef ENABLE_PROFILING
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#define PROF_INC(counter, n) (counter) += (n)
+#else
+#define CYCLE_COUNT_START() (void)0
+#define CYCLE_COUNT_LAP(acc) (void)0
+#define PROF_INC(counter, n) (void)0
+#endif
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+#ifdef ENABLE_PROFILING
+    uint64_t prof_param_extract = 0;
+    uint64_t prof_ext_tensor = 0;
+    uint64_t prof_scope = 0;
+    uint64_t prof_make_tensor = 0;
+    uint64_t prof_tensor_view = 0;
+    uint64_t prof_param_setup = 0;
+    uint64_t prof_submit_task = 0;
+    int prof_submit_count = 0;
+    int prof_make_count = 0;
+    int prof_view_count = 0;
+#endif
+
+    CYCLE_COUNT_START();
+
+    // Read dimensions from tensor metadata
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    uint64_t scale_value = orch_args.scalar(0);
+
+    uint64_t q_head_num = num_heads;
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
+    CYCLE_COUNT_LAP(prof_param_extract);
+
+    LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);
+
+    // Reshape tensors for kernel consumption (2D flattened)
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+    CYCLE_COUNT_LAP(prof_ext_tensor);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+    // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size
+    uint32_t tile2d_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t scalar_shapes[1] = {static_cast<uint32_t>(q_tile)};
+    uint32_t sij_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(block_size)};
+    TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32);
+    TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type);
+
+    PROF_INC(prof_make_count, 4);
+    CYCLE_COUNT_LAP(prof_make_tensor);
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            PTO2_SCOPE() {
+                CYCLE_COUNT_LAP(prof_scope);
+                uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
+
+                uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor qi = query.view(tile2d_shapes, qi_offsets);
+                uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor out_view = out.view(tile2d_shapes, out_view_offsets);
+                PROF_INC(prof_view_count, 2);
+                CYCLE_COUNT_LAP(prof_tensor_view);
+
+                CYCLE_COUNT_LAP(prof_param_setup);
+                TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
+                const Tensor &oi = alloc_outs.get_ref(0);
+                const Tensor &li_update = alloc_outs.get_ref(1);
+                const Tensor &mi_update = alloc_outs.get_ref(2);
+                PROF_INC(prof_submit_count, 1);
+                CYCLE_COUNT_LAP(prof_submit_task);
+
+                for (uint64_t bn = 0; bn < bn_this_batch; bn++) {
+                    PTO2_SCOPE_GUARD();
+
+                    uint32_t bt_idx[2] = {static_cast<uint32_t>(b_idx), static_cast<uint32_t>(bn)};
+                    uint64_t cur_block_idx = static_cast<uint64_t>(get_tensor_data<int32_t>(block_table, 2, bt_idx));
+                    uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size);
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    uint32_t kv_shapes[2] = {static_cast<uint32_t>(block_size), static_cast<uint32_t>(head_dim)};
+                    uint32_t kv_offsets[2] = {static_cast<uint32_t>(cur_block_idx * block_size), 0};
+                    Tensor kj = key_cache.view(kv_shapes, kv_offsets);
+                    Tensor vj = value_cache.view(kv_shapes, kv_offsets);
+                    PROF_INC(prof_view_count, 2);
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
+                    L0TaskArgs params_qk;
+                    params_qk.add_input(qi);
+                    params_qk.add_input(kj);
+                    params_qk.add_output(sij_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+                    const Tensor &sij = qk_outs.get_ref(0);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    uint32_t sij_valid_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(valid_len)};
+                    uint32_t sij_valid_offsets[2] = {0, 0};
+                    Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets);
+                    PROF_INC(prof_view_count, 1);
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
+                    L0TaskArgs params_sf;
+                    params_sf.add_input(sij_valid);
+                    params_sf.add_output(pij_f16_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_scalar(scale_value);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+                    const Tensor &pij_f16 = sf_outs.get_ref(0);
+                    const Tensor &mi = sf_outs.get_ref(1);
+                    const Tensor &li = sf_outs.get_ref(2);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    L0TaskArgs params_pv;
+                    params_pv.add_input(pij_f16);
+                    params_pv.add_input(vj);
+                    params_pv.add_output(tile2d_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+                    const Tensor &oi_tmp = pv_outs.get_ref(0);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    uint64_t is_first = (bn == 0) ? 1 : 0;
+                    uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    L0TaskArgs params_up;
+                    params_up.add_input(mi);
+                    params_up.add_input(li);
+                    params_up.add_input(oi_tmp);
+                    params_up.add_inout(mi_update);
+                    params_up.add_inout(li_update);
+                    params_up.add_inout(oi);
+                    params_up.add_inout(out_view);
+                    params_up.add_scalar(is_first);
+                    params_up.add_scalar(is_last);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+                }
+            }
+            CYCLE_COUNT_LAP(prof_scope);
+        }
+    }
+
+#ifdef ENABLE_PROFILING
+    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
+                     prof_submit_task + prof_scope;
+    LOG_INFO_V9(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
+        prof_make_count, prof_view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_INFO_V9(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
+            prof_param_extract * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
+            prof_make_tensor * 100.0 / total,
+            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
+            prof_tensor_view * 100.0 / total,
+            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
+        );
+        LOG_INFO_V9("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
+        LOG_INFO_V9(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
+            prof_submit_task * 100.0 / total,
+            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
+        );
+    }
+#endif
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py b/examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..8405a0d3d
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention: online softmax with AIC/AIV subgraph splitting (bfloat16)."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestPagedAttention(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 2,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "context_lens_list": [33, 17],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq4",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 4,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "context_lens_list": [33, 64, 128, 15],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..0220a6bbb
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N)
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT).
+// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
+// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) {
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr);
+    __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+
+    // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    GlobalA pijGlobal(pij_addr + pij->start_offset);
+    GlobalB vjGlobal(vj_addr + vj->start_offset);
+    GlobalOut oiGlobal(oi_addr + oi->start_offset);
+
+    // L1 Mat tiles: standard ND pattern for both A and B
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    // L0 tiles
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Load pij and vj to L1 with separate events for pipeline overlap
+    TLOAD(aMatTile, pijGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
+    TLOAD(bMatTile, vjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
+
+    // Move A to L0A as soon as A load completes (B may still be loading)
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    // Single matmul: (M,K) x (K,N) -> (M,N)
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(oiGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    uint64_t q_tile_size = static_cast<uint64_t>(pij->shapes[0]);
+
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
+        pv_matmul_impl<16, 16, 16>(pij, vj, oi_new);
+    } else if (q_tile_size == 16) {
+        pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
+    } else {
+        pv_matmul_impl<64, 64, 128>(pij, vj, oi_new);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..efd423bd6
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N)
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// kj is stored as (N, K) = (block_size, head_dim) in row-major memory.
+// This is equivalent to (K, N) in column-major (DN) layout.
+// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) {
+    __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr);
+    __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr);
+    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
+
+    // qi (M, K) bf16 in ND (row-major) layout
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    // kj stored as (N, K) row-major = (K, N) column-major -> DN layout
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    GlobalA qiGlobal(qi_addr + qi->start_offset);
+    GlobalB kjGlobal(kj_addr + kj->start_offset);
+    GlobalOut sijGlobal(sij_addr + sij->start_offset);
+
+    // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor)
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    // L0 tiles
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Load A and B to L1 with separate events for pipeline overlap
+    TLOAD(aMatTile, qiGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
+    TLOAD(bMatTile, kjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
+
+    // Move A to L0A as soon as A load completes (B may still be loading)
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    // Matmul
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(sijGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
+
+    if (q_tile_size == 16 && qi->shapes[1] <= 16) {
+        qk_matmul_impl<16, 16, 16>(qi, kj, sij);
+    } else if (q_tile_size == 16) {
+        qk_matmul_impl<16, 128, 128>(qi, kj, sij);
+    } else {
+        qk_matmul_impl<64, 128, 64>(qi, kj, sij);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..ded4dcad8
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Online Softmax Update + Normalize Kernel (AIV)
+//
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+//
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_impl(
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
+) {
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
+    // Aligned rows for ColMajor DN tiles (32-byte alignment)
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    // --- GlobalTensor types ---
+
+    // Data (M, N) RowMajor
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- GlobalTensor instances ---
+
+    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
+    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
+
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
+
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
+    // --- Tile types ---
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+
+    // --- UB memory layout ---
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Data tiles
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
+
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+    if (is_first) {
+        // --- First block: copy inputs to accumulators ---
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to the same UB as DN tiles for storing as ND format
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(miGlobalND, mijND);    // mi = mij
+        TSTORE(liGlobalND, lijND);    // li = lij
+        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
+
+        if (is_last) {
+            // Single block: normalize dst = oi_new / lij
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TSTORE(dstGlobal, oiNewTile);
+        }
+    } else {
+        // --- Subsequent blocks: accumulate ---
+
+        // Load all inputs
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(oiTile, oiGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
+        pipe_barrier(PIPE_V);
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
+        pipe_barrier(PIPE_V);
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
+        pipe_barrier(PIPE_V);
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
+        pipe_barrier(PIPE_V);
+        TEXP(betaRow, betaRow);  // beta = exp(mij - mi_new)
+        pipe_barrier(PIPE_V);
+        TMUL(tmpRow, alphaRow, liRow);  // alpha * li
+        pipe_barrier(PIPE_V);
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
+        pipe_barrier(PIPE_V);
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
+        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
+        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
+        pipe_barrier(PIPE_V);
+        TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
+
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
+        if (is_last) {
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(dstGlobal, oiTile);
+        } else {
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(oiGlobal, oiTile);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+
+    if (q_tile_size == 16 && oi_new->shapes[1] <= 16) {
+        online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..8f0c41775
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Softmax Preparation Kernel (AIV) with partial block masking
+//
+// Operates on (M, N) tile where M=q_tile_size, N=block_size:
+//   Case1: sij is (16, 128)
+//   Case2: sij is (64, 64)
+//
+// For partial blocks (valid_len < N), positions [valid_len, N) in sij are
+// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0
+// so that invalid key positions contribute zero attention weight.
+//
+// Computes:
+//   sij_masked = TFILLPAD(sij, valid_len, pad=-inf)
+//   sij_scale = sij_masked * scale
+//   mij = row_max(sij_scale)        -> (M, 1)
+//   pij = exp(sij_scale - mij)      -> (M, N)
+//   lij = row_sum(pij)              -> (M, 1)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_impl(
+    __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij
+) {
+    uint64_t valid_len = static_cast<uint64_t>(sij->shapes[1]);
+    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    GlobalDataMxN sijGlobal(sij_addr + sij->start_offset);
+    GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset);
+    GlobalScalarDN mijGlobal(mij_addr + mij->start_offset);
+    GlobalScalarDN lijGlobal(lij_addr + lij->start_offset);
+
+    // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    TileVecMxN sijTile;
+    TileSijDyn sijDynTile(static_cast<size_t>(valid_len));
+    TileSijPad sijPadTile;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileScalarDN maxTile;
+    TileScalarDN sumTile;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    // All sij tiles share UB address 0x0 (in-place masking)
+    TASSIGN(sijTile, 0x0);
+    TASSIGN(sijDynTile, 0x0);
+    TASSIGN(sijPadTile, 0x0);
+    TASSIGN(pijTile, M * N * sizeof(float));
+    TASSIGN(tmpTile, 2 * M * N * sizeof(float));
+    TASSIGN(maxTile, 3 * M * N * sizeof(float));
+    TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
+    TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
+
+    // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks
+    // printf("sij addr incore %x\n", sij->buffer.addr);
+    TLOAD(sijTile, sijGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+    // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary,
+    // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N.
+    TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+    pipe_barrier(PIPE_V);
+
+    TMULS(sijTile, sijTile, scale_value);
+    pipe_barrier(PIPE_V);
+    TROWMAX(maxTile, sijTile, tmpTile);
+    pipe_barrier(PIPE_V);
+    TROWEXPANDSUB(pijTile, sijTile, maxTile);
+    pipe_barrier(PIPE_V);
+    TEXP(pijTile, pijTile);
+    // Truncate pij to bf16 first
+    pipe_barrier(PIPE_V);
+    TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);  // pij bf16 ready, can store early
+
+    // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel
+    pipe_barrier(PIPE_V);
+    TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+    pipe_barrier(PIPE_V);
+    TROWSUM(sumTile, pijTile, tmpTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);  // sum ready
+
+    // Store pij (overlaps with TCVT + TROWSUM above)
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(pijGlobal, pijBf16Tile);
+
+    // Store max and sum
+    TSTORE(mijGlobal, maxTile);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+    TSTORE(lijGlobal, sumTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t q_tile_size = static_cast<uint64_t>(sij->shapes[0]);
+
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
+        softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij);
+    } else if (q_tile_size == 16) {
+        softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij);
+    } else {
+        softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..4ddab0a70
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Paged Attention Orchestration Function - manual-scope variant
+ *
+ * Matches the small-case paged_attention orchestration shape while replacing
+ * the automatic same-scope dependency wiring with explicit task-to-task deps
+ * inside PTO2_SCOPE(PTO2ScopeMode::MANUAL).
+ */
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
+
+inline double cycles_to_us(uint64_t cycles) {
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
+}
+
+inline uint64_t get_sys_cnt_aicpu() {
+#if defined(__aarch64__)
+    uint64_t ticks;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+    return ticks;
+#elif defined(__x86_64__)
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+#ifdef ENABLE_PROFILING
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#define PROF_INC(counter, n) (counter) += (n)
+#else
+#define CYCLE_COUNT_START() (void)0
+#define CYCLE_COUNT_LAP(acc) (void)0
+#define PROF_INC(counter, n) (void)0
+#endif
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+#ifdef ENABLE_PROFILING
+    uint64_t prof_param_extract = 0;
+    uint64_t prof_ext_tensor = 0;
+    uint64_t prof_scope = 0;
+    uint64_t prof_make_tensor = 0;
+    uint64_t prof_tensor_view = 0;
+    uint64_t prof_param_setup = 0;
+    uint64_t prof_submit_task = 0;
+    int prof_submit_count = 0;
+    int prof_make_count = 0;
+    int prof_view_count = 0;
+#endif
+
+    CYCLE_COUNT_START();
+
+    // Read dimensions from tensor metadata
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    uint64_t scale_value = orch_args.scalar(0);
+
+    uint64_t q_head_num = num_heads;
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
+    CYCLE_COUNT_LAP(prof_param_extract);
+
+    LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);
+
+    // Reshape tensors for kernel consumption (2D flattened)
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+    CYCLE_COUNT_LAP(prof_ext_tensor);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+    // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size
+    uint32_t tile2d_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t scalar_shapes[1] = {static_cast<uint32_t>(q_tile)};
+    uint32_t sij_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(block_size)};
+    TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32);
+    TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type);
+
+    PROF_INC(prof_make_count, 4);
+    CYCLE_COUNT_LAP(prof_make_tensor);
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            PTO2_SCOPE(PTO2ScopeMode::MANUAL) {
+                CYCLE_COUNT_LAP(prof_scope);
+                uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
+
+                uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor qi = query.view(tile2d_shapes, qi_offsets);
+                uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor out_view = out.view(tile2d_shapes, out_view_offsets);
+                PROF_INC(prof_view_count, 2);
+                CYCLE_COUNT_LAP(prof_tensor_view);
+
+                CYCLE_COUNT_LAP(prof_param_setup);
+                TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
+                const Tensor &oi = alloc_outs.get_ref(0);
+                const Tensor &li_update = alloc_outs.get_ref(1);
+                const Tensor &mi_update = alloc_outs.get_ref(2);
+                PTO2TaskId alloc_task = alloc_outs.task_id();
+                PTO2TaskId prev_update_task = PTO2TaskId::invalid();
+                PROF_INC(prof_submit_count, 1);
+                CYCLE_COUNT_LAP(prof_submit_task);
+
+                for (uint64_t bn = 0; bn < bn_this_batch; bn++) {
+                    uint32_t bt_idx[2] = {static_cast<uint32_t>(b_idx), static_cast<uint32_t>(bn)};
+                    uint64_t cur_block_idx = static_cast<uint64_t>(get_tensor_data<int32_t>(block_table, 2, bt_idx));
+                    uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size);
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    uint32_t kv_shapes[2] = {static_cast<uint32_t>(block_size), static_cast<uint32_t>(head_dim)};
+                    uint32_t kv_offsets[2] = {static_cast<uint32_t>(cur_block_idx * block_size), 0};
+                    Tensor kj = key_cache.view(kv_shapes, kv_offsets);
+                    Tensor vj = value_cache.view(kv_shapes, kv_offsets);
+                    PROF_INC(prof_view_count, 2);
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
+                    L0TaskArgs params_qk;
+                    params_qk.add_input(qi);
+                    params_qk.add_input(kj);
+                    params_qk.add_output(sij_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+                    const Tensor &sij = qk_outs.get_ref(0);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    uint32_t sij_valid_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(valid_len)};
+                    uint32_t sij_valid_offsets[2] = {0, 0};
+                    Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets);
+                    PROF_INC(prof_view_count, 1);
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
+                    // --- Primitive dep API (Arg + set_dependencies) ---
+                    // Caller owns the deps buffer; Arg stores (ptr, count).
+                    // Suited for codegen and for cases with a fixed dep set.
+                    L0TaskArgs params_sf;
+                    params_sf.add_input(sij_valid);
+                    params_sf.add_output(pij_f16_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_output(scalar_ci);
+                    PTO2TaskId sf_deps[] = {qk_outs.task_id()};
+                    params_sf.set_dependencies(sf_deps, 1);
+                    params_sf.add_scalar(scale_value);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+                    const Tensor &pij_f16 = sf_outs.get_ref(0);
+                    const Tensor &mi = sf_outs.get_ref(1);
+                    const Tensor &li = sf_outs.get_ref(2);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    L0TaskArgs params_pv;
+                    params_pv.add_input(pij_f16);
+                    params_pv.add_input(vj);
+                    params_pv.add_output(tile2d_ci);
+                    PTO2TaskId pv_deps[] = {sf_outs.task_id()};
+                    params_pv.set_dependencies(pv_deps, 1);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+                    const Tensor &oi_tmp = pv_outs.get_ref(0);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    uint64_t is_first = (bn == 0) ? 1 : 0;
+                    uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    // --- Convenience dep API (L0TaskArgsWithDeps + add_dep) ---
+                    // Wrapper owns a stack-sized deps buffer and accepts
+                    // incremental add_dep() calls; the submit overload binds
+                    // them to the underlying Arg via set_dependencies(...).
+                    // Suited for hand-written orch where the dep set is
+                    // assembled conditionally across branches.
+                    L0TaskArgsWithDeps<> params_up;
+                    params_up.add_input(mi);
+                    params_up.add_input(li);
+                    params_up.add_input(oi_tmp);
+                    params_up.add_inout(mi_update);
+                    params_up.add_inout(li_update);
+                    params_up.add_inout(oi);
+                    params_up.add_inout(out_view);
+                    // UP reads SF's mi/li, but SF -> PV -> UP already orders it; only the PV edge is explicit.
+                    params_up.add_dep(pv_outs.task_id());
+                    if (prev_update_task.is_valid()) {
+                        params_up.add_dep(prev_update_task);
+                    }
+                    // alloc completes inline; this dep only keeps the scratch buffers alive until the last consumer.
+                    if (is_last) {
+                        params_up.add_dep(alloc_task);
+                    }
+                    params_up.add_scalar(is_first);
+                    params_up.add_scalar(is_last);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors up_outs = rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                    prev_update_task = up_outs.task_id();
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+                }
+            }
+            CYCLE_COUNT_LAP(prof_scope);
+        }
+    }
+
+#ifdef ENABLE_PROFILING
+    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
+                     prof_submit_task + prof_scope;
+    LOG_INFO_V9(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
+        prof_make_count, prof_view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_INFO_V9(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
+            prof_param_extract * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
+            prof_make_tensor * 100.0 / total,
+            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
+            prof_tensor_view * 100.0 / total,
+            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
+        );
+        LOG_INFO_V9("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
+        LOG_INFO_V9(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
+            prof_submit_task * 100.0 / total,
+            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
+        );
+    }
+#endif
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py
new file mode 100644
index 000000000..971c714b6
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention manual-scope wrapper for A2A3 tensormap_and_ringbuffer."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestPagedAttentionManualScope(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            # Long-context cases submit >16384 in-flight tasks into a single
+            # MANUAL scope; the default per-ring task window (16384) can fill
+            # before the oldest task retires and wedge the orchestrator
+            # (FLOW_CONTROL_DEADLOCK / code 3). Double the window for headroom.
+            "config": {"aicpu_thread_num": 4, "block_dim": 24, "runtime_env": {"ring_task_window": 32768}},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 2,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "context_lens_list": [33, 17],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq4",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 4,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "context_lens_list": [33, 64, 128, 15],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py b/examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py
new file mode 100644
index 000000000..88f3de4d3
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention with small ring buffer sizes — stress test for ring rotation/reclamation.
+
+Drives per-case ring sizing through ``config.runtime_env`` (ring_task_window /
+ring_heap / ring_dep_pool) rather than the process-global PTO2_RING_* env, plus
+INOUT tensors, bfloat16, and AIC+AIV mixed execution.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden  # noqa: PLC0415
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs  # noqa: PLC0415
+
+PA_KERNELS = "../../../../tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels"
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestPagedAttentionRingbuffer(SceneTestCase):
+    """Paged attention with small ring buffer sizes for stress testing."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{PA_KERNELS}/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{PA_KERNELS}/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{PA_KERNELS}/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{PA_KERNELS}/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": f"{PA_KERNELS}/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "ringbuffer_stress",
+            "platforms": ["a2a3"],
+            # ring_heap is bytes per ring. Non power-of-2 sizes are accepted,
+            # but 4 MiB keeps the small-ring stress intent compact.
+            "config": {
+                "aicpu_thread_num": 4,
+                "block_dim": 24,
+                "runtime_env": {
+                    "ring_task_window": 64,
+                    "ring_heap": 4 * 1024 * 1024,
+                    "ring_dep_pool": 256,
+                },
+            },
+            "params": {
+                "batch": 32,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 4096,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..ec55f0377
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks
+//
+// Processes n_blocks blocks using SplitK accumulation pattern:
+//   Block 0: TMATMUL(C, A, B)       — initialize accumulator
+//   Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C
+//
+// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K)
+// Per-block vj addresses: value_cache base + block_indices lookup
+// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks
+//
+// Optimizations:
+//   - Double-buffered L1 tiles (ping/pong for A and B via MTE2)
+//   - Double-buffered L0 tiles (ping/pong for L0A and L0B via MTE1)
+//   - TLOAD(next) overlaps with TMATMUL(current) via MTE2/M-pipe parallelism
+//   - Canonical 3-stage pipeline: TLOAD(MTE2) → TMOV(MTE1) → TMATMUL(M)
+//   - Reverse-dependency events ensure buffer safety across iterations
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// pij is bfloat16 (from softmax_prepare TCVT).
+// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_n_impl(
+    __gm__ bfloat16_t *pij_base, __gm__ bfloat16_t *val_base, __gm__ float *oi_base, uint64_t n_blocks,
+    __gm__ int32_t *bt, uint64_t bt_offset
+) {
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // L1 memory layout: double-buffered A and B tiles (tightly packed)
+    constexpr int kATileBytes = M * K * static_cast<int>(sizeof(bfloat16_t));
+    constexpr int kBTileBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+
+    TileMatA aMatTile[2];
+    TileMatB bMatTile[2];
+    TASSIGN(aMatTile[0], 0x0);
+    TASSIGN(aMatTile[1], kATileBytes);
+    TASSIGN(bMatTile[0], 2 * kATileBytes);
+    TASSIGN(bMatTile[1], 2 * kATileBytes + kBTileBytes);
+
+    // L0 memory layout: double-buffered L0A and L0B, single accumulator L0C
+    LeftTile aTile[2];
+    RightTile bTile[2];
+    AccTile cTile;
+    TASSIGN(aTile[0], 0x0);
+    TASSIGN(aTile[1], kATileBytes);
+    TASSIGN(bTile[0], 0x0);
+    TASSIGN(bTile[1], kBTileBytes);
+    TASSIGN(cTile, 0x0);
+
+    GlobalOut oiGlobal(oi_base);
+
+    // Seed reverse-dependency flags: all ping/pong buffers initially free
+    //   PIPE_MTE1 → PIPE_MTE2: L1 buffer [0/1] safe for TLOAD to overwrite
+    //   PIPE_M    → PIPE_MTE1: L0 buffer [0/1] safe for TMOV to overwrite
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        int cur = static_cast<int>(i % 2);
+        GlobalA pijGlobal(pij_base + i * M * K);
+        GlobalB vjGlobal(val_base + bt[bt_offset + i] * K * N);
+
+        // Stage 1: TLOAD (MTE2: GM → L1[cur])
+        // Wait for MTE1 to release L1[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));
+        TLOAD(aMatTile[cur], pijGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: A in L1 ready
+        TLOAD(bMatTile[cur], vjGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: B in L1 ready
+
+        // Stage 2: TMOV (MTE1: L1[cur] → L0[cur])
+        // Wait for M-pipe to release L0[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: wait A loaded
+        TMOV(aTile[cur], aMatTile[cur]);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: wait B loaded
+        TMOV(bTile[cur], bMatTile[cur]);
+        set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));  // reverse: release L1[cur]
+
+        // Stage 3: TMATMUL (M-pipe: L0A[cur] × L0B[cur] → L0C)
+        set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));  // forward: L0[cur] ready
+        wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));
+        if (i == 0) {
+            TMATMUL(cTile, aTile[cur], bTile[cur]);
+        } else {
+            TMATMUL_ACC(cTile, cTile, aTile[cur], bTile[cur]);
+        }
+        set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));  // reverse: release L0[cur]
+    }
+
+    // Drain outstanding reverse-dependency flags
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(oiGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr);
+    __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(pij_buf->shapes[0]);
+
+    if (q_tile_size == 16) {
+        pv_matmul_n_impl<16, 128, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset);
+    } else {
+        pv_matmul_n_impl<64, 64, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset);
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..20ec20e73
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block
+//
+// Processes n_blocks blocks in a single kernel invocation.
+// Per-block kj addresses computed from key_cache base + block_indices lookup.
+// qi is shared across all blocks (same query head against different key blocks).
+//
+// Output layout: n_blocks contiguous (M, N) tiles stacked vertically.
+// Block i occupies sij[i*M : (i+1)*M, 0:N].
+//
+// Optimizations:
+//   - qi TLOAD hoisted before the loop (constant across all iterations)
+//   - Double-buffered L1 B tiles: prefetch next kj during current TMATMUL+TSTORE
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// Template: M=q_tile, K=head_dim, N=block_size
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_n_impl(
+    __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ float *sij_base, uint64_t n_blocks,
+    __gm__ int32_t *bt, uint64_t bt_offset
+) {
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // Double-buffered L1 B tiles for kj prefetching
+    constexpr int kBBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+    TileMatA aMatTile;
+    TileMatB bMatTile_A;
+    TileMatB bMatTile_B;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile_A, 0x20000);
+    TASSIGN(bMatTile_B, 0x20000 + kBBytes);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Hoist qi TLOAD before the loop (qi is constant across all blocks)
+    GlobalA qiGlobal(qi_base);
+    TLOAD(aMatTile, qiGlobal);
+
+    // Pre-load first kj into buffer A
+    GlobalB kjGlobal_0(key_base + bt[bt_offset + 0] * N * K);
+    TLOAD(bMatTile_A, kjGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalOut sijGlobal(sij_base + i * M * N);
+
+        // Wait for current kj TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        // TMOV qi L1→L0A and kj L1→L0B from current buffer
+        TMOV(aTile, aMatTile);
+        if (i % 2 == 0) {
+            TMOV(bTile, bMatTile_A);
+        } else {
+            TMOV(bTile, bMatTile_B);
+        }
+
+        // Prefetch next kj into alternate L1 buffer (overlaps with MTE1→M→FIX)
+        if (i + 1 < n_blocks) {
+            GlobalB kjGlobal_next(key_base + bt[bt_offset + i + 1] * N * K);
+            if (i % 2 == 0) {
+                TLOAD(bMatTile_B, kjGlobal_next);
+            } else {
+                TLOAD(bMatTile_A, kjGlobal_next);
+            }
+        }
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(sijGlobal, cTile);
+
+        if (i + 1 < n_blocks) {
+            // Drain all pipes before next iteration:
+            //   - FIX/MTE3: ensures TSTORE data path (L0C→UB→GM) fully completes
+            //   - MTE2: prefetch TLOAD likely already done (ran during TMATMUL+TSTORE)
+            // The prefetch TLOAD overlaps with compute, so barrier cost is minimal.
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset;
+    __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr);
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
+
+    if (q_tile_size == 16) {
+        qk_matmul_n_impl<16, 128, 128>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset);
+    } else {
+        qk_matmul_n_impl<64, 128, 64>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset);
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..df4b5a726
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Online Softmax Update + Normalize Kernel (AIV)
+//
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+//
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_impl(
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
+) {
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
+    // Aligned rows for ColMajor DN tiles (32-byte alignment)
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    // --- GlobalTensor types ---
+
+    // Data (M, N) RowMajor
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- GlobalTensor instances ---
+
+    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
+    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
+
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
+
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
+    // --- Tile types ---
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+
+    // --- UB memory layout ---
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Data tiles
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
+
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+    if (is_first) {
+        // --- First block: copy inputs to accumulators ---
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to same UB as DN tiles for ND-format store
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(miGlobalND, mijND);    // mi = mij
+        TSTORE(liGlobalND, lijND);    // li = lij
+        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
+
+        if (is_last) {
+            // Single block: normalize dst = oi_new / lij
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TSTORE(dstGlobal, oiNewTile);
+        }
+    } else {
+        // --- Subsequent blocks: accumulate ---
+
+        // Load all inputs as DN (ColMajor)
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(oiTile, oiGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
+        pipe_barrier(PIPE_V);
+        // alphaRow and betaRow write to independent UB addresses; both only read miNewRow
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
+        pipe_barrier(PIPE_V);
+        // TEXP on independent UB addresses
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
+        TEXP(betaRow, betaRow);    // beta = exp(mij - mi_new)
+        pipe_barrier(PIPE_V);
+        // tmpRow and liNewRow write to independent UB addresses
+        TMUL(tmpRow, alphaRow, liRow);    // alpha * li
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
+        pipe_barrier(PIPE_V);
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        pipe_barrier(PIPE_V);
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
+        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
+        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
+        pipe_barrier(PIPE_V);
+        TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
+
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
+        if (is_last) {
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(dstGlobal, oiTile);
+        } else {
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(oiGlobal, oiTile);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+    // args[10] = head_dim (128)
+
+    if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..aa221fa5c
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Two-Pass Softmax Kernel (AIV) for n_blocks tiles
+//
+// Input:  sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically
+// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block
+//         mij (M,) fp32 — global row max across all blocks
+//         lij (M,) fp32 — total row sum across all blocks
+//
+// Pass 1: Iterate over n_blocks tiles, mask last block,
+//         find global m = scale * max over all blocks of rowmax(S_i)
+//         Defers scale to after the loop (single M-element TMULS vs n_blocks M×N).
+//         Uses double-buffered sij tiles and TRESHAPE for DN↔Row conversion.
+// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16,
+//         accumulate l = sum over all blocks of rowsum(P_i)
+//         Uses double-buffered sij tiles to overlap TLOAD with computation.
+//
+// Two-pass ensures all P_i tiles share the same scale (global max),
+// enabling direct TMATMUL_ACC accumulation in the PV kernel.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: M=16, N=128 (q_tile=16, block_size=128)
+//   Case2: M=64, N=64  (q_tile=64, block_size=64)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_n_impl(
+    __gm__ float *sij_base, float scale_value, __gm__ bfloat16_t *pij_base, __gm__ float *mij_addr,
+    __gm__ float *lij_addr, uint64_t n_blocks, uint64_t valid_len_last
+) {
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+
+    // --- GlobalTensor types ---
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- Tile types ---
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // --- UB memory layout (double-buffered sij) ---
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Double-buffered sij tiles
+    TileVecMxN sijTile_A;
+    TileSijPad sijPadTile_A;
+    TileVecMxN sijTile_B;
+    TileSijPad sijPadTile_B;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileVecMxN sumAccTile;
+    TileScalarDN localMaxDN;
+    TileScalarDN globalMaxDN;
+    TileScalarDN sumDN;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    // TRESHAPE aliases (same UB address as their DN counterparts)
+    TileScalarRow localMaxRow;
+    TileScalarRow globalMaxRow;
+
+    // ND alias for storing globalMax to GM
+    TileScalarND globalMaxND;
+
+    TASSIGN(sijTile_A, 0x0);
+    TASSIGN(sijPadTile_A, 0x0);
+    TASSIGN(sijTile_B, kDataBytes);
+    TASSIGN(sijPadTile_B, kDataBytes);
+    TASSIGN(pijTile, 2 * kDataBytes);
+    TASSIGN(tmpTile, 3 * kDataBytes);
+    TASSIGN(sumAccTile, 4 * kDataBytes);
+    int scalarBase = 5 * kDataBytes;
+    TASSIGN(localMaxDN, scalarBase);
+    TASSIGN(localMaxRow, scalarBase);  // alias: same UB as localMaxDN
+    TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes);
+    TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes);  // alias: same UB as globalMaxDN
+    TASSIGN(globalMaxND, scalarBase + kScalarDNBytes);   // alias: same UB as globalMaxDN
+    TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes);
+    TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes);
+
+    // GM aliases (mij/lij output buffers)
+    GlobalScalarND mijGlobalND(mij_addr);
+    GlobalScalarDN lijGlobalDN(lij_addr);
+
+    // ======== Pass 1: Find global row max (unscaled) with double-buffered sij ========
+    // rowmax(S*scale) = scale * rowmax(S) since scale > 0, so defer scale to after loop.
+    GlobalDataMxN sijGlobal_p1_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_p1_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn sijDynTile(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(sijDynTile, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, sijDynTile);
+            } else {
+                TASSIGN(sijDynTile, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, sijDynTile);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute unscaled TROWMAX on current buffer
+        if (i % 2 == 0) {
+            TROWMAX(localMaxDN, sijTile_A, tmpTile);
+        } else {
+            TROWMAX(localMaxDN, sijTile_B, tmpTile);
+        }
+        pipe_barrier(PIPE_V);
+
+        // Prefetch next sij into alternate buffer (overlaps with V pipe scalar ops)
+        if (i + 1 < n_blocks) {
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise TMAX
+        TRESHAPE(localMaxRow, localMaxDN);
+        if (i == 0) {
+            TMAX(globalMaxRow, localMaxRow, localMaxRow);
+        } else {
+            TMAX(globalMaxRow, globalMaxRow, localMaxRow);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+    // Apply scale once to the global max vector (M elements, not n_blocks × M × N)
+    TMULS(globalMaxRow, globalMaxRow, scale_value);
+    pipe_barrier(PIPE_V);
+
+    // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB
+    TRESHAPE(globalMaxDN, globalMaxRow);
+
+    // Store final global max to mij for online_update to consume
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(mijGlobalND, globalMaxND);
+
+    // ======== Pass 2: Compute softmax with double-buffered sij ========
+    // globalMaxDN is already in UB from TRESHAPE — no reload needed.
+    // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD.
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+
+    // Pre-load first sij tile into buffer A
+    GlobalDataMxN sijGlobal_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N);
+
+        // Wait for current tile's TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TFILLPAD on current buffer if last block with partial valid length
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn curSijDyn(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(curSijDyn, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, curSijDyn);
+            } else {
+                TASSIGN(curSijDyn, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, curSijDyn);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute on current buffer (select A or B based on iteration parity)
+        if (i % 2 == 0) {
+            TMULS(sijTile_A, sijTile_A, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN);
+        } else {
+            TMULS(sijTile_B, sijTile_B, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN);
+        }
+        pipe_barrier(PIPE_V);
+        TEXP(pijTile, pijTile);
+        pipe_barrier(PIPE_V);
+        TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+        pipe_barrier(PIPE_V);
+        TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+
+        pipe_barrier(PIPE_V);
+        if (i == 0) {
+            TMULS(sumAccTile, pijTile, 1.0f);
+        } else {
+            TADD(sumAccTile, sumAccTile, pijTile);
+        }
+
+        // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile)
+        pipe_barrier(PIPE_V);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(pijGlobal, pijBf16Tile);
+
+        // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race)
+        if (i + 1 < n_blocks) {
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+    }
+
+    // Compute final row sum from accumulated pij values
+    pipe_barrier(PIPE_V);
+    TROWSUM(sumDN, sumAccTile, tmpTile);
+
+    // Store lij (total sum). mij already stored after Pass 1.
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(lijGlobalDN, sumDN);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t n_blocks = static_cast<uint64_t>(args[5]);
+    uint64_t valid_len_last = static_cast<uint64_t>(args[6]);
+
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset;
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset;
+
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_buf->shapes[0]);
+
+    if (q_tile_size == 16) {
+        softmax_prepare_n_impl<16, 128>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last);
+    } else {
+        softmax_prepare_n_impl<64, 64>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..0978073d9
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Paged Attention Orchestration Function V2 - N_UNROLL=8, 4 Tasks Per Group
+ *
+ * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks:
+ *   1. QK matmul:  qi @ K^T for n_blocks → sij_buf (q_tile, n_blocks * block_size)
+ *   2. Softmax:    two-pass over sij_buf → pij_buf, mi, li
+ *   3. PV matmul:  SplitK accumulated P @ V → oi_new (q_tile, head_dim)
+ *   4. Update:     online softmax accumulation with group-level mi, li, oi_new
+ *
+ * Memory Layout:
+ *   Query: (batch * num_heads, head_dim) bf16
+ *   Key:   (total_blocks, block_size, head_dim) bf16 (stored as K^T for QK)
+ *   Value: (total_blocks, block_size, head_dim) bf16
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+#define N_UNROLL 64
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
+
+inline double cycles_to_us(uint64_t cycles) {
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
+}
+
+inline uint64_t get_sys_cnt_aicpu() {
+    uint64_t ticks;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+    return ticks;
+}
+
+#ifdef ENABLE_PROFILING
+struct ProfCounters {
+    uint64_t param_extract = 0;
+    uint64_t ext_tensor = 0;
+    uint64_t make_tensor = 0;
+    uint64_t tensor_view = 0;
+    uint64_t param_setup = 0;
+    uint64_t submit_task = 0;
+    uint64_t scope_and_loop = 0;
+    int submit_count = 0;
+    int make_count = 0;
+    int view_count = 0;
+    // Running lap timestamps. File-global so the lap timeline stays continuous
+    // across the entry/process_qtile_scope() boundary — orchestration runs on a
+    // single thread, so a shared counter needs no synchronization.
+    uint64_t t0 = 0;
+    uint64_t t1 = 0;
+};
+static ProfCounters g_prof;
+#define CYCLE_COUNT_START() (g_prof.t0 = get_sys_cnt_aicpu())
+#define CYCLE_COUNT_LAP(acc)              \
+    do {                                  \
+        g_prof.t1 = get_sys_cnt_aicpu();  \
+        (acc) += (g_prof.t1 - g_prof.t0); \
+        g_prof.t0 = g_prof.t1;            \
+    } while (0)
+#else
+#define CYCLE_COUNT_START() (void)0
+#define CYCLE_COUNT_LAP(acc) (void)0
+#endif
+
+/**
+ * Submit the QK -> softmax -> PV -> update task chain for one (batch, q-tile) unit.
+ *
+ * All context is passed positionally through a transport `Arg` (built by the
+ * caller, never submitted — only its slots are read back here). Every tensor
+ * slot is a materialized Tensor; the Arg carries no TensorCreateInfo (the
+ * scope's create-infos are rebuilt locally from the q_tile/head_dim scalars):
+ *   tensors: 0 query, 1 key_cache, 2 value_cache, 3 block_table (inputs),
+ *            4 out (output buffer the update task writes — add_output(Tensor))
+ *   scalars: 0 b_idx, 1 q_idx, 2 q_head_num, 3 q_tile, 4 head_dim,
+ *            5 block_size, 6 block_num, 7 scale_value, 8 bn_this_batch,
+ *            9 cur_seq, 10 data_type
+ * Adding/removing a slot here must be mirrored at the caller's build site.
+ *
+ * Must run inside a PTO2_SCOPE: the alloc'd / submitted tensors it references
+ * do not outlive that scope.
+ */
+static void process_qtile_scope(const L0TaskArgs &ctx) {
+    const Tensor &query = ctx.tensor(0).ref();
+    const Tensor &key_cache = ctx.tensor(1).ref();
+    const Tensor &value_cache = ctx.tensor(2).ref();
+    const Tensor &block_table = ctx.tensor(3).ref();
+    const Tensor &out = ctx.tensor(4).ref();
+    uint64_t b_idx = ctx.scalar(0);
+    uint64_t q_idx = ctx.scalar(1);
+    uint64_t q_head_num = ctx.scalar(2);
+    uint64_t q_tile = ctx.scalar(3);
+    uint64_t head_dim = ctx.scalar(4);
+    uint64_t block_size = ctx.scalar(5);
+    uint64_t block_num = ctx.scalar(6);
+    uint64_t scale_value = ctx.scalar(7);
+    uint64_t bn_this_batch = ctx.scalar(8);
+    uint64_t cur_seq = ctx.scalar(9);
+    DataType data_type = static_cast<DataType>(ctx.scalar(10));
+
+    CYCLE_COUNT_START();
+
+    // Create infos for the per-scope accumulators — shapes depend only on
+    // q_tile/head_dim, so build once before the block loop. Kept out of the
+    // transport Arg, which carries only materialized Tensors.
+    uint32_t oi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t li_shapes[1] = {static_cast<uint32_t>(q_tile)};
+    TensorCreateInfo tile2d_ci(oi_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(li_shapes, 1, DataType::FLOAT32);
+#ifdef ENABLE_PROFILING
+    g_prof.make_count += 2;
+    CYCLE_COUNT_LAP(g_prof.make_tensor);
+#endif
+
+    uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
+
+    uint32_t qi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+    Tensor qi = query.view(qi_shapes, qi_offsets);
+    uint32_t out_view_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+    Tensor out_view = out.view(out_view_shapes, out_view_offsets, true);
+#ifdef ENABLE_PROFILING
+    g_prof.view_count += 2;
+    CYCLE_COUNT_LAP(g_prof.tensor_view);
+#endif
+    CYCLE_COUNT_LAP(g_prof.param_setup);
+    TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
+    const Tensor &oi = alloc_outs.get_ref(0);
+    const Tensor &li_update = alloc_outs.get_ref(1);
+    const Tensor &mi_update = alloc_outs.get_ref(2);
+#ifdef ENABLE_PROFILING
+    g_prof.submit_count++;
+    CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+    // Reusable Arg objects — reset() before each use avoids
+    // repeated stack-frame construction in the inner loop.
+    L0TaskArgs params_qk, params_sf, params_pv, params_up;
+
+    for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) {
+        uint64_t n_blocks = std::min(static_cast<uint64_t>(N_UNROLL), bn_this_batch - bn);
+
+        // Valid length for last block in this group
+        uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size;
+        uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start);
+        CYCLE_COUNT_LAP(g_prof.param_extract);
+
+        // === Task 1: Batched QK matmul ===
+        uint32_t sij_buf_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)};
+        TensorCreateInfo sij_buf_ci(sij_buf_shapes, 2, DataType::FLOAT32);
+#ifdef ENABLE_PROFILING
+        g_prof.make_count += 1;
+        CYCLE_COUNT_LAP(g_prof.make_tensor);
+#endif
+
+        params_qk.reset();
+        params_qk.add_input(qi, key_cache, block_table);
+        params_qk.add_output(sij_buf_ci);
+        params_qk.add_scalar(n_blocks, b_idx * block_num + bn);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+        const Tensor &sij_buf = qk_outs.get_ref(0);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+        // === Task 2: Two-pass softmax over all blocks in group ===
+        uint32_t pij_buf_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)};
+        TensorCreateInfo pij_buf_ci(pij_buf_shapes, 2, data_type);
+#ifdef ENABLE_PROFILING
+        g_prof.make_count += 1;
+        CYCLE_COUNT_LAP(g_prof.make_tensor);
+#endif
+
+        params_sf.reset();
+        params_sf.add_input(sij_buf);
+        params_sf.add_output(pij_buf_ci, scalar_ci, scalar_ci);
+        params_sf.add_scalar(scale_value, n_blocks, valid_len_last);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+        const Tensor &pij_buf = sf_outs.get_ref(0);
+        const Tensor &mi = sf_outs.get_ref(1);
+        const Tensor &li = sf_outs.get_ref(2);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+        // === Task 3: SplitK PV matmul (accumulated P @ V) ===
+        params_pv.reset();
+        params_pv.add_input(pij_buf, value_cache, block_table);
+        params_pv.add_output(tile2d_ci);
+        params_pv.add_scalar(n_blocks, b_idx * block_num + bn);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+        const Tensor &oi_new = pv_outs.get_ref(0);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+        // === Task 4: Online update (per-group) ===
+        uint64_t is_first = (bn == 0) ? 1 : 0;
+        uint64_t is_last = (bn + n_blocks >= bn_this_batch) ? 1 : 0;
+
+        params_up.reset();
+        params_up.add_input(mi, li, oi_new);
+        params_up.add_inout(mi_update, li_update, oi, out_view);
+        params_up.add_scalar(is_first, is_last);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+    }
+}
+
+extern "C" {
+/**
+ * Orchestration config — the executor reads these values to set up
+ * shared memory and runtime before calling aicpu_orchestration_entry.
+ */
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+#ifdef ENABLE_PROFILING
+    g_prof = ProfCounters{};  // reset per entry — single-threaded orchestration
+#endif
+
+    CYCLE_COUNT_START();
+
+    // Read dimensions from tensor metadata
+    // query: shape=[batch, num_heads, head_dim]
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim]
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+
+    // block_table: shape=[batch, max_num_blocks_per_req]
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    // scale from scalar arg
+    uint64_t scale_value = orch_args.scalar(0);
+    uint64_t q_head_num = num_heads;
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
+    CYCLE_COUNT_LAP(g_prof.param_extract);
+
+    // Reshape tensors for kernel consumption (2D flattened)
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+#ifdef ENABLE_PROFILING
+    CYCLE_COUNT_LAP(g_prof.ext_tensor);
+#endif
+
+    // Transport Arg reused across iterations — packs the scope's context for
+    // process_qtile_scope(); see that function for the positional slot layout.
+    // It carries only materialized Tensors (no TensorCreateInfo); the scope's
+    // create-infos are rebuilt inside the helper from the q_tile/head_dim scalars.
+    L0TaskArgs ctx;
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            CYCLE_COUNT_LAP(g_prof.scope_and_loop);
+
+            ctx.reset();
+            ctx.add_input(query, key_cache, value_cache, block_table);
+            ctx.add_output(out);
+            ctx.add_scalar(
+                b_idx, q_idx, q_head_num, q_tile, head_dim, block_size, block_num, scale_value, bn_this_batch, cur_seq,
+                static_cast<uint64_t>(data_type)
+            );
+
+            PTO2_SCOPE() { process_qtile_scope(ctx); }
+        }
+    }
+    CYCLE_COUNT_LAP(g_prof.scope_and_loop);
+
+#ifdef ENABLE_PROFILING
+    uint64_t total = g_prof.param_extract + g_prof.ext_tensor + g_prof.make_tensor + g_prof.tensor_view +
+                     g_prof.param_setup + g_prof.submit_task + g_prof.scope_and_loop;
+    LOG_INFO_V9(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", g_prof.submit_count,
+        g_prof.make_count, g_prof.view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_INFO_V9(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.param_extract),
+            g_prof.param_extract * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.ext_tensor), g_prof.ext_tensor * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", g_prof.make_count, cycles_to_us(g_prof.make_tensor),
+            g_prof.make_tensor * 100.0 / total,
+            g_prof.make_count > 0 ? cycles_to_us(g_prof.make_tensor) / g_prof.make_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", g_prof.view_count, cycles_to_us(g_prof.tensor_view),
+            g_prof.tensor_view * 100.0 / total,
+            g_prof.view_count > 0 ? cycles_to_us(g_prof.tensor_view) / g_prof.view_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.param_setup),
+            g_prof.param_setup * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", g_prof.submit_count, cycles_to_us(g_prof.submit_task),
+            g_prof.submit_task * 100.0 / total,
+            g_prof.submit_count > 0 ? cycles_to_us(g_prof.submit_task) / g_prof.submit_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  scope_and_loop   : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.scope_and_loop),
+            g_prof.scope_and_loop * 100.0 / total
+        );
+    }
+#endif
+
+#undef CYCLE_COUNT_START
+#undef CYCLE_COUNT_LAP
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py
new file mode 100644
index 000000000..c6070994d
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention unroll: production-scale with unrolled orchestration."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestPagedAttentionUnroll(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        # example_exec_time_ns is the per-func reference kernel duration in
+        # nanoseconds, measured on-hardware via tensormap_and_ringbuffer (QK 50us,
+        # SF 55us, PV 50us, UP 3us). Under --use-example-exec-time
+        # (fully_distributed_within_core sim only) each incore busy-waits this
+        # instead of running the real kernel, so a fast sim run reflects measured
+        # on-hardware kernel durations + orchestration overhead. Ignored (kernels
+        # run for real) when the flag is off.
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.IN, D.OUT],
+                "example_exec_time_ns": 50000,
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+                "example_exec_time_ns": 55000,
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.IN, D.OUT],
+                "example_exec_time_ns": 50000,
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+                "example_exec_time_ns": 3000,
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            # The QK/PV kernels hardcode head_dim=128 / block_size=128 (production
+            # Case1: <M=16, K=128, N=128>); feeding any other head_dim reads the
+            # query buffer out of bounds. Keep that shape, shrink batch/context so
+            # the sim run stays fast.
+            "name": "CaseSimSmall",
+            "platforms": ["a2a3sim"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 256,
+                "max_model_len": 512,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3sim"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3sim"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..8befa5c51
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks
+//
+// Processes n_blocks blocks using SplitK accumulation pattern:
+//   Block 0: TMATMUL(C, A, B)       — initialize accumulator
+//   Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C
+//
+// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K)
+// Per-block vj addresses: value_cache base + block_indices lookup
+// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks
+//
+// Optimizations:
+//   - Double-buffered L1 tiles (ping/pong for A and B via MTE2)
+//   - Double-buffered L0 tiles (ping/pong for L0A and L0B via MTE1)
+//   - TLOAD(next) overlaps with TMATMUL(current) via MTE2/M-pipe parallelism
+//   - Canonical 3-stage pipeline: TLOAD(MTE2) → TMOV(MTE1) → TMATMUL(M)
+//   - Reverse-dependency events ensure buffer safety across iterations
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// pij is bfloat16 (from softmax_prepare TCVT).
+// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_n_impl(
+    __gm__ bfloat16_t *pij_base, __gm__ bfloat16_t *val_base, __gm__ float *oi_base, uint64_t n_blocks,
+    __gm__ int32_t *bt, uint64_t bt_offset
+) {
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // L1 memory layout: double-buffered A and B tiles (tightly packed)
+    constexpr int kATileBytes = M * K * static_cast<int>(sizeof(bfloat16_t));
+    constexpr int kBTileBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+
+    TileMatA aMatTile[2];
+    TileMatB bMatTile[2];
+    TASSIGN(aMatTile[0], 0x0);
+    TASSIGN(aMatTile[1], kATileBytes);
+    TASSIGN(bMatTile[0], 2 * kATileBytes);
+    TASSIGN(bMatTile[1], 2 * kATileBytes + kBTileBytes);
+
+    // L0 memory layout: double-buffered L0A and L0B, single accumulator L0C
+    LeftTile aTile[2];
+    RightTile bTile[2];
+    AccTile cTile;
+    TASSIGN(aTile[0], 0x0);
+    TASSIGN(aTile[1], kATileBytes);
+    TASSIGN(bTile[0], 0x0);
+    TASSIGN(bTile[1], kBTileBytes);
+    TASSIGN(cTile, 0x0);
+
+    GlobalOut oiGlobal(oi_base);
+
+    // Seed reverse-dependency flags: all ping/pong buffers initially free
+    //   PIPE_MTE1 → PIPE_MTE2: L1 buffer [0/1] safe for TLOAD to overwrite
+    //   PIPE_M    → PIPE_MTE1: L0 buffer [0/1] safe for TMOV to overwrite
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        int cur = static_cast<int>(i % 2);
+        GlobalA pijGlobal(pij_base + i * M * K);
+        GlobalB vjGlobal(val_base + bt[bt_offset + i] * K * N);
+
+        // Stage 1: TLOAD (MTE2: GM → L1[cur])
+        // Wait for MTE1 to release L1[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));
+        TLOAD(aMatTile[cur], pijGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: A in L1 ready
+        TLOAD(bMatTile[cur], vjGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: B in L1 ready
+
+        // Stage 2: TMOV (MTE1: L1[cur] → L0[cur])
+        // Wait for M-pipe to release L0[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: wait A loaded
+        TMOV(aTile[cur], aMatTile[cur]);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: wait B loaded
+        TMOV(bTile[cur], bMatTile[cur]);
+        set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));  // reverse: release L1[cur]
+
+        // Stage 3: TMATMUL (M-pipe: L0A[cur] × L0B[cur] → L0C)
+        set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));  // forward: L0[cur] ready
+        wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));
+        if (i == 0) {
+            TMATMUL(cTile, aTile[cur], bTile[cur]);
+        } else {
+            TMATMUL_ACC(cTile, cTile, aTile[cur], bTile[cur]);
+        }
+        set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));  // reverse: release L0[cur]
+    }
+
+    // Drain outstanding reverse-dependency flags
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(oiGlobal, cTile);
+
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr);
+    __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(pij_buf->shapes[0]);
+
+    if (q_tile_size == 16) {
+        pv_matmul_n_impl<16, 128, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset);
+    } else {
+        pv_matmul_n_impl<64, 64, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset);
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..13ef8e06b
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block
+//
+// Processes n_blocks blocks in a single kernel invocation.
+// Per-block kj addresses computed from key_cache base + block_indices lookup.
+// qi is shared across all blocks (same query head against different key blocks).
+//
+// Output layout: n_blocks contiguous (M, N) tiles stacked vertically.
+// Block i occupies sij[i*M : (i+1)*M, 0:N].
+//
+// Optimizations:
+//   - qi TLOAD hoisted before the loop (constant across all iterations)
+//   - Double-buffered L1 B tiles: prefetch next kj during current TMATMUL+TSTORE
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// Template: M=q_tile, K=head_dim, N=block_size
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_n_impl(
+    __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ float *sij_base, uint64_t n_blocks,
+    __gm__ int32_t *bt, uint64_t bt_offset
+) {
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // Double-buffered L1 B tiles for kj prefetching
+    constexpr int kBBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+    TileMatA aMatTile;
+    TileMatB bMatTile_A;
+    TileMatB bMatTile_B;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile_A, 0x20000);
+    TASSIGN(bMatTile_B, 0x20000 + kBBytes);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Hoist qi TLOAD before the loop (qi is constant across all blocks)
+    GlobalA qiGlobal(qi_base);
+    TLOAD(aMatTile, qiGlobal);
+
+    // Pre-load first kj into buffer A
+    GlobalB kjGlobal_0(key_base + bt[bt_offset + 0] * N * K);
+    TLOAD(bMatTile_A, kjGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalOut sijGlobal(sij_base + i * M * N);
+
+        // Wait for current kj TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        // TMOV qi L1→L0A and kj L1→L0B from current buffer
+        TMOV(aTile, aMatTile);
+        if (i % 2 == 0) {
+            TMOV(bTile, bMatTile_A);
+        } else {
+            TMOV(bTile, bMatTile_B);
+        }
+
+        // Prefetch next kj into alternate L1 buffer (overlaps with MTE1→M→FIX)
+        if (i + 1 < n_blocks) {
+            GlobalB kjGlobal_next(key_base + bt[bt_offset + i + 1] * N * K);
+            if (i % 2 == 0) {
+                TLOAD(bMatTile_B, kjGlobal_next);
+            } else {
+                TLOAD(bMatTile_A, kjGlobal_next);
+            }
+        }
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(sijGlobal, cTile);
+
+        if (i + 1 < n_blocks) {
+            // Drain all pipes before next iteration:
+            //   - FIX/MTE3: ensures TSTORE data path (L0C→UB→GM) fully completes
+            //   - MTE2: prefetch TLOAD likely already done (ran during TMATMUL+TSTORE)
+            // The prefetch TLOAD overlaps with compute, so barrier cost is minimal.
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset;
+    __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr);
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
+
+    if (q_tile_size == 16) {
+        qk_matmul_n_impl<16, 128, 128>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset);
+    } else {
+        qk_matmul_n_impl<64, 128, 64>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset);
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..b5d71b544
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Online Softmax Update + Normalize Kernel (AIV)
+//
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+//
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_impl(
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
+) {
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
+    // Aligned rows for ColMajor DN tiles (32-byte alignment)
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    // --- GlobalTensor types ---
+
+    // Data (M, N) RowMajor
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- GlobalTensor instances ---
+
+    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
+    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
+
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
+
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
+    // --- Tile types ---
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+
+    // --- UB memory layout ---
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Data tiles
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
+
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+    if (is_first) {
+        // --- First block: copy inputs to accumulators ---
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to same UB as DN tiles for ND-format store
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(miGlobalND, mijND);    // mi = mij
+        TSTORE(liGlobalND, lijND);    // li = lij
+        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
+
+        if (is_last) {
+            // Single block: normalize dst = oi_new / lij
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TSTORE(dstGlobal, oiNewTile);
+        }
+    } else {
+        // --- Subsequent blocks: accumulate ---
+
+        // Load all inputs as DN (ColMajor)
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(oiTile, oiGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
+        pipe_barrier(PIPE_V);
+        // alphaRow and betaRow write to independent UB addresses; both only read miNewRow
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
+        pipe_barrier(PIPE_V);
+        // TEXP on independent UB addresses
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
+        TEXP(betaRow, betaRow);    // beta = exp(mij - mi_new)
+        pipe_barrier(PIPE_V);
+        // tmpRow and liNewRow write to independent UB addresses
+        TMUL(tmpRow, alphaRow, liRow);    // alpha * li
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
+        pipe_barrier(PIPE_V);
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        pipe_barrier(PIPE_V);
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
+        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
+        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
+        pipe_barrier(PIPE_V);
+        TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
+
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
+        if (is_last) {
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(dstGlobal, oiTile);
+        } else {
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(oiGlobal, oiTile);
+        }
+    }
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+    // args[10] = head_dim (128)
+
+    if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..c18957ee5
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Two-Pass Softmax Kernel (AIV) for n_blocks tiles
+//
+// Input:  sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically
+// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block
+//         mij (M,) fp32 — global row max across all blocks
+//         lij (M,) fp32 — total row sum across all blocks
+//
+// Pass 1: Iterate over n_blocks tiles, mask last block,
+//         find global m = scale * max over all blocks of rowmax(S_i)
+//         Defers scale to after the loop (single M-element TMULS vs n_blocks M×N).
+//         Uses double-buffered sij tiles and TRESHAPE for DN↔Row conversion.
+// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16,
+//         accumulate l = sum over all blocks of rowsum(P_i)
+//         Uses double-buffered sij tiles to overlap TLOAD with computation.
+//
+// Two-pass ensures all P_i tiles share the same scale (global max),
+// enabling direct TMATMUL_ACC accumulation in the PV kernel.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: M=16, N=128 (q_tile=16, block_size=128)
+//   Case2: M=64, N=64  (q_tile=64, block_size=64)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_n_impl(
+    __gm__ float *sij_base, float scale_value, __gm__ bfloat16_t *pij_base, __gm__ float *mij_addr,
+    __gm__ float *lij_addr, uint64_t n_blocks, uint64_t valid_len_last
+) {
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+
+    // --- GlobalTensor types ---
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- Tile types ---
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // --- UB memory layout (double-buffered sij) ---
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Double-buffered sij tiles
+    TileVecMxN sijTile_A;
+    TileSijPad sijPadTile_A;
+    TileVecMxN sijTile_B;
+    TileSijPad sijPadTile_B;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileVecMxN sumAccTile;
+    TileScalarDN localMaxDN;
+    TileScalarDN globalMaxDN;
+    TileScalarDN sumDN;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    // TRESHAPE aliases (same UB address as their DN counterparts)
+    TileScalarRow localMaxRow;
+    TileScalarRow globalMaxRow;
+
+    // ND alias for storing globalMax to GM
+    TileScalarND globalMaxND;
+
+    TASSIGN(sijTile_A, 0x0);
+    TASSIGN(sijPadTile_A, 0x0);
+    TASSIGN(sijTile_B, kDataBytes);
+    TASSIGN(sijPadTile_B, kDataBytes);
+    TASSIGN(pijTile, 2 * kDataBytes);
+    TASSIGN(tmpTile, 3 * kDataBytes);
+    TASSIGN(sumAccTile, 4 * kDataBytes);
+    int scalarBase = 5 * kDataBytes;
+    TASSIGN(localMaxDN, scalarBase);
+    TASSIGN(localMaxRow, scalarBase);  // alias: same UB as localMaxDN
+    TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes);
+    TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes);  // alias: same UB as globalMaxDN
+    TASSIGN(globalMaxND, scalarBase + kScalarDNBytes);   // alias: same UB as globalMaxDN
+    TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes);
+    TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes);
+
+    // GM aliases (mij/lij output buffers)
+    GlobalScalarND mijGlobalND(mij_addr);
+    GlobalScalarDN lijGlobalDN(lij_addr);
+
+    // ======== Pass 1: Find global row max (unscaled) with double-buffered sij ========
+    // rowmax(S*scale) = scale * rowmax(S) since scale > 0, so defer scale to after loop.
+    GlobalDataMxN sijGlobal_p1_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_p1_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn sijDynTile(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(sijDynTile, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, sijDynTile);
+            } else {
+                TASSIGN(sijDynTile, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, sijDynTile);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute unscaled TROWMAX on current buffer
+        if (i % 2 == 0) {
+            TROWMAX(localMaxDN, sijTile_A, tmpTile);
+        } else {
+            TROWMAX(localMaxDN, sijTile_B, tmpTile);
+        }
+        pipe_barrier(PIPE_V);
+
+        // Prefetch next sij into alternate buffer (overlaps with V pipe scalar ops)
+        if (i + 1 < n_blocks) {
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise TMAX
+        TRESHAPE(localMaxRow, localMaxDN);
+        if (i == 0) {
+            TMAX(globalMaxRow, localMaxRow, localMaxRow);
+        } else {
+            TMAX(globalMaxRow, globalMaxRow, localMaxRow);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+    // Apply scale once to the global max vector (M elements, not n_blocks × M × N)
+    TMULS(globalMaxRow, globalMaxRow, scale_value);
+    pipe_barrier(PIPE_V);
+
+    // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB
+    TRESHAPE(globalMaxDN, globalMaxRow);
+
+    // Store final global max to mij for online_update to consume
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(mijGlobalND, globalMaxND);
+
+    // ======== Pass 2: Compute softmax with double-buffered sij ========
+    // globalMaxDN is already in UB from TRESHAPE — no reload needed.
+    // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD.
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+
+    // Pre-load first sij tile into buffer A
+    GlobalDataMxN sijGlobal_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N);
+
+        // Wait for current tile's TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TFILLPAD on current buffer if last block with partial valid length
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn curSijDyn(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(curSijDyn, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, curSijDyn);
+            } else {
+                TASSIGN(curSijDyn, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, curSijDyn);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute on current buffer (select A or B based on iteration parity)
+        if (i % 2 == 0) {
+            TMULS(sijTile_A, sijTile_A, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN);
+        } else {
+            TMULS(sijTile_B, sijTile_B, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN);
+        }
+        pipe_barrier(PIPE_V);
+        TEXP(pijTile, pijTile);
+        pipe_barrier(PIPE_V);
+        TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+        pipe_barrier(PIPE_V);
+        TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+
+        pipe_barrier(PIPE_V);
+        if (i == 0) {
+            TMULS(sumAccTile, pijTile, 1.0f);
+        } else {
+            TADD(sumAccTile, sumAccTile, pijTile);
+        }
+
+        // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile)
+        pipe_barrier(PIPE_V);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(pijGlobal, pijBf16Tile);
+
+        // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race)
+        if (i + 1 < n_blocks) {
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+    }
+
+    // Compute final row sum from accumulated pij values
+    pipe_barrier(PIPE_V);
+    TROWSUM(sumDN, sumAccTile, tmpTile);
+
+    // Store lij (total sum). mij already stored after Pass 1.
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(lijGlobalDN, sumDN);
+
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t n_blocks = static_cast<uint64_t>(args[5]);
+    uint64_t valid_len_last = static_cast<uint64_t>(args[6]);
+
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset;
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset;
+
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_buf->shapes[0]);
+
+    if (q_tile_size == 16) {
+        softmax_prepare_n_impl<16, 128>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last);
+    } else {
+        softmax_prepare_n_impl<64, 64>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..82bc89f37
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Paged Attention Orchestration Function V2 - N_UNROLL=8, 4 Tasks Per Group
+ *
+ * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks:
+ *   1. QK matmul:  qi @ K^T for n_blocks → sij_buf (q_tile, n_blocks * block_size)
+ *   2. Softmax:    two-pass over sij_buf → pij_buf, mi, li
+ *   3. PV matmul:  SplitK accumulated P @ V → oi_new (q_tile, head_dim)
+ *   4. Update:     online softmax accumulation with group-level mi, li, oi_new
+ *
+ * Memory Layout:
+ *   Query: (batch * num_heads, head_dim) bf16
+ *   Key:   (total_blocks, block_size, head_dim) bf16 (stored as K^T for QK)
+ *   Value: (total_blocks, block_size, head_dim) bf16
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+#define N_UNROLL 64
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
+
+inline double cycles_to_us(uint64_t cycles) {
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
+}
+
+inline uint64_t get_sys_cnt_aicpu() {
+    uint64_t ticks;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+    return ticks;
+}
+
+#ifdef ENABLE_PROFILING
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#else
+#define CYCLE_COUNT_START() (void)0
+#define CYCLE_COUNT_LAP(acc) (void)0
+#endif
+
+extern "C" {
+/**
+ * Orchestration config — the executor reads these values to set up
+ * shared memory and runtime before calling aicpu_orchestration_entry.
+ */
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+#ifdef ENABLE_PROFILING
+    uint64_t prof_param_extract = 0;
+    uint64_t prof_ext_tensor = 0;
+    uint64_t prof_make_tensor = 0;
+    uint64_t prof_tensor_view = 0;
+    uint64_t prof_param_setup = 0;
+    uint64_t prof_submit_task = 0;
+    uint64_t prof_scope_and_loop = 0;
+    int prof_submit_count = 0;
+    int prof_make_count = 0;
+    int prof_view_count = 0;
+#endif
+
+    CYCLE_COUNT_START();
+
+    // Read dimensions from tensor metadata
+    // query: shape=[batch, num_heads, head_dim]
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim]
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+
+    // block_table: shape=[batch, max_num_blocks_per_req]
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    // scale from scalar arg
+    uint64_t scale_value = orch_args.scalar(0);
+    uint64_t q_head_num = num_heads;
+    uint64_t q_tile = std::min(num_heads, 128UL);
+    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
+    CYCLE_COUNT_LAP(prof_param_extract);
+
+    // Reshape tensors for kernel consumption (2D flattened)
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+#ifdef ENABLE_PROFILING
+    CYCLE_COUNT_LAP(prof_ext_tensor);
+#endif
+
+    // Create infos are loop-invariant — shapes depend only on q_tile/head_dim
+    uint32_t oi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t li_shapes[1] = {static_cast<uint32_t>(q_tile)};
+    TensorCreateInfo tile2d_ci(oi_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(li_shapes, 1, DataType::FLOAT32);
+#ifdef ENABLE_PROFILING
+    prof_make_count += 2;
+    CYCLE_COUNT_LAP(prof_make_tensor);
+#endif
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            CYCLE_COUNT_LAP(prof_scope_and_loop);
+            PTO2_SCOPE(PTO2ScopeMode::MANUAL) {
+                uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
+
+                uint32_t qi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+                uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor qi = query.view(qi_shapes, qi_offsets);
+                uint32_t out_view_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+                uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor out_view = out.view(out_view_shapes, out_view_offsets, true);
+#ifdef ENABLE_PROFILING
+                prof_view_count += 2;
+                CYCLE_COUNT_LAP(prof_tensor_view);
+#endif
+                CYCLE_COUNT_LAP(prof_param_setup);
+                TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
+                const Tensor &oi = alloc_outs.get_ref(0);
+                const Tensor &li_update = alloc_outs.get_ref(1);
+                const Tensor &mi_update = alloc_outs.get_ref(2);
+                PTO2TaskId pre_task_id;
+#ifdef ENABLE_PROFILING
+                prof_submit_count++;
+                CYCLE_COUNT_LAP(prof_submit_task);
+#endif
+
+                // Reusable Arg objects — reset() before each use avoids
+                // repeated stack-frame construction in the inner loop.
+                L0TaskArgs params_qk, params_sf, params_pv, params_up;
+
+                for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) {
+                    uint64_t n_blocks = std::min(static_cast<uint64_t>(N_UNROLL), bn_this_batch - bn);
+
+                    // Valid length for last block in this group
+                    uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size;
+                    uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start);
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    // === Task 1: Batched QK matmul ===
+                    uint32_t sij_buf_shapes[2] = {
+                        static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)
+                    };
+                    TensorCreateInfo sij_buf_ci(sij_buf_shapes, 2, DataType::FLOAT32);
+#ifdef ENABLE_PROFILING
+                    prof_make_count += 1;
+                    CYCLE_COUNT_LAP(prof_make_tensor);
+#endif
+
+                    params_qk.reset();
+                    params_qk.add_input(qi);
+                    params_qk.add_input(key_cache);
+                    params_qk.add_input(block_table);
+                    params_qk.add_output(sij_buf_ci);
+                    params_qk.add_scalar(n_blocks);
+                    params_qk.add_scalar(b_idx * block_num + bn);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+                    const Tensor &sij_buf = qk_outs.get_ref(0);
+#ifdef ENABLE_PROFILING
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
+#endif
+
+                    // === Task 2: Two-pass softmax over all blocks in group ===
+                    uint32_t pij_buf_shapes[2] = {
+                        static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)
+                    };
+                    TensorCreateInfo pij_buf_ci(pij_buf_shapes, 2, data_type);
+#ifdef ENABLE_PROFILING
+                    prof_make_count += 1;
+                    CYCLE_COUNT_LAP(prof_make_tensor);
+#endif
+
+                    params_sf.reset();
+                    params_sf.add_input(sij_buf);
+                    params_sf.add_output(pij_buf_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_output(scalar_ci);
+                    PTO2TaskId sf_deps[] = {qk_outs.task_id()};
+                    params_sf.set_dependencies(sf_deps, 1);
+                    params_sf.add_scalar(scale_value);
+                    params_sf.add_scalar(n_blocks);
+                    params_sf.add_scalar(valid_len_last);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+                    const Tensor &pij_buf = sf_outs.get_ref(0);
+                    const Tensor &mi = sf_outs.get_ref(1);
+                    const Tensor &li = sf_outs.get_ref(2);
+#ifdef ENABLE_PROFILING
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
+#endif
+
+                    // === Task 3: SplitK PV matmul (accumulated P @ V) ===
+                    params_pv.reset();
+                    params_pv.add_input(pij_buf);
+                    params_pv.add_input(value_cache);
+                    params_pv.add_input(block_table);
+                    params_pv.add_output(tile2d_ci);
+                    PTO2TaskId pv_deps[] = {sf_outs.task_id()};
+                    params_pv.set_dependencies(pv_deps, 1);
+                    params_pv.add_scalar(n_blocks);
+                    params_pv.add_scalar(b_idx * block_num + bn);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+                    const Tensor &oi_new = pv_outs.get_ref(0);
+#ifdef ENABLE_PROFILING
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
+#endif
+
+                    // === Task 4: Online update (per-group) ===
+                    uint64_t is_first = (bn == 0) ? 1 : 0;
+                    uint64_t is_last = (bn + n_blocks >= bn_this_batch) ? 1 : 0;
+
+                    params_up.reset();
+                    params_up.add_input(mi);
+                    params_up.add_input(li);
+                    params_up.add_input(oi_new);
+                    params_up.add_inout(mi_update);
+                    params_up.add_inout(li_update);
+                    params_up.add_inout(oi);
+                    params_up.add_inout(out_view);
+                    PTO2TaskId up_deps[3];
+                    uint32_t up_dep_count = 0;
+                    up_deps[up_dep_count++] = pv_outs.task_id();
+                    if (!is_first) {
+                        up_deps[up_dep_count++] = pre_task_id;
+                    }
+                    // alloc completes inline; this dep only keeps the scratch buffers alive until the last consumer.
+                    if (is_last) {
+                        up_deps[up_dep_count++] = alloc_outs.task_id();
+                    }
+                    params_up.set_dependencies(up_deps, up_dep_count);
+                    params_up.add_scalar(is_first);
+                    params_up.add_scalar(is_last);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors update_outs = rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                    pre_task_id = update_outs.task_id();
+#ifdef ENABLE_PROFILING
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
+#endif
+                }
+            }
+            CYCLE_COUNT_LAP(prof_scope_and_loop);
+        }
+    }
+    CYCLE_COUNT_LAP(prof_scope_and_loop);
+
+#ifdef ENABLE_PROFILING
+    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
+                     prof_submit_task + prof_scope_and_loop;
+    LOG_INFO_V9(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
+        prof_make_count, prof_view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_INFO_V9(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
+            prof_param_extract * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
+            prof_make_tensor * 100.0 / total,
+            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
+            prof_tensor_view * 100.0 / total,
+            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
+            prof_submit_task * 100.0 / total,
+            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  scope_and_loop   : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope_and_loop),
+            prof_scope_and_loop * 100.0 / total
+        );
+    }
+#endif
+
+#undef CYCLE_COUNT_START
+#undef CYCLE_COUNT_LAP
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py
new file mode 100644
index 000000000..34cbdde6c
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention unroll manual-scope wrapper for A2A3 tensormap_and_ringbuffer."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestPagedAttentionUnrollManualScope(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp
new file mode 100644
index 000000000..968515353
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: down_proj
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void down_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4) {
+    unsigned v5 = 0;
+    const int32_t v6 = 68;
+    const int32_t v7 = 0;
+    const int32_t v8 = 256;
+    const int32_t v9 = 128;
+    const int32_t v10 = 5120;
+    const int32_t v11 = 1;
+    const int32_t v12 = 17408;
+    const int32_t v13 = 16;
+    const int64_t v14 = 32768;
+    const int64_t v15 = 4096;
+    const int64_t v16 = 8192;
+    const int64_t v17 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v18 = (size_t)v11;
+    Tile<
+        TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v19 = Tile<
+            TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v13, v8);
+    uint64_t v20 = (uint64_t)v17;
+    TASSIGN(v19, v20);
+    pto::Shape<1, 1, 1, 16, 256> v21 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<278528, 278528, 278528, 17408, 1> v22 = pto::Stride<278528, 278528, 278528, 17408, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>
+        v23 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>(
+            v1 + (v5 + v5 * (unsigned)v12 + v5 * (unsigned)v11), v21, v22
+        );
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    TLOAD(v19, v23);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    Tile<
+        TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v24 = Tile<
+            TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v8, v9);
+    uint64_t v25 = (uint64_t)v16;
+    TASSIGN(v24, v25);
+    pto::Shape<1, 1, 1, 256, 128> v26 = pto::Shape<1, 1, 1, 256, 128>();
+    pto::Stride<1310720, 1310720, 1310720, 5120, 1> v27 = pto::Stride<1310720, 1310720, 1310720, 5120, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>, pto::Layout::ND>
+        v28 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>,
+            pto::Layout::ND>(v2 + (v5 + v5 * (unsigned)v10 + (unsigned)v4 * (unsigned)v11), v26, v27);
+    TLOAD(v24, v28);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v29 = Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v30 = (uint64_t)v17;
+    TASSIGN(v29, v30);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TEXTRACT(v29, v19, v7, v7);
+    Tile<
+        TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v31 = Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v9);
+    uint64_t v32 = (uint64_t)v17;
+    TASSIGN(v31, v32);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TEXTRACT(v31, v24, v7, v7);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v33 = Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v34 = (uint64_t)v15;
+    TASSIGN(v33, v34);
+    TEXTRACT(v33, v19, v7, v9);
+    Tile<
+        TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v35 = Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v9);
+    uint64_t v36 = (uint64_t)v14;
+    TASSIGN(v35, v36);
+    TEXTRACT(v35, v24, v9, v7);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    Tile<
+        TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v37 = Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v38 = (uint64_t)v17;
+    TASSIGN(v37, v38);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    TMATMUL(v37, v29, v31);
+    Tile<
+        TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v39 = Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v40 = (uint64_t)v17;
+    TASSIGN(v39, v40);
+    pipe_barrier(PIPE_M);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    TMATMUL_ACC(v39, v39, v33, v35);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    for (size_t v41 = v18; v41 < ((size_t)v6); v41 += v18) {
+        int32_t v42 = (int32_t)((uint32_t)((int32_t)v41) * (uint32_t)v8);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v43 = Tile<
+                TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v13, v8);
+        uint64_t v44 = (uint64_t)v17;
+        TASSIGN(v43, v44);
+        pto::Shape<1, 1, 1, 16, 256> v45 = pto::Shape<1, 1, 1, 16, 256>();
+        pto::Stride<278528, 278528, 278528, 17408, 1> v46 = pto::Stride<278528, 278528, 278528, 17408, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>
+            v47 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>,
+                pto::Layout::ND>(v1 + (v5 + v5 * (unsigned)v12 + (unsigned)v42 * (unsigned)v11), v45, v46);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(v43, v47);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        Tile<
+            TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v48 = Tile<
+                TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v8, v9);
+        uint64_t v49 = (uint64_t)v16;
+        TASSIGN(v48, v49);
+        pto::Shape<1, 1, 1, 256, 128> v50 = pto::Shape<1, 1, 1, 256, 128>();
+        pto::Stride<1310720, 1310720, 1310720, 5120, 1> v51 = pto::Stride<1310720, 1310720, 1310720, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>, pto::Layout::ND>
+            v52 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>,
+                pto::Layout::ND>(v2 + (v5 + (unsigned)v42 * (unsigned)v10 + (unsigned)v4 * (unsigned)v11), v50, v51);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v48, v52);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v53 = Tile<
+                TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v54 = (uint64_t)v17;
+        TASSIGN(v53, v54);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v53, v43, v7, v7);
+        Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v55 = Tile<
+                TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                PadValue::Null, CompactMode::Null>(v9, v9);
+        uint64_t v56 = (uint64_t)v17;
+        TASSIGN(v55, v56);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        TEXTRACT(v55, v48, v7, v7);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v57 = Tile<
+                TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v58 = (uint64_t)v15;
+        TASSIGN(v57, v58);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        TEXTRACT(v57, v43, v7, v9);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v59 = Tile<
+                TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                PadValue::Null, CompactMode::Null>(v9, v9);
+        uint64_t v60 = (uint64_t)v14;
+        TASSIGN(v59, v60);
+        TEXTRACT(v59, v48, v9, v7);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v61 = Tile<
+                TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v62 = (uint64_t)v17;
+        TASSIGN(v61, v62);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        pipe_barrier(PIPE_M);
+        TMATMUL_ACC(v61, v61, v53, v55);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v63 = Tile<
+                TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v64 = (uint64_t)v17;
+        TASSIGN(v63, v64);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        TMATMUL_ACC(v63, v63, v57, v59);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    }
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 128> v65 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<2048, 2048, 2048, 128, 1> v66 = pto::Stride<2048, 2048, 2048, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> v67 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+            v3 + (v5 + v5 * (unsigned)v9 + v5 * (unsigned)v11), v65, v66
+        );
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(v67, v39);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: mlp_tile__rv_v2
+    __gm__ Tensor *mlp_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *mlp_tile__rv_v2 = reinterpret_cast<__gm__ bfloat16_t *>(mlp_tile__rv_v2_tensor->buffer.addr) +
+                                         mlp_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: w_down__ssa_v0
+    __gm__ Tensor *w_down__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *w_down__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(w_down__ssa_v0_tensor->buffer.addr) + w_down__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: fp32_chunk_gm__ssa_v0
+    __gm__ Tensor *fp32_chunk_gm__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *fp32_chunk_gm__ssa_v0 = reinterpret_cast<__gm__ float *>(fp32_chunk_gm__ssa_v0_tensor->buffer.addr) +
+                                          fp32_chunk_gm__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: d0__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } d0__ssa_v0_conv;
+    d0__ssa_v0_conv.u64 = args[3];
+    int64_t d0__ssa_v0 = d0__ssa_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    down_proj(mlp_tile__rv_v2, w_down__ssa_v0, fp32_chunk_gm__ssa_v0, d0__ssa_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp
new file mode 100644
index 000000000..11a0493e3
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: gate_proj
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void gate_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4) {
+    unsigned v5 = 0;
+    const int32_t v6 = 40;
+    const int32_t v7 = 64;
+    const int32_t v8 = 0;
+    const int32_t v9 = 128;
+    const int32_t v10 = 256;
+    const int32_t v11 = 17408;
+    const int32_t v12 = 1;
+    const int32_t v13 = 5120;
+    const int32_t v14 = 16;
+    const int64_t v15 = 32768;
+    const int64_t v16 = 2048;
+    const int64_t v17 = 4096;
+    const int64_t v18 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v19 = (size_t)v12;
+    Tile<
+        TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v20 = Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v9);
+    uint64_t v21 = (uint64_t)v18;
+    TASSIGN(v20, v21);
+    pto::Shape<1, 1, 1, 16, 128> v22 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v23 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+        v24 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v1 + (v5 + v5 * (unsigned)v13 + v5 * (unsigned)v12), v22, v23
+        );
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    TLOAD(v20, v24);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    Tile<
+        TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v25 = Tile<
+            TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v10);
+    uint64_t v26 = (uint64_t)v17;
+    TASSIGN(v25, v26);
+    pto::Shape<1, 1, 1, 128, 256> v27 = pto::Shape<1, 1, 1, 128, 256>();
+    pto::Stride<2228224, 2228224, 2228224, 17408, 1> v28 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, pto::Layout::ND>
+        v29 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+            pto::Layout::ND>(v2 + (v5 + v5 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v27, v28);
+    TLOAD(v25, v29);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v30 = Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v7);
+    uint64_t v31 = (uint64_t)v18;
+    TASSIGN(v30, v31);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TEXTRACT(v30, v20, v8, v8);
+    Tile<
+        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v32 = Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v7, v10);
+    uint64_t v33 = (uint64_t)v18;
+    TASSIGN(v32, v33);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TEXTRACT(v32, v25, v8, v8);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v34 = Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v7);
+    uint64_t v35 = (uint64_t)v16;
+    TASSIGN(v34, v35);
+    TEXTRACT(v34, v20, v8, v7);
+    Tile<
+        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v36 = Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v7, v10);
+    uint64_t v37 = (uint64_t)v15;
+    TASSIGN(v36, v37);
+    TEXTRACT(v36, v25, v7, v8);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    Tile<
+        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v38 = Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v14, v10);
+    uint64_t v39 = (uint64_t)v18;
+    TASSIGN(v38, v39);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    TMATMUL(v38, v30, v32);
+    Tile<
+        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v40 = Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v14, v10);
+    uint64_t v41 = (uint64_t)v18;
+    TASSIGN(v40, v41);
+    pipe_barrier(PIPE_M);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    TMATMUL_ACC(v40, v40, v34, v36);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    for (size_t v42 = v19; v42 < ((size_t)v6); v42 += v19) {
+        int32_t v43 = (int32_t)((uint32_t)((int32_t)v42) * (uint32_t)v9);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v9);
+        uint64_t v45 = (uint64_t)v18;
+        TASSIGN(v44, v45);
+        pto::Shape<1, 1, 1, 16, 128> v46 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v47 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v48 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v5 + v5 * (unsigned)v13 + (unsigned)v43 * (unsigned)v12), v46, v47
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(v44, v48);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        Tile<
+            TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v49 = Tile<
+                TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v9, v10);
+        uint64_t v50 = (uint64_t)v17;
+        TASSIGN(v49, v50);
+        pto::Shape<1, 1, 1, 128, 256> v51 = pto::Shape<1, 1, 1, 128, 256>();
+        pto::Stride<2228224, 2228224, 2228224, 17408, 1> v52 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+            pto::Layout::ND>
+            v53 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+                pto::Layout::ND>(v2 + (v5 + (unsigned)v43 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v51, v52);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v49, v53);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v54 = Tile<
+                TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v7);
+        uint64_t v55 = (uint64_t)v18;
+        TASSIGN(v54, v55);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v54, v44, v8, v8);
+        Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v56 = Tile<
+                TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v57 = (uint64_t)v18;
+        TASSIGN(v56, v57);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        TEXTRACT(v56, v49, v8, v8);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v58 = Tile<
+                TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v7);
+        uint64_t v59 = (uint64_t)v16;
+        TASSIGN(v58, v59);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        TEXTRACT(v58, v44, v8, v7);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v60 = Tile<
+                TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v61 = (uint64_t)v15;
+        TASSIGN(v60, v61);
+        TEXTRACT(v60, v49, v7, v8);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v62 = Tile<
+                TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v14, v10);
+        uint64_t v63 = (uint64_t)v18;
+        TASSIGN(v62, v63);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        pipe_barrier(PIPE_M);
+        TMATMUL_ACC(v62, v62, v54, v56);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v64 = Tile<
+                TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v14, v10);
+        uint64_t v65 = (uint64_t)v18;
+        TASSIGN(v64, v65);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        TMATMUL_ACC(v64, v64, v58, v60);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    }
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 256> v66 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<4096, 4096, 4096, 256, 1> v67 = pto::Stride<4096, 4096, 4096, 256, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v68 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+            v3 + (v5 + v5 * (unsigned)v10 + v5 * (unsigned)v12), v66, v67
+        );
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(v68, v40);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: post_norm_tile__rv_v2
+    __gm__ Tensor *post_norm_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *post_norm_tile__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(post_norm_tile__rv_v2_tensor->buffer.addr) +
+        post_norm_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: w_gate__ssa_v0
+    __gm__ Tensor *w_gate__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *w_gate__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(w_gate__ssa_v0_tensor->buffer.addr) + w_gate__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: ret0__out
+    __gm__ Tensor *ret0__out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *ret0__out =
+        reinterpret_cast<__gm__ float *>(ret0__out_tensor->buffer.addr) + ret0__out_tensor->start_offset;
+
+    // Unpack scalar: o0__ssa_v1
+    union {
+        uint64_t u64;
+        int64_t val;
+    } o0__ssa_v1_conv;
+    o0__ssa_v1_conv.u64 = args[3];
+    int64_t o0__ssa_v1 = o0__ssa_v1_conv.val;
+
+    // Forward to ptoas-generated function
+    gate_proj(post_norm_tile__rv_v2, w_gate__ssa_v0, ret0__out, o0__ssa_v1);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp
new file mode 100644
index 000000000..2e865ac03
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: kv_proj
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void kv_proj(
+    __gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, __gm__ bfloat16_t *v4, __gm__ bfloat16_t *v5, int32_t v6,
+    int32_t v7
+) {
+    unsigned v8 = 0;
+    const int32_t v9 = 10;
+    const int32_t v10 = 256;
+    const int32_t v11 = 0;
+    const int32_t v12 = 512;
+    const int32_t v13 = 64;
+    const int32_t v14 = 4;
+    const int32_t v15 = 5120;
+    const int32_t v16 = 1;
+    const int32_t v17 = 1024;
+    const int32_t v18 = 16;
+    const int64_t v19 = 32768;
+    const int64_t v20 = 8192;
+    const int64_t v21 = 4096;
+    const int64_t v22 = 16384;
+    const int64_t v23 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v24 = (size_t)v16;
+    size_t v25 = (size_t)v9;
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID6);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
+    for (size_t v26 = (size_t)v6; v26 < ((size_t)((int32_t)(uint32_t)v6 + (uint32_t)v14)); v26 += v24) {
+        int32_t v27 = (int32_t)((uint32_t)((int32_t)v26) * (uint32_t)v13);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v28 = Tile<
+                TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v18, v12);
+        uint64_t v29 = (uint64_t)v23;
+        TASSIGN(v28, v29);
+        pto::Shape<1, 1, 1, 16, 512> v30 = pto::Shape<1, 1, 1, 16, 512>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v31 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v32 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v3 + (v8 + v8 * (unsigned)v15 + v8 * (unsigned)v16), v30, v31
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v28, v32);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        Tile<
+            TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v33 = Tile<
+                TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v12, v13);
+        uint64_t v34 = (uint64_t)v22;
+        TASSIGN(v33, v34);
+        pto::Shape<1, 1, 1, 512, 64> v35 = pto::Shape<1, 1, 1, 512, 64>();
+        pto::Stride<524288, 524288, 524288, 1024, 1> v36 = pto::Stride<524288, 524288, 524288, 1024, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND>
+            v37 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>,
+                pto::Layout::ND>(v4 + (v8 + v8 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v35, v36);
+        TLOAD(v33, v37);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v38 = Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v18, v10);
+        uint64_t v39 = (uint64_t)v20;
+        TASSIGN(v38, v39);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        TEXTRACT(v38, v28, v11, v11);
+        Tile<
+            TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v40 = Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v10, v13);
+        uint64_t v41 = (uint64_t)v19;
+        TASSIGN(v40, v41);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v40, v33, v11, v11);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v42 = Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v18, v10);
+        uint64_t v43 = (uint64_t)v23;
+        TASSIGN(v42, v43);
+        TEXTRACT(v42, v28, v11, v10);
+        Tile<
+            TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v10, v13);
+        uint64_t v45 = (uint64_t)v23;
+        TASSIGN(v44, v45);
+        TEXTRACT(v44, v33, v10, v11);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v46 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v18, v13);
+        uint64_t v47 = (uint64_t)v21;
+        TASSIGN(v46, v47);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+        TMATMUL(v46, v38, v40);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v48 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v18, v13);
+        uint64_t v49 = (uint64_t)v21;
+        TASSIGN(v48, v49);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+        TMATMUL_ACC(v48, v48, v42, v44);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        for (size_t v50 = v24; v50 < v25; v50 += v24) {
+            int32_t v51 = (int32_t)((uint32_t)((int32_t)v50) * (uint32_t)v12);
+            Tile<
+                TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v52 = Tile<
+                    TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v18, v12);
+            uint64_t v53 = (uint64_t)v23;
+            TASSIGN(v52, v53);
+            pto::Shape<1, 1, 1, 16, 512> v54 = pto::Shape<1, 1, 1, 16, 512>();
+            pto::Stride<81920, 81920, 81920, 5120, 1> v55 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+            GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+                v56 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>,
+                    pto::Layout::ND>(v3 + (v8 + v8 * (unsigned)v15 + (unsigned)v51 * (unsigned)v16), v54, v55);
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+            TLOAD(v52, v56);
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+            Tile<
+                TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v57 = Tile<
+                    TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v12, v13);
+            uint64_t v58 = (uint64_t)v22;
+            TASSIGN(v57, v58);
+            pto::Shape<1, 1, 1, 512, 64> v59 = pto::Shape<1, 1, 1, 512, 64>();
+            pto::Stride<524288, 524288, 524288, 1024, 1> v60 = pto::Stride<524288, 524288, 524288, 1024, 1>();
+            GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND>
+                v61 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>,
+                    pto::Layout::ND>(
+                    v4 + (v8 + (unsigned)v51 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v59, v60
+                );
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+            TLOAD(v57, v61);
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+            Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v62 = Tile<
+                    TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v18, v10);
+            uint64_t v63 = (uint64_t)v20;
+            TASSIGN(v62, v63);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+            TEXTRACT(v62, v52, v11, v11);
+            Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v64 = Tile<
+                    TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v10, v13);
+            uint64_t v65 = (uint64_t)v19;
+            TASSIGN(v64, v65);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+            TEXTRACT(v64, v57, v11, v11);
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+            Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v66 = Tile<
+                    TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v18, v10);
+            uint64_t v67 = (uint64_t)v23;
+            TASSIGN(v66, v67);
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+            TEXTRACT(v66, v52, v11, v10);
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+            Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v68 = Tile<
+                    TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v10, v13);
+            uint64_t v69 = (uint64_t)v23;
+            TASSIGN(v68, v69);
+            TEXTRACT(v68, v57, v10, v11);
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+            Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>
+                v70 = Tile<
+                    TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>(v18, v13);
+            uint64_t v71 = (uint64_t)v21;
+            TASSIGN(v70, v71);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+            pipe_barrier(PIPE_M);
+            TMATMUL_ACC(v70, v70, v62, v64);
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+            Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>
+                v72 = Tile<
+                    TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>(v18, v13);
+            uint64_t v73 = (uint64_t)v21;
+            TASSIGN(v72, v73);
+            pipe_barrier(PIPE_M);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+            TMATMUL_ACC(v72, v72, v66, v68);
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+        };
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID4);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID4);
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        pto::Shape<1, 1, 1, 16, 64> v74 = pto::Shape<1, 1, 1, 16, 64>();
+        pto::Stride<16384, 16384, 16384, 1024, 1> v75 = pto::Stride<16384, 16384, 16384, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>
+            v76 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>(
+                v1 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v74, v75
+            );
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        TSTORE(v76, v48);
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v77 = Tile<
+                TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v18, v12);
+        uint64_t v78 = (uint64_t)v23;
+        TASSIGN(v77, v78);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID4);
+        TLOAD(v77, v32);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID4);
+        Tile<
+            TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v79 = Tile<
+                TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v12, v13);
+        uint64_t v80 = (uint64_t)v22;
+        TASSIGN(v79, v80);
+        pto::Shape<1, 1, 1, 512, 64> v81 = pto::Shape<1, 1, 1, 512, 64>();
+        pto::Stride<524288, 524288, 524288, 1024, 1> v82 = pto::Stride<524288, 524288, 524288, 1024, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND>
+            v83 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>,
+                pto::Layout::ND>(v5 + (v8 + v8 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v81, v82);
+        TLOAD(v79, v83);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID5);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v84 = Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v18, v10);
+        uint64_t v85 = (uint64_t)v20;
+        TASSIGN(v84, v85);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID4);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID4);
+        TEXTRACT(v84, v77, v11, v11);
+        Tile<
+            TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v86 = Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v10, v13);
+        uint64_t v87 = (uint64_t)v19;
+        TASSIGN(v86, v87);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID5);
+        TEXTRACT(v86, v79, v11, v11);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID4);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v88 = Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v18, v10);
+        uint64_t v89 = (uint64_t)v23;
+        TASSIGN(v88, v89);
+        TEXTRACT(v88, v77, v11, v10);
+        Tile<
+            TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v90 = Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v10, v13);
+        uint64_t v91 = (uint64_t)v23;
+        TASSIGN(v90, v91);
+        TEXTRACT(v90, v79, v10, v11);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID5);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID5);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v92 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v18, v13);
+        uint64_t v93 = (uint64_t)v21;
+        TASSIGN(v92, v93);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID4);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        TMATMUL(v92, v84, v86);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v94 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v18, v13);
+        uint64_t v95 = (uint64_t)v21;
+        TASSIGN(v94, v95);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID5);
+        TMATMUL_ACC(v94, v94, v88, v90);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID5);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID5);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID5);
+        for (size_t v96 = v24; v96 < v25; v96 += v24) {
+            int32_t v97 = (int32_t)((uint32_t)((int32_t)v96) * (uint32_t)v12);
+            Tile<
+                TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v98 = Tile<
+                    TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v18, v12);
+            uint64_t v99 = (uint64_t)v23;
+            TASSIGN(v98, v99);
+            pto::Shape<1, 1, 1, 16, 512> v100 = pto::Shape<1, 1, 1, 16, 512>();
+            pto::Stride<81920, 81920, 81920, 5120, 1> v101 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+            GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+                v102 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>,
+                    pto::Layout::ND>(v3 + (v8 + v8 * (unsigned)v15 + (unsigned)v97 * (unsigned)v16), v100, v101);
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6);
+            TLOAD(v98, v102);
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID6);
+            Tile<
+                TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v103 = Tile<
+                    TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v12, v13);
+            uint64_t v104 = (uint64_t)v22;
+            TASSIGN(v103, v104);
+            pto::Shape<1, 1, 1, 512, 64> v105 = pto::Shape<1, 1, 1, 512, 64>();
+            pto::Stride<524288, 524288, 524288, 1024, 1> v106 = pto::Stride<524288, 524288, 524288, 1024, 1>();
+            GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND>
+                v107 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>,
+                    pto::Layout::ND>(
+                    v5 + (v8 + (unsigned)v97 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v105, v106
+                );
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7);
+            TLOAD(v103, v107);
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID7);
+            Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v108 = Tile<
+                    TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v18, v10);
+            uint64_t v109 = (uint64_t)v20;
+            TASSIGN(v108, v109);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID6);
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID6);
+            TEXTRACT(v108, v98, v11, v11);
+            Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v110 = Tile<
+                    TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v10, v13);
+            uint64_t v111 = (uint64_t)v19;
+            TASSIGN(v110, v111);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID7);
+            TEXTRACT(v110, v103, v11, v11);
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID6);
+            Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v112 = Tile<
+                    TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v18, v10);
+            uint64_t v113 = (uint64_t)v23;
+            TASSIGN(v112, v113);
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
+            TEXTRACT(v112, v98, v11, v10);
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6);
+            Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v114 = Tile<
+                    TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v10, v13);
+            uint64_t v115 = (uint64_t)v23;
+            TASSIGN(v114, v115);
+            TEXTRACT(v114, v103, v10, v11);
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID7);
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7);
+            Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>
+                v116 = Tile<
+                    TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>(v18, v13);
+            uint64_t v117 = (uint64_t)v21;
+            TASSIGN(v116, v117);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID6);
+            pipe_barrier(PIPE_M);
+            TMATMUL_ACC(v116, v116, v108, v110);
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID6);
+            Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>
+                v118 = Tile<
+                    TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>(v18, v13);
+            uint64_t v119 = (uint64_t)v21;
+            TASSIGN(v118, v119);
+            pipe_barrier(PIPE_M);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID7);
+            TMATMUL_ACC(v118, v118, v112, v114);
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
+        };
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID1);
+        pto::Shape<1, 1, 1, 16, 64> v120 = pto::Shape<1, 1, 1, 16, 64>();
+        pto::Stride<16384, 16384, 16384, 1024, 1> v121 = pto::Stride<16384, 16384, 16384, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>
+            v122 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>(
+                v2 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v120, v121
+            );
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID1);
+        TSTORE(v122, v94);
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID6);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: k_proj__iter_v3
+    __gm__ Tensor *k_proj__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *k_proj__iter_v3 =
+        reinterpret_cast<__gm__ float *>(k_proj__iter_v3_tensor->buffer.addr) + k_proj__iter_v3_tensor->start_offset;
+
+    // Unpack tensor: v_proj__iter_v3
+    __gm__ Tensor *v_proj__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *v_proj__iter_v3 =
+        reinterpret_cast<__gm__ float *>(v_proj__iter_v3_tensor->buffer.addr) + v_proj__iter_v3_tensor->start_offset;
+
+    // Unpack tensor: normed_tile__rv_v2
+    __gm__ Tensor *normed_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *normed_tile__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(normed_tile__rv_v2_tensor->buffer.addr) +
+        normed_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: wk__ssa_v0
+    __gm__ Tensor *wk__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ bfloat16_t *wk__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(wk__ssa_v0_tensor->buffer.addr) + wk__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: wv__ssa_v0
+    __gm__ Tensor *wv__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ bfloat16_t *wv__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(wv__ssa_v0_tensor->buffer.addr) + wv__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: ob_chunk__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ob_chunk__idx_v0_conv;
+    ob_chunk__idx_v0_conv.u64 = args[5];
+    int64_t ob_chunk__idx_v0 = ob_chunk__idx_v0_conv.val;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[6];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    kv_proj(k_proj__iter_v3, v_proj__iter_v3, normed_tile__rv_v2, wk__ssa_v0, wv__ssa_v0, ob_chunk__idx_v0, b0__idx_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp
new file mode 100644
index 000000000..327c26af5
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: out_proj
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void
+out_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4, int32_t v5) {
+    unsigned v6 = 0;
+    const int32_t v7 = 40;
+    const int32_t v8 = 128;
+    const int32_t v9 = 64;
+    const int32_t v10 = 1;
+    const int32_t v11 = 5120;
+    const int32_t v12 = 16;
+    const int64_t v13 = 4096;
+    const int64_t v14 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v15 = (size_t)v10;
+    Tile<
+        TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v16 = Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v12, v8);
+    uint64_t v17 = (uint64_t)v14;
+    TASSIGN(v16, v17);
+    pto::Shape<1, 1, 1, 16, 128> v18 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v19 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+        v20 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v1 + (v6 + (unsigned)v4 * (unsigned)v11 + v6 * (unsigned)v10), v18, v19
+        );
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    TLOAD(v16, v20);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    Tile<
+        TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v21 = Tile<
+            TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v8, v9);
+    uint64_t v22 = (uint64_t)v13;
+    TASSIGN(v21, v22);
+    pto::Shape<1, 1, 1, 128, 64> v23 = pto::Shape<1, 1, 1, 128, 64>();
+    pto::Stride<655360, 655360, 655360, 5120, 1> v24 = pto::Stride<655360, 655360, 655360, 5120, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, pto::Layout::ND>
+        v25 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, pto::Layout::ND>(
+            v2 + (v6 + v6 * (unsigned)v11 + (unsigned)v5 * (unsigned)v10), v23, v24
+        );
+    TLOAD(v21, v25);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v26 = Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v12, v8);
+    uint64_t v27 = (uint64_t)v14;
+    TASSIGN(v26, v27);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TMOV(v26, v16);
+    Tile<
+        TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v28 = Tile<
+            TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v8, v9);
+    uint64_t v29 = (uint64_t)v14;
+    TASSIGN(v28, v29);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TMOV(v28, v21);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    Tile<
+        TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v30 = Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v12, v9);
+    uint64_t v31 = (uint64_t)v14;
+    TASSIGN(v30, v31);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    TMATMUL(v30, v26, v28);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    for (size_t v32 = v15; v32 < ((size_t)v7); v32 += v15) {
+        int32_t v33 = (int32_t)((uint32_t)((int32_t)v32) * (uint32_t)v8);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v34 = Tile<
+                TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v12, v8);
+        uint64_t v35 = (uint64_t)v14;
+        TASSIGN(v34, v35);
+        pto::Shape<1, 1, 1, 16, 128> v36 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v37 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v38 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v6 + (unsigned)v4 * (unsigned)v11 + (unsigned)v33 * (unsigned)v10), v36, v37
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(v34, v38);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        Tile<
+            TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v39 = Tile<
+                TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v8, v9);
+        uint64_t v40 = (uint64_t)v13;
+        TASSIGN(v39, v40);
+        pto::Shape<1, 1, 1, 128, 64> v41 = pto::Shape<1, 1, 1, 128, 64>();
+        pto::Stride<655360, 655360, 655360, 5120, 1> v42 = pto::Stride<655360, 655360, 655360, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, pto::Layout::ND>
+            v43 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>,
+                pto::Layout::ND>(v2 + (v6 + (unsigned)v33 * (unsigned)v11 + (unsigned)v5 * (unsigned)v10), v41, v42);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v39, v43);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v12, v8);
+        uint64_t v45 = (uint64_t)v14;
+        TASSIGN(v44, v45);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TMOV(v44, v34);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        Tile<
+            TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v46 = Tile<
+                TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v8, v9);
+        uint64_t v47 = (uint64_t)v14;
+        TASSIGN(v46, v47);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        TMOV(v46, v39);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v48 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v12, v9);
+        uint64_t v49 = (uint64_t)v14;
+        TASSIGN(v48, v49);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+        TMATMUL_ACC(v48, v48, v44, v46);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    }
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 64> v50 = pto::Shape<1, 1, 1, 16, 64>();
+    pto::Stride<1024, 1024, 1024, 64, 1> v51 = pto::Stride<1024, 1024, 1024, 64, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND> v52 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND>(
+            v3 + (v6 + v6 * (unsigned)v9 + v6 * (unsigned)v10), v50, v51
+        );
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(v52, v30);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: attn_out__rv_v2
+    __gm__ Tensor *attn_out__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *attn_out__rv_v2 = reinterpret_cast<__gm__ bfloat16_t *>(attn_out__rv_v2_tensor->buffer.addr) +
+                                         attn_out__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: wo__ssa_v0
+    __gm__ Tensor *wo__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *wo__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(wo__ssa_v0_tensor->buffer.addr) + wo__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: ret0__out
+    __gm__ Tensor *ret0__out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *ret0__out =
+        reinterpret_cast<__gm__ float *>(ret0__out_tensor->buffer.addr) + ret0__out_tensor->start_offset;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[3];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Unpack scalar: o0__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } o0__ssa_v0_conv;
+    o0__ssa_v0_conv.u64 = args[4];
+    int64_t o0__ssa_v0 = o0__ssa_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    out_proj(attn_out__rv_v2, wo__ssa_v0, ret0__out, b0__idx_v0, o0__ssa_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp
new file mode 100644
index 000000000..6bfcd7cb6
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: q_proj
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void q_proj(__gm__ float *v1, __gm__ bfloat16_t *v2, __gm__ bfloat16_t *v3, int32_t v4, int32_t v5) {
+    unsigned v6 = 0;
+    const int32_t v7 = 10;
+    const int32_t v8 = 256;
+    const int32_t v9 = 0;
+    const int32_t v10 = 512;
+    const int32_t v11 = 64;
+    const int32_t v12 = 4;
+    const int32_t v13 = 1;
+    const int32_t v14 = 5120;
+    const int32_t v15 = 16;
+    const int64_t v16 = 32768;
+    const int64_t v17 = 8192;
+    const int64_t v18 = 16384;
+    const int64_t v19 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v20 = (size_t)v13;
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+    for (size_t v21 = (size_t)v4; v21 < ((size_t)((int32_t)(uint32_t)v4 + (uint32_t)v12)); v21 += v20) {
+        int32_t v22 = (int32_t)((uint32_t)((int32_t)v21) * (uint32_t)v11);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v23 = Tile<
+                TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v15, v10);
+        uint64_t v24 = (uint64_t)v19;
+        TASSIGN(v23, v24);
+        pto::Shape<1, 1, 1, 16, 512> v25 = pto::Shape<1, 1, 1, 16, 512>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v26 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v27 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v2 + (v6 + v6 * (unsigned)v14 + v6 * (unsigned)v13), v25, v26
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v23, v27);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        Tile<
+            TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v28 = Tile<
+                TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v10, v11);
+        uint64_t v29 = (uint64_t)v18;
+        TASSIGN(v28, v29);
+        pto::Shape<1, 1, 1, 512, 64> v30 = pto::Shape<1, 1, 1, 512, 64>();
+        pto::Stride<2621440, 2621440, 2621440, 5120, 1> v31 = pto::Stride<2621440, 2621440, 2621440, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>, pto::Layout::ND>
+            v32 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>,
+                pto::Layout::ND>(v3 + (v6 + v6 * (unsigned)v14 + (unsigned)v22 * (unsigned)v13), v30, v31);
+        TLOAD(v28, v32);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v33 = Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v15, v8);
+        uint64_t v34 = (uint64_t)v19;
+        TASSIGN(v33, v34);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        TEXTRACT(v33, v23, v9, v9);
+        Tile<
+            TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v35 = Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v8, v11);
+        uint64_t v36 = (uint64_t)v19;
+        TASSIGN(v35, v36);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v35, v28, v9, v9);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v37 = Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v15, v8);
+        uint64_t v38 = (uint64_t)v17;
+        TASSIGN(v37, v38);
+        TEXTRACT(v37, v23, v9, v8);
+        Tile<
+            TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v39 = Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v8, v11);
+        uint64_t v40 = (uint64_t)v16;
+        TASSIGN(v39, v40);
+        TEXTRACT(v39, v28, v8, v9);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v41 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v15, v11);
+        uint64_t v42 = (uint64_t)v19;
+        TASSIGN(v41, v42);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+        TMATMUL(v41, v33, v35);
+        Tile<
+            TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v43 = Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v15, v11);
+        uint64_t v44 = (uint64_t)v19;
+        TASSIGN(v43, v44);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+        TMATMUL_ACC(v43, v43, v37, v39);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        for (size_t v45 = v20; v45 < ((size_t)v7); v45 += v20) {
+            int32_t v46 = (int32_t)((uint32_t)((int32_t)v45) * (uint32_t)v10);
+            Tile<
+                TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v47 = Tile<
+                    TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v15, v10);
+            uint64_t v48 = (uint64_t)v19;
+            TASSIGN(v47, v48);
+            pto::Shape<1, 1, 1, 16, 512> v49 = pto::Shape<1, 1, 1, 16, 512>();
+            pto::Stride<81920, 81920, 81920, 5120, 1> v50 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+            GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+                v51 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>,
+                    pto::Layout::ND>(v2 + (v6 + v6 * (unsigned)v14 + (unsigned)v46 * (unsigned)v13), v49, v50);
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+            TLOAD(v47, v51);
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+            Tile<
+                TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v52 = Tile<
+                    TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v10, v11);
+            uint64_t v53 = (uint64_t)v18;
+            TASSIGN(v52, v53);
+            pto::Shape<1, 1, 1, 512, 64> v54 = pto::Shape<1, 1, 1, 512, 64>();
+            pto::Stride<2621440, 2621440, 2621440, 5120, 1> v55 = pto::Stride<2621440, 2621440, 2621440, 5120, 1>();
+            GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>,
+                pto::Layout::ND>
+                v56 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>,
+                    pto::Layout::ND>(
+                    v3 + (v6 + (unsigned)v46 * (unsigned)v14 + (unsigned)v22 * (unsigned)v13), v54, v55
+                );
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+            TLOAD(v52, v56);
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+            Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v57 = Tile<
+                    TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v15, v8);
+            uint64_t v58 = (uint64_t)v19;
+            TASSIGN(v57, v58);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+            TEXTRACT(v57, v47, v9, v9);
+            Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v59 = Tile<
+                    TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v8, v11);
+            uint64_t v60 = (uint64_t)v19;
+            TASSIGN(v59, v60);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+            TEXTRACT(v59, v52, v9, v9);
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+            Tile<
+                TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v61 = Tile<
+                    TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v15, v8);
+            uint64_t v62 = (uint64_t)v17;
+            TASSIGN(v61, v62);
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+            TEXTRACT(v61, v47, v9, v8);
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+            Tile<
+                TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>
+                v63 = Tile<
+                    TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>(v8, v11);
+            uint64_t v64 = (uint64_t)v16;
+            TASSIGN(v63, v64);
+            TEXTRACT(v63, v52, v8, v9);
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+            Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>
+                v65 = Tile<
+                    TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>(v15, v11);
+            uint64_t v66 = (uint64_t)v19;
+            TASSIGN(v65, v66);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+            pipe_barrier(PIPE_M);
+            TMATMUL_ACC(v65, v65, v57, v59);
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+            Tile<
+                TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>
+                v67 = Tile<
+                    TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>(v15, v11);
+            uint64_t v68 = (uint64_t)v19;
+            TASSIGN(v67, v68);
+            pipe_barrier(PIPE_M);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+            TMATMUL_ACC(v67, v67, v61, v63);
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+        };
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        pto::Shape<1, 1, 1, 16, 64> v69 = pto::Shape<1, 1, 1, 16, 64>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v70 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v71 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v6 + (unsigned)v5 * (unsigned)v14 + (unsigned)v22 * (unsigned)v13), v69, v70
+            );
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        TSTORE(v71, v43);
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: q_proj__iter_v3
+    __gm__ Tensor *q_proj__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *q_proj__iter_v3 =
+        reinterpret_cast<__gm__ float *>(q_proj__iter_v3_tensor->buffer.addr) + q_proj__iter_v3_tensor->start_offset;
+
+    // Unpack tensor: normed_tile__rv_v2
+    __gm__ Tensor *normed_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *normed_tile__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(normed_tile__rv_v2_tensor->buffer.addr) +
+        normed_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: wq__ssa_v0
+    __gm__ Tensor *wq__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *wq__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(wq__ssa_v0_tensor->buffer.addr) + wq__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: ob_chunk__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ob_chunk__idx_v0_conv;
+    ob_chunk__idx_v0_conv.u64 = args[3];
+    int64_t ob_chunk__idx_v0 = ob_chunk__idx_v0_conv.val;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[4];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    q_proj(q_proj__iter_v3, normed_tile__rv_v2, wq__ssa_v0, ob_chunk__idx_v0, b0__idx_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp
new file mode 100644
index 000000000..bb82ee9fb
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: qk_matmul
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void qk_matmul(
+    __gm__ float *v1, __gm__ bfloat16_t *v2, __gm__ int32_t *v3, __gm__ bfloat16_t *v4, int32_t v5, int32_t v6,
+    int32_t v7, int32_t v8, int32_t v9, int32_t v10
+) {
+    unsigned v11 = 0;
+    const int32_t v12 = 2;
+    const int32_t v13 = 64;
+    const int32_t v14 = 16;
+    const int32_t v15 = 8;
+    const int32_t v16 = 0;
+    const int32_t v17 = 128;
+    const int32_t v18 = 1;
+    const int32_t v19 = 256;
+    const int64_t v20 = 2048;
+    const int64_t v21 = 32768;
+    const int64_t v22 = 4096;
+    const int64_t v23 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v24 = (size_t)v18;
+    size_t v25 = (size_t)v16;
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    for (size_t v26 = v25; v26 < ((size_t)v15); v26 += v24) {
+        int32_t v27 = (int32_t)v26;
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v28 = Tile<
+                TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v17);
+        uint64_t v29 = (uint64_t)v23;
+        TASSIGN(v28, v29);
+        pto::Shape<1, 1, 1, 16, 128> v30 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<2048, 2048, 2048, 128, 1> v31 = pto::Stride<2048, 2048, 2048, 128, 1>();
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>
+            v32 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+                v2 + (v11 +
+                      (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)v5 * (uint32_t)v17) +
+                                 (uint32_t)((int32_t)(uint32_t)v27 * (uint32_t)v14)) *
+                          (unsigned)v17 +
+                      v11 * (unsigned)v18),
+                v30, v31
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        pipe_barrier(PIPE_MTE2);
+        TLOAD(v28, v32);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        for (size_t v33 = v25; v33 < ((size_t)v13); v33 += v24) {
+            int32_t v34 = (int32_t)((uint32_t)v6 + (uint32_t)((int32_t)v33));
+            __gm__ float *v35;
+            if (v34 < v7) {
+                int32_t v36 = v3[(int32_t)((uint32_t)v8 + (uint32_t)v34)];
+                Tile<
+                    TileType::Mat, bfloat16_t, 128, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v37 = Tile<
+                        TileType::Mat, bfloat16_t, 128, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v17, v19);
+                uint64_t v38 = (uint64_t)v22;
+                TASSIGN(v37, v38);
+                pto::Shape<1, 1, 1, 128, 256> v39 = pto::Shape<1, 1, 1, 128, 256>();
+                pto::Stride<128, 128, 128, 1, 128> v40 = pto::Stride<128, 128, 128, 1, 128>();
+                GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<128, 128, 128, 1, 128>, pto::Layout::DN>
+                    v41 = GlobalTensor<
+                        bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<128, 128, 128, 1, 128>, pto::Layout::DN>(
+                        v4 + (v11 + v11 * (unsigned)v18 +
+                              (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v36 *
+                                                                                 (uint32_t)v15) +
+                                                             (uint32_t)v27) *
+                                         (uint32_t)v19) *
+                                  (unsigned)v17),
+                        v39, v40
+                    );
+                wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+                TLOAD(v37, v41);
+                set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+                Tile<
+                    TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v42 = Tile<
+                        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v14, v13);
+                uint64_t v43 = (uint64_t)v23;
+                TASSIGN(v42, v43);
+                wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+                TEXTRACT(v42, v28, v16, v16);
+                Tile<
+                    TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v44 = Tile<
+                        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v13, v19);
+                uint64_t v45 = (uint64_t)v21;
+                TASSIGN(v44, v45);
+                wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+                TEXTRACT(v44, v37, v16, v16);
+                set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+                Tile<
+                    TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v46 = Tile<
+                        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v14, v13);
+                uint64_t v47 = (uint64_t)v20;
+                TASSIGN(v46, v47);
+                wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+                TEXTRACT(v46, v28, v16, v13);
+                Tile<
+                    TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v48 = Tile<
+                        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v13, v19);
+                uint64_t v49 = (uint64_t)v23;
+                TASSIGN(v48, v49);
+                TEXTRACT(v48, v37, v13, v16);
+                set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+                set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+                Tile<
+                    TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>
+                    v50 = Tile<
+                        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024,
+                        PadValue::Null, CompactMode::Null>(v14, v19);
+                uint64_t v51 = (uint64_t)v23;
+                TASSIGN(v50, v51);
+                wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+                wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+                TMATMUL(v50, v42, v44);
+                set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+                Tile<
+                    TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>
+                    v52 = Tile<
+                        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024,
+                        PadValue::Null, CompactMode::Null>(v14, v19);
+                uint64_t v53 = (uint64_t)v23;
+                TASSIGN(v52, v53);
+                pipe_barrier(PIPE_M);
+                wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+                TMATMUL_ACC(v52, v52, v46, v48);
+                set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+                set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+                pto::Shape<1, 1, 1, 16, 256> v54 = pto::Shape<1, 1, 1, 16, 256>();
+                pto::Stride<4096, 4096, 4096, 256, 1> v55 = pto::Stride<4096, 4096, 4096, 256, 1>();
+                GlobalTensor<
+                    float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>
+                    v56 = GlobalTensor<
+                        float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+                        v1 + (v11 +
+                              (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v27 *
+                                                                                 (uint32_t)v12) +
+                                                             (uint32_t)v34) *
+                                         (uint32_t)v14) *
+                                  (unsigned)v19 +
+                              v11 * (unsigned)v18),
+                        v54, v55
+                    );
+                wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+                TSTORE(v56, v52);
+                set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+                v35 = v1;
+            } else {
+                v35 = v1;
+            };
+        };
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: all_raw_scores__iter_v1
+    __gm__ Tensor *all_raw_scores__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *all_raw_scores__iter_v1 =
+        reinterpret_cast<__gm__ float *>(all_raw_scores__iter_v1_tensor->buffer.addr) +
+        all_raw_scores__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: all_q_padded__rv_v7
+    __gm__ Tensor *all_q_padded__rv_v7_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *all_q_padded__rv_v7 =
+        reinterpret_cast<__gm__ bfloat16_t *>(all_q_padded__rv_v7_tensor->buffer.addr) +
+        all_q_padded__rv_v7_tensor->start_offset;
+
+    // Unpack tensor: block_table__ssa_v0
+    __gm__ Tensor *block_table__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ int32_t *block_table__ssa_v0 = reinterpret_cast<__gm__ int32_t *>(block_table__ssa_v0_tensor->buffer.addr) +
+                                          block_table__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: k_cache__rv_v6
+    __gm__ Tensor *k_cache__rv_v6_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ bfloat16_t *k_cache__rv_v6 =
+        reinterpret_cast<__gm__ bfloat16_t *>(k_cache__rv_v6_tensor->buffer.addr) + k_cache__rv_v6_tensor->start_offset;
+
+    // Unpack scalar: b__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b__idx_v0_conv;
+    b__idx_v0_conv.u64 = args[4];
+    int64_t b__idx_v0 = b__idx_v0_conv.val;
+
+    // Unpack scalar: sb_chunk__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } sb_chunk__idx_v0_conv;
+    sb_chunk__idx_v0_conv.u64 = args[5];
+    int64_t sb_chunk__idx_v0 = sb_chunk__idx_v0_conv.val;
+
+    // Unpack scalar: ctx_blocks__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ctx_blocks__ssa_v0_conv;
+    ctx_blocks__ssa_v0_conv.u64 = args[6];
+    int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val;
+
+    // Unpack scalar: block_table_base__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } block_table_base__ssa_v0_conv;
+    block_table_base__ssa_v0_conv.u64 = args[7];
+    int64_t block_table_base__ssa_v0 = block_table_base__ssa_v0_conv.val;
+
+    // Extract dynamic dim: BLOCK_TABLE_FLAT_DYN
+    int64_t BLOCK_TABLE_FLAT_DYN = static_cast<int64_t>(block_table__ssa_v0_tensor->shapes[0]);
+
+    // Extract dynamic dim: KV_CACHE_ROWS_DYN
+    int64_t KV_CACHE_ROWS_DYN = static_cast<int64_t>(k_cache__rv_v6_tensor->shapes[0]);
+
+    // Forward to ptoas-generated function
+    qk_matmul(
+        all_raw_scores__iter_v1, all_q_padded__rv_v7, block_table__ssa_v0, k_cache__rv_v6, b__idx_v0, sb_chunk__idx_v0,
+        ctx_blocks__ssa_v0, block_table_base__ssa_v0, BLOCK_TABLE_FLAT_DYN, KV_CACHE_ROWS_DYN
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp
new file mode 100644
index 000000000..7186dd07e
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: sv_matmul
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void sv_matmul(
+    __gm__ float *v1, __gm__ int32_t *v2, __gm__ bfloat16_t *v3, __gm__ bfloat16_t *v4, int32_t v5, int32_t v6,
+    int32_t v7, int32_t v8, int32_t v9
+) {
+    unsigned v10 = 0;
+    const int32_t v11 = 2;
+    const int32_t v12 = 16;
+    const int32_t v13 = 64;
+    const int32_t v14 = 8;
+    const int32_t v15 = 0;
+    const int32_t v16 = 1;
+    const int32_t v17 = 128;
+    const int32_t v18 = 256;
+    const int64_t v19 = 4096;
+    const int64_t v20 = 32768;
+    const int64_t v21 = 8192;
+    const int64_t v22 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v23 = (size_t)v16;
+    size_t v24 = (size_t)v15;
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+    for (size_t v25 = v24; v25 < ((size_t)v14); v25 += v23) {
+        int32_t v26 = (int32_t)v25;
+        for (size_t v27 = v24; v27 < ((size_t)v13); v27 += v23) {
+            int32_t v28 = (int32_t)((uint32_t)v5 + (uint32_t)((int32_t)v27));
+            __gm__ float *v29;
+            if (v28 < v6) {
+                int32_t v30 = v2[(int32_t)((uint32_t)v7 + (uint32_t)v28)];
+                Tile<
+                    TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v31 = Tile<
+                        TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v12, v18);
+                uint64_t v32 = (uint64_t)v22;
+                TASSIGN(v31, v32);
+                int32_t v33 =
+                    (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v26 * (uint32_t)v11) + (uint32_t)v28) *
+                              (uint32_t)v12);
+                pto::Shape<1, 1, 1, 16, 256> v34 = pto::Shape<1, 1, 1, 16, 256>();
+                pto::Stride<4096, 4096, 4096, 256, 1> v35 = pto::Stride<4096, 4096, 4096, 256, 1>();
+                GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>
+                    v36 = GlobalTensor<
+                        bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>,
+                        pto::Layout::ND>(v3 + (v10 + (unsigned)v33 * (unsigned)v18 + v10 * (unsigned)v16), v34, v35);
+                wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+                TLOAD(v31, v36);
+                set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+                Tile<
+                    TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v37 = Tile<
+                        TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v18, v17);
+                uint64_t v38 = (uint64_t)v21;
+                TASSIGN(v37, v38);
+                pto::Shape<1, 1, 1, 256, 128> v39 = pto::Shape<1, 1, 1, 256, 128>();
+                pto::Stride<32768, 32768, 32768, 128, 1> v40 = pto::Stride<32768, 32768, 32768, 128, 1>();
+                GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<32768, 32768, 32768, 128, 1>,
+                    pto::Layout::ND>
+                    v41 = GlobalTensor<
+                        bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<32768, 32768, 32768, 128, 1>,
+                        pto::Layout::ND>(
+                        v4 + (v10 +
+                              (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v30 *
+                                                                                 (uint32_t)v14) +
+                                                             (uint32_t)v26) *
+                                         (uint32_t)v18) *
+                                  (unsigned)v17 +
+                              v10 * (unsigned)v16),
+                        v39, v40
+                    );
+                wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+                TLOAD(v37, v41);
+                set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+                Tile<
+                    TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v42 = Tile<
+                        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v12, v17);
+                uint64_t v43 = (uint64_t)v22;
+                TASSIGN(v42, v43);
+                wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+                wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+                TEXTRACT(v42, v31, v15, v15);
+                Tile<
+                    TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v44 = Tile<
+                        TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v17, v17);
+                uint64_t v45 = (uint64_t)v20;
+                TASSIGN(v44, v45);
+                wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+                TEXTRACT(v44, v37, v15, v15);
+                set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+                Tile<
+                    TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v46 = Tile<
+                        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v12, v17);
+                uint64_t v47 = (uint64_t)v19;
+                TASSIGN(v46, v47);
+                wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+                TEXTRACT(v46, v31, v15, v17);
+                set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+                Tile<
+                    TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                    PadValue::Null, CompactMode::Null>
+                    v48 = Tile<
+                        TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                        PadValue::Null, CompactMode::Null>(v17, v17);
+                uint64_t v49 = (uint64_t)v22;
+                TASSIGN(v48, v49);
+                TEXTRACT(v48, v37, v17, v15);
+                set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+                set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+                Tile<
+                    TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>
+                    v50 = Tile<
+                        TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024,
+                        PadValue::Null, CompactMode::Null>(v12, v17);
+                uint64_t v51 = (uint64_t)v22;
+                TASSIGN(v50, v51);
+                wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+                wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+                TMATMUL(v50, v42, v44);
+                set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+                Tile<
+                    TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                    CompactMode::Null>
+                    v52 = Tile<
+                        TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024,
+                        PadValue::Null, CompactMode::Null>(v12, v17);
+                uint64_t v53 = (uint64_t)v22;
+                TASSIGN(v52, v53);
+                pipe_barrier(PIPE_M);
+                wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+                TMATMUL_ACC(v52, v52, v46, v48);
+                set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+                set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+                pto::Shape<1, 1, 1, 16, 128> v54 = pto::Shape<1, 1, 1, 16, 128>();
+                pto::Stride<2048, 2048, 2048, 128, 1> v55 = pto::Stride<2048, 2048, 2048, 128, 1>();
+                GlobalTensor<
+                    float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>
+                    v56 = GlobalTensor<
+                        float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+                        v1 + (v10 + (unsigned)v33 * (unsigned)v17 + v10 * (unsigned)v16), v54, v55
+                    );
+                wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+                TSTORE(v56, v52);
+                set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+                v29 = v1;
+            } else {
+                v29 = v1;
+            };
+        };
+    }
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: all_oi_tmp__iter_v1
+    __gm__ Tensor *all_oi_tmp__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *all_oi_tmp__iter_v1 = reinterpret_cast<__gm__ float *>(all_oi_tmp__iter_v1_tensor->buffer.addr) +
+                                        all_oi_tmp__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: block_table__ssa_v0
+    __gm__ Tensor *block_table__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ int32_t *block_table__ssa_v0 = reinterpret_cast<__gm__ int32_t *>(block_table__ssa_v0_tensor->buffer.addr) +
+                                          block_table__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: all_exp_padded__rv_v2
+    __gm__ Tensor *all_exp_padded__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *all_exp_padded__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(all_exp_padded__rv_v2_tensor->buffer.addr) +
+        all_exp_padded__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: v_cache__rv_v6
+    __gm__ Tensor *v_cache__rv_v6_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ bfloat16_t *v_cache__rv_v6 =
+        reinterpret_cast<__gm__ bfloat16_t *>(v_cache__rv_v6_tensor->buffer.addr) + v_cache__rv_v6_tensor->start_offset;
+
+    // Unpack scalar: sb_chunk__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } sb_chunk__idx_v0_conv;
+    sb_chunk__idx_v0_conv.u64 = args[4];
+    int64_t sb_chunk__idx_v0 = sb_chunk__idx_v0_conv.val;
+
+    // Unpack scalar: ctx_blocks__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ctx_blocks__ssa_v0_conv;
+    ctx_blocks__ssa_v0_conv.u64 = args[5];
+    int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val;
+
+    // Unpack scalar: block_table_base__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } block_table_base__ssa_v0_conv;
+    block_table_base__ssa_v0_conv.u64 = args[6];
+    int64_t block_table_base__ssa_v0 = block_table_base__ssa_v0_conv.val;
+
+    // Extract dynamic dim: BLOCK_TABLE_FLAT_DYN
+    int64_t BLOCK_TABLE_FLAT_DYN = static_cast<int64_t>(block_table__ssa_v0_tensor->shapes[0]);
+
+    // Extract dynamic dim: KV_CACHE_ROWS_DYN
+    int64_t KV_CACHE_ROWS_DYN = static_cast<int64_t>(v_cache__rv_v6_tensor->shapes[0]);
+
+    // Forward to ptoas-generated function
+    sv_matmul(
+        all_oi_tmp__iter_v1, block_table__ssa_v0, all_exp_padded__rv_v2, v_cache__rv_v6, sb_chunk__idx_v0,
+        ctx_blocks__ssa_v0, block_table_base__ssa_v0, BLOCK_TABLE_FLAT_DYN, KV_CACHE_ROWS_DYN
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp
new file mode 100644
index 000000000..04e42ab45
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: up_proj
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void up_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4) {
+    unsigned v5 = 0;
+    const int32_t v6 = 40;
+    const int32_t v7 = 64;
+    const int32_t v8 = 0;
+    const int32_t v9 = 128;
+    const int32_t v10 = 256;
+    const int32_t v11 = 17408;
+    const int32_t v12 = 1;
+    const int32_t v13 = 5120;
+    const int32_t v14 = 16;
+    const int64_t v15 = 32768;
+    const int64_t v16 = 2048;
+    const int64_t v17 = 4096;
+    const int64_t v18 = 0;
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v19 = (size_t)v12;
+    Tile<
+        TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v20 = Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v9);
+    uint64_t v21 = (uint64_t)v18;
+    TASSIGN(v20, v21);
+    pto::Shape<1, 1, 1, 16, 128> v22 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v23 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+        v24 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v1 + (v5 + v5 * (unsigned)v13 + v5 * (unsigned)v12), v22, v23
+        );
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    TLOAD(v20, v24);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    Tile<
+        TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v25 = Tile<
+            TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v10);
+    uint64_t v26 = (uint64_t)v17;
+    TASSIGN(v25, v26);
+    pto::Shape<1, 1, 1, 128, 256> v27 = pto::Shape<1, 1, 1, 128, 256>();
+    pto::Stride<2228224, 2228224, 2228224, 17408, 1> v28 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, pto::Layout::ND>
+        v29 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+            pto::Layout::ND>(v2 + (v5 + v5 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v27, v28);
+    TLOAD(v25, v29);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v30 = Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v7);
+    uint64_t v31 = (uint64_t)v18;
+    TASSIGN(v30, v31);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TEXTRACT(v30, v20, v8, v8);
+    Tile<
+        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v32 = Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v7, v10);
+    uint64_t v33 = (uint64_t)v18;
+    TASSIGN(v32, v33);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TEXTRACT(v32, v25, v8, v8);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v34 = Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v7);
+    uint64_t v35 = (uint64_t)v16;
+    TASSIGN(v34, v35);
+    TEXTRACT(v34, v20, v8, v7);
+    Tile<
+        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v36 = Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v7, v10);
+    uint64_t v37 = (uint64_t)v15;
+    TASSIGN(v36, v37);
+    TEXTRACT(v36, v25, v7, v8);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    Tile<
+        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v38 = Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v14, v10);
+    uint64_t v39 = (uint64_t)v18;
+    TASSIGN(v38, v39);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    TMATMUL(v38, v30, v32);
+    Tile<
+        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v40 = Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v14, v10);
+    uint64_t v41 = (uint64_t)v18;
+    TASSIGN(v40, v41);
+    pipe_barrier(PIPE_M);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    TMATMUL_ACC(v40, v40, v34, v36);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    for (size_t v42 = v19; v42 < ((size_t)v6); v42 += v19) {
+        int32_t v43 = (int32_t)((uint32_t)((int32_t)v42) * (uint32_t)v9);
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v9);
+        uint64_t v45 = (uint64_t)v18;
+        TASSIGN(v44, v45);
+        pto::Shape<1, 1, 1, 16, 128> v46 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v47 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v48 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v5 + v5 * (unsigned)v13 + (unsigned)v43 * (unsigned)v12), v46, v47
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(v44, v48);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        Tile<
+            TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v49 = Tile<
+                TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v9, v10);
+        uint64_t v50 = (uint64_t)v17;
+        TASSIGN(v49, v50);
+        pto::Shape<1, 1, 1, 128, 256> v51 = pto::Shape<1, 1, 1, 128, 256>();
+        pto::Stride<2228224, 2228224, 2228224, 17408, 1> v52 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+            pto::Layout::ND>
+            v53 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+                pto::Layout::ND>(v2 + (v5 + (unsigned)v43 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v51, v52);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v49, v53);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v54 = Tile<
+                TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v7);
+        uint64_t v55 = (uint64_t)v18;
+        TASSIGN(v54, v55);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v54, v44, v8, v8);
+        Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v56 = Tile<
+                TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v57 = (uint64_t)v18;
+        TASSIGN(v56, v57);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        TEXTRACT(v56, v49, v8, v8);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v58 = Tile<
+                TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v7);
+        uint64_t v59 = (uint64_t)v16;
+        TASSIGN(v58, v59);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        TEXTRACT(v58, v44, v8, v7);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v60 = Tile<
+                TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v61 = (uint64_t)v15;
+        TASSIGN(v60, v61);
+        TEXTRACT(v60, v49, v7, v8);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v62 = Tile<
+                TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v14, v10);
+        uint64_t v63 = (uint64_t)v18;
+        TASSIGN(v62, v63);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        pipe_barrier(PIPE_M);
+        TMATMUL_ACC(v62, v62, v54, v56);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v64 = Tile<
+                TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v14, v10);
+        uint64_t v65 = (uint64_t)v18;
+        TASSIGN(v64, v65);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        TMATMUL_ACC(v64, v64, v58, v60);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    }
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 256> v66 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<4096, 4096, 4096, 256, 1> v67 = pto::Stride<4096, 4096, 4096, 256, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v68 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+            v3 + (v5 + v5 * (unsigned)v10 + v5 * (unsigned)v12), v66, v67
+        );
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(v68, v40);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: post_norm_tile__rv_v2
+    __gm__ Tensor *post_norm_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *post_norm_tile__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(post_norm_tile__rv_v2_tensor->buffer.addr) +
+        post_norm_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: w_up__ssa_v0
+    __gm__ Tensor *w_up__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *w_up__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(w_up__ssa_v0_tensor->buffer.addr) + w_up__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: ret0__out
+    __gm__ Tensor *ret0__out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *ret0__out =
+        reinterpret_cast<__gm__ float *>(ret0__out_tensor->buffer.addr) + ret0__out_tensor->start_offset;
+
+    // Unpack scalar: o0__ssa_v1
+    union {
+        uint64_t u64;
+        int64_t val;
+    } o0__ssa_v1_conv;
+    o0__ssa_v1_conv.u64 = args[3];
+    int64_t o0__ssa_v1 = o0__ssa_v1_conv.val;
+
+    // Forward to ptoas-generated function
+    up_proj(post_norm_tile__rv_v2, w_up__ssa_v0, ret0__out, o0__ssa_v1);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp
new file mode 100644
index 000000000..b4d9f0ba5
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: attention_writeback
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void attention_writeback(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2) {
+    unsigned v3 = 0;
+    const int32_t v4 = 128;
+    const int32_t v5 = 2048;
+    const int32_t v6 = 640;
+    const int32_t v7 = 5;
+    const int32_t v8 = 8;
+    const int32_t v9 = 0;
+    const int32_t v10 = 16384;
+    const int32_t v11 = 5120;
+    const int32_t v12 = 1;
+    const int64_t v13 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    for (size_t v14 = (size_t)v9; v14 < ((size_t)v8); v14 += (size_t)v12) {
+        int32_t v15 = (int32_t)v14;
+        Tile<
+            TileType::Vec, bfloat16_t, 1, 640, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v16 = Tile<
+                TileType::Vec, bfloat16_t, 1, 640, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v12, v6);
+        uint64_t v17 = (uint64_t)v13;
+        TASSIGN(v16, v17);
+        pto::Shape<1, 1, 1, 1, 640> v18 = pto::Shape<1, 1, 1, 1, 640>();
+        pto::Stride<16384, 16384, 16384, 16384, 1> v19 = pto::Stride<16384, 16384, 16384, 16384, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND>
+            v20 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND>(
+                v2 + (v3 + v3 * (unsigned)v10 + (unsigned)((int32_t)(uint32_t)v15 * (uint32_t)v5) * (unsigned)v12), v18,
+                v19
+            );
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v16, v20);
+        set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 1, 640> v21 = pto::Shape<1, 1, 1, 1, 640>();
+        pto::Stride<5120, 5120, 5120, 5120, 1> v22 = pto::Stride<5120, 5120, 5120, 5120, 1>();
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>
+            v23 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(
+                v1 + (v3 + v3 * (unsigned)v11 +
+                      (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)v15 * (uint32_t)v7) * (uint32_t)v4) *
+                          (unsigned)v12),
+                v21, v22
+            );
+        wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v23, v16);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: attn_row__ssa_v0
+    __gm__ Tensor *attn_row__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *attn_row__ssa_v0 = reinterpret_cast<__gm__ bfloat16_t *>(attn_row__ssa_v0_tensor->buffer.addr) +
+                                          attn_row__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: attn_row_padded__rv_v2
+    __gm__ Tensor *attn_row_padded__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *attn_row_padded__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(attn_row_padded__rv_v2_tensor->buffer.addr) +
+        attn_row_padded__rv_v2_tensor->start_offset;
+
+    // Forward to ptoas-generated function
+    attention_writeback(attn_row__ssa_v0, attn_row_padded__rv_v2);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp
new file mode 100644
index 000000000..59fa858fe
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: copy_hidden
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void copy_hidden(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, int32_t v3, int32_t v4, int32_t v5) {
+    unsigned v6 = 5120;
+    unsigned v7 = 0;
+    const int32_t v8 = 128;
+    const int32_t v9 = 40;
+    const int32_t v10 = 0;
+    const int32_t v11 = 1;
+    const int32_t v12 = 5120;
+    const int64_t v13 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    for (size_t v14 = (size_t)v10; v14 < ((size_t)v9); v14 += (size_t)v11) {
+        int32_t v15 = (int32_t)((uint32_t)((int32_t)v14) * (uint32_t)v8);
+        Tile<
+            TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v16 = Tile<
+                TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v4, v8);
+        uint64_t v17 = (uint64_t)v13;
+        TASSIGN(v16, v17);
+        unsigned v18 = (unsigned)v4 * v6;
+        pto::Shape<1, 1, 1, -1, 128> v19 = pto::Shape<1, 1, 1, -1, 128>(v4);
+        pto::Stride<-1, -1, -1, 5120, 1> v20 = pto::Stride<-1, -1, -1, 5120, 1>(v18, v18, v18);
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v21 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+                v2 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v19, v20
+            );
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v16, v21);
+        set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0);
+        unsigned v22 = (unsigned)v4 * v6;
+        pto::Shape<1, 1, 1, -1, 128> v23 = pto::Shape<1, 1, 1, -1, 128>(v4);
+        pto::Stride<-1, -1, -1, 5120, 1> v24 = pto::Stride<-1, -1, -1, 5120, 1>(v22, v22, v22);
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v25 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+                v1 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v23, v24
+            );
+        wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v25, v16);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: current_hidden__iter_v1
+    __gm__ Tensor *current_hidden__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *current_hidden__iter_v1 =
+        reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__iter_v1_tensor->buffer.addr) +
+        current_hidden__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: hidden_states__ssa_v0
+    __gm__ Tensor *hidden_states__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *hidden_states__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(hidden_states__ssa_v0_tensor->buffer.addr) +
+        hidden_states__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[2];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Unpack scalar: cur_valid__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } cur_valid__ssa_v0_conv;
+    cur_valid__ssa_v0_conv.u64 = args[3];
+    int64_t cur_valid__ssa_v0 = cur_valid__ssa_v0_conv.val;
+
+    // Extract dynamic dim: USER_BATCH_DYN
+    int64_t USER_BATCH_DYN = static_cast<int64_t>(hidden_states__ssa_v0_tensor->shapes[0]);
+
+    // Forward to ptoas-generated function
+    copy_hidden(current_hidden__iter_v1, hidden_states__ssa_v0, b0__idx_v0, cur_valid__ssa_v0, USER_BATCH_DYN);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp
new file mode 100644
index 000000000..4299251a2
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: copy_out
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void copy_out(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, int32_t v3, int32_t v4, int32_t v5) {
+    unsigned v6 = 5120;
+    unsigned v7 = 0;
+    const int32_t v8 = 128;
+    const int32_t v9 = 40;
+    const int32_t v10 = 0;
+    const int32_t v11 = 1;
+    const int32_t v12 = 5120;
+    const int64_t v13 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    for (size_t v14 = (size_t)v10; v14 < ((size_t)v9); v14 += (size_t)v11) {
+        int32_t v15 = (int32_t)((uint32_t)((int32_t)v14) * (uint32_t)v8);
+        Tile<
+            TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v16 = Tile<
+                TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v4, v8);
+        uint64_t v17 = (uint64_t)v13;
+        TASSIGN(v16, v17);
+        unsigned v18 = (unsigned)v4 * v6;
+        pto::Shape<1, 1, 1, -1, 128> v19 = pto::Shape<1, 1, 1, -1, 128>(v4);
+        pto::Stride<-1, -1, -1, 5120, 1> v20 = pto::Stride<-1, -1, -1, 5120, 1>(v18, v18, v18);
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v21 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+                v2 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v19, v20
+            );
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v16, v21);
+        set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0);
+        unsigned v22 = (unsigned)v4 * v6;
+        pto::Shape<1, 1, 1, -1, 128> v23 = pto::Shape<1, 1, 1, -1, 128>(v4);
+        pto::Stride<-1, -1, -1, 5120, 1> v24 = pto::Stride<-1, -1, -1, 5120, 1>(v22, v22, v22);
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v25 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 128>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+                v1 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v23, v24
+            );
+        wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v25, v16);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: out__iter_v1
+    __gm__ Tensor *out__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *out__iter_v1 =
+        reinterpret_cast<__gm__ bfloat16_t *>(out__iter_v1_tensor->buffer.addr) + out__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: current_hidden__ssa_v8
+    __gm__ Tensor *current_hidden__ssa_v8_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *current_hidden__ssa_v8 =
+        reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__ssa_v8_tensor->buffer.addr) +
+        current_hidden__ssa_v8_tensor->start_offset;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[2];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Unpack scalar: cur_valid__ssa_v3
+    union {
+        uint64_t u64;
+        int64_t val;
+    } cur_valid__ssa_v3_conv;
+    cur_valid__ssa_v3_conv.u64 = args[3];
+    int64_t cur_valid__ssa_v3 = cur_valid__ssa_v3_conv.val;
+
+    // Extract dynamic dim: USER_BATCH_DYN
+    int64_t USER_BATCH_DYN = static_cast<int64_t>(out__iter_v1_tensor->shapes[0]);
+
+    // Forward to ptoas-generated function
+    copy_out(out__iter_v1, current_hidden__ssa_v8, b0__idx_v0, cur_valid__ssa_v3, USER_BATCH_DYN);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp
new file mode 100644
index 000000000..451288cd7
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: down_proj_residual
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void
+down_proj_residual(__gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, int32_t v4, int32_t v5) {
+    RoundMode v6 = RoundMode::CAST_ROUND;
+    unsigned v7 = 0;
+    const int32_t v8 = 5120;
+    const int32_t v9 = 1;
+    const int32_t v10 = 128;
+    const int32_t v11 = 16;
+    const int64_t v12 = 16384;
+    const int64_t v13 = 8192;
+    const int64_t v14 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    Tile<
+        TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v15 = Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v16 = (uint64_t)v14;
+    TASSIGN(v15, v16);
+    pto::Shape<1, 1, 1, 16, 128> v17 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<2048, 2048, 2048, 128, 1> v18 = pto::Stride<2048, 2048, 2048, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> v19 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+            v1 + (v7 + v7 * (unsigned)v10 + v7 * (unsigned)v9), v17, v18
+        );
+    TLOAD(v15, v19);
+    Tile<
+        TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v20 = Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v21 = (uint64_t)v13;
+    TASSIGN(v20, v21);
+    pto::Shape<1, 1, 1, 16, 128> v22 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v23 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> v24 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v2 + (v7 + v7 * (unsigned)v8 + (unsigned)v4 * (unsigned)v9), v22, v23
+        );
+    TLOAD(v20, v24);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    Tile<
+        TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v25 = Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v26 = (uint64_t)v14;
+    TASSIGN(v25, v26);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(v25, v15, v20);
+    Tile<
+        TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v27 = Tile<
+            TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v28 = (uint64_t)v12;
+    TASSIGN(v27, v28);
+    pipe_barrier(PIPE_V);
+    TCVT(v27, v25, v6);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 128> v29 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v30 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+        v31 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v3 + (v7 + (unsigned)v5 * (unsigned)v8 + (unsigned)v4 * (unsigned)v9), v29, v30
+        );
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(v31, v27);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: fp32_chunk_gm__ssa_v1
+    __gm__ Tensor *fp32_chunk_gm__ssa_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *fp32_chunk_gm__ssa_v1 = reinterpret_cast<__gm__ float *>(fp32_chunk_gm__ssa_v1_tensor->buffer.addr) +
+                                          fp32_chunk_gm__ssa_v1_tensor->start_offset;
+
+    // Unpack tensor: resid1_tile__rv_v2
+    __gm__ Tensor *resid1_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *resid1_tile__rv_v2 = reinterpret_cast<__gm__ float *>(resid1_tile__rv_v2_tensor->buffer.addr) +
+                                       resid1_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: next_hidden__iter_v3
+    __gm__ Tensor *next_hidden__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *next_hidden__iter_v3 =
+        reinterpret_cast<__gm__ bfloat16_t *>(next_hidden__iter_v3_tensor->buffer.addr) +
+        next_hidden__iter_v3_tensor->start_offset;
+
+    // Unpack scalar: d0__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } d0__ssa_v0_conv;
+    d0__ssa_v0_conv.u64 = args[3];
+    int64_t d0__ssa_v0 = d0__ssa_v0_conv.val;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[4];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    down_proj_residual(fp32_chunk_gm__ssa_v1, resid1_tile__rv_v2, next_hidden__iter_v3, d0__ssa_v0, b0__idx_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp
new file mode 100644
index 000000000..5af6c32c7
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: online_softmax
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void
+online_softmax(__gm__ bfloat16_t *v1, __gm__ float *v2, __gm__ float *v3, __gm__ float *v4, int32_t v5) {
+    RoundMode v6 = RoundMode::CAST_ROUND;
+    unsigned v7 = 0;
+    const int32_t v8 = 2048;
+    const int32_t v9 = 16;
+    const int32_t v10 = 32;
+    const int32_t v11 = 8;
+    const int32_t v12 = 0;
+    const int32_t v13 = 128;
+    const int32_t v14 = 256;
+    const int32_t v15 = 16384;
+    const int32_t v16 = 1;
+    const int64_t v17 = 8512;
+    const int64_t v18 = 320;
+    const int64_t v19 = 256;
+    const int64_t v20 = 192;
+    const int64_t v21 = 128;
+    const int64_t v22 = 64;
+    const int64_t v23 = 0;
+    const int64_t v24 = 29184;
+    const int64_t v25 = 29120;
+    const int64_t v26 = 20928;
+    const int64_t v27 = 20864;
+    const int64_t v28 = 20800;
+    const int64_t v29 = 12608;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    size_t v30 = (size_t)v16;
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    for (size_t v31 = (size_t)v12; v31 < ((size_t)v11); v31 += v30) {
+        int32_t v32 = (int32_t)v31;
+        int32_t v33 = (int32_t)((uint32_t)v32 * (uint32_t)v10);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v34 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v9, v13);
+        uint64_t v35 = (uint64_t)v29;
+        TASSIGN(v34, v35);
+        pto::Shape<1, 1, 1, 16, 128> v36 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<2048, 2048, 2048, 128, 1> v37 = pto::Stride<2048, 2048, 2048, 128, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> v38 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+                v2 + (v7 + (unsigned)v33 * (unsigned)v13 + v7 * (unsigned)v16), v36, v37
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v34, v38);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v39 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v9, v16);
+        uint64_t v40 = (uint64_t)v28;
+        TASSIGN(v39, v40);
+        pto::Shape<1, 1, 1, 16, 1> v41 = pto::Shape<1, 1, 1, 16, 1>();
+        pto::Stride<16, 16, 16, 1, 256> v42 = pto::Stride<16, 16, 16, 1, 256>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v43 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>(
+                v3 + (v7 + (unsigned)v33 * (unsigned)v16 + v7 * (unsigned)v14), v41, v42
+            );
+        TLOAD(v39, v43);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v9, v16);
+        uint64_t v45 = (uint64_t)v27;
+        TASSIGN(v44, v45);
+        pto::Shape<1, 1, 1, 16, 1> v46 = pto::Shape<1, 1, 1, 16, 1>();
+        pto::Stride<16, 16, 16, 1, 256> v47 = pto::Stride<16, 16, 16, 1, 256>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v48 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>(
+                v4 + (v7 + (unsigned)v33 * (unsigned)v16 + v7 * (unsigned)v14), v46, v47
+            );
+        TLOAD(v44, v48);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        for (size_t v49 = v30; v49 < ((size_t)v5); v49 += v30) {
+            int32_t v50 = (int32_t)((uint32_t)v33 + (uint32_t)((int32_t)(uint32_t)((int32_t)v49) * (uint32_t)v9));
+            Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v51 = Tile<
+                    TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v13);
+            uint64_t v52 = (uint64_t)v26;
+            TASSIGN(v51, v52);
+            pto::Shape<1, 1, 1, 16, 128> v53 = pto::Shape<1, 1, 1, 16, 128>();
+            pto::Stride<2048, 2048, 2048, 128, 1> v54 = pto::Stride<2048, 2048, 2048, 128, 1>();
+            GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>
+                v55 = GlobalTensor<
+                    float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+                    v2 + (v7 + (unsigned)v50 * (unsigned)v13 + v7 * (unsigned)v16), v53, v54
+                );
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+            TLOAD(v51, v55);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v56 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v57 = (uint64_t)v25;
+            TASSIGN(v56, v57);
+            pto::Shape<1, 1, 1, 16, 1> v58 = pto::Shape<1, 1, 1, 16, 1>();
+            pto::Stride<16, 16, 16, 1, 256> v59 = pto::Stride<16, 16, 16, 1, 256>();
+            GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v60 =
+                GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>(
+                    v3 + (v7 + (unsigned)v50 * (unsigned)v16 + v7 * (unsigned)v14), v58, v59
+                );
+            TLOAD(v56, v60);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v61 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v62 = (uint64_t)v24;
+            TASSIGN(v61, v62);
+            pto::Shape<1, 1, 1, 16, 1> v63 = pto::Shape<1, 1, 1, 16, 1>();
+            pto::Stride<16, 16, 16, 1, 256> v64 = pto::Stride<16, 16, 16, 1, 256>();
+            GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v65 =
+                GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>(
+                    v4 + (v7 + (unsigned)v50 * (unsigned)v16 + v7 * (unsigned)v14), v63, v64
+                );
+            TLOAD(v61, v65);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v66 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v67 = (uint64_t)v28;
+            TASSIGN(v66, v67);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v68 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v69 = (uint64_t)v25;
+            TASSIGN(v68, v69);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v70 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v71 = (uint64_t)v23;
+            TASSIGN(v70, v71);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            pipe_barrier(PIPE_V);
+            TMAX(v70, v66, v68);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v72 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v73 = (uint64_t)v28;
+            TASSIGN(v72, v73);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v74 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v75 = (uint64_t)v23;
+            TASSIGN(v74, v75);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v76 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v77 = (uint64_t)v22;
+            TASSIGN(v76, v77);
+            pipe_barrier(PIPE_V);
+            TSUB(v76, v72, v74);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v78 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v79 = (uint64_t)v22;
+            TASSIGN(v78, v79);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v80 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v81 = (uint64_t)v22;
+            TASSIGN(v80, v81);
+            pipe_barrier(PIPE_V);
+            TEXP(v80, v78);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v82 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v83 = (uint64_t)v22;
+            TASSIGN(v82, v83);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v84 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v85 = (uint64_t)v25;
+            TASSIGN(v84, v85);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v86 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v87 = (uint64_t)v23;
+            TASSIGN(v86, v87);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v88 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v89 = (uint64_t)v21;
+            TASSIGN(v88, v89);
+            TSUB(v88, v84, v86);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v90 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v91 = (uint64_t)v21;
+            TASSIGN(v90, v91);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v92 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v93 = (uint64_t)v21;
+            TASSIGN(v92, v93);
+            pipe_barrier(PIPE_V);
+            TEXP(v92, v90);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v94 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v95 = (uint64_t)v21;
+            TASSIGN(v94, v95);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v96 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v97 = (uint64_t)v22;
+            TASSIGN(v96, v97);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v98 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v99 = (uint64_t)v27;
+            TASSIGN(v98, v99);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v100 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v101 = (uint64_t)v20;
+            TASSIGN(v100, v101);
+            TMUL(v100, v96, v98);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v102 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v103 = (uint64_t)v21;
+            TASSIGN(v102, v103);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v104 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v105 = (uint64_t)v24;
+            TASSIGN(v104, v105);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v106 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v107 = (uint64_t)v19;
+            TASSIGN(v106, v107);
+            pipe_barrier(PIPE_V);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+            TMUL(v106, v102, v104);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v108 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v109 = (uint64_t)v20;
+            TASSIGN(v108, v109);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v110 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v111 = (uint64_t)v19;
+            TASSIGN(v110, v111);
+            Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v112 = Tile<
+                    TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v16, v9);
+            uint64_t v113 = (uint64_t)v20;
+            TASSIGN(v112, v113);
+            pipe_barrier(PIPE_V);
+            TADD(v112, v108, v110);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v114 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v115 = (uint64_t)v20;
+            TASSIGN(v114, v115);
+            Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v116 = Tile<
+                    TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v13);
+            uint64_t v117 = (uint64_t)v18;
+            TASSIGN(v116, v117);
+            TROWEXPANDMUL(v116, v34, v82);
+            Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v118 = Tile<
+                    TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v13);
+            uint64_t v119 = (uint64_t)v26;
+            TASSIGN(v118, v119);
+            TROWEXPANDMUL(v118, v51, v94);
+            Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v120 = Tile<
+                    TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v13);
+            uint64_t v121 = (uint64_t)v29;
+            TASSIGN(v120, v121);
+            pipe_barrier(PIPE_V);
+            TADD(v120, v116, v118);
+            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v122 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v123 = (uint64_t)v23;
+            TASSIGN(v122, v123);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v124 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v125 = (uint64_t)v27;
+            TASSIGN(v124, v125);
+            TMOV(v124, v114);
+            Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v126 = Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v9, v16);
+            uint64_t v127 = (uint64_t)v28;
+            TASSIGN(v126, v127);
+            TMOV(v126, v122);
+        };
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v128 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v9, v13);
+        uint64_t v129 = (uint64_t)v29;
+        TASSIGN(v128, v129);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        TROWEXPANDDIV(v128, v34, v44);
+        Tile<
+            TileType::Vec, float, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v130 = Tile<
+                TileType::Vec, float, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v8);
+        uint64_t v131 = (uint64_t)v29;
+        TASSIGN(v130, v131);
+        Tile<
+            TileType::Vec, bfloat16_t, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v132 = Tile<
+                TileType::Vec, bfloat16_t, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v8);
+        uint64_t v133 = (uint64_t)v17;
+        TASSIGN(v132, v133);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        TCVT(v132, v130, v6);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 1, 2048> v134 = pto::Shape<1, 1, 1, 1, 2048>();
+        pto::Stride<16384, 16384, 16384, 16384, 1> v135 = pto::Stride<16384, 16384, 16384, 16384, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 1, 2048>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND>
+            v136 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 1, 2048>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND>(
+                v1 + (v7 + v7 * (unsigned)v15 + (unsigned)((int32_t)(uint32_t)v32 * (uint32_t)v8) * (unsigned)v16),
+                v134, v135
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v136, v132);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    }
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: attn_row_padded__ssa_v0
+    __gm__ Tensor *attn_row_padded__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *attn_row_padded__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(attn_row_padded__ssa_v0_tensor->buffer.addr) +
+        attn_row_padded__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: all_oi_tmp__rv_v2
+    __gm__ Tensor *all_oi_tmp__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *all_oi_tmp__rv_v2 = reinterpret_cast<__gm__ float *>(all_oi_tmp__rv_v2_tensor->buffer.addr) +
+                                      all_oi_tmp__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: all_cur_mi__rv_v2
+    __gm__ Tensor *all_cur_mi__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *all_cur_mi__rv_v2 = reinterpret_cast<__gm__ float *>(all_cur_mi__rv_v2_tensor->buffer.addr) +
+                                      all_cur_mi__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: all_cur_li__rv_v2
+    __gm__ Tensor *all_cur_li__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ float *all_cur_li__rv_v2 = reinterpret_cast<__gm__ float *>(all_cur_li__rv_v2_tensor->buffer.addr) +
+                                      all_cur_li__rv_v2_tensor->start_offset;
+
+    // Unpack scalar: ctx_blocks__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ctx_blocks__ssa_v0_conv;
+    ctx_blocks__ssa_v0_conv.u64 = args[4];
+    int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    online_softmax(
+        attn_row_padded__ssa_v0, all_oi_tmp__rv_v2, all_cur_mi__rv_v2, all_cur_li__rv_v2, ctx_blocks__ssa_v0
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp
new file mode 100644
index 000000000..e34aeaa10
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: out_proj_residual
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void
+out_proj_residual(__gm__ bfloat16_t *v1, __gm__ float *v2, __gm__ float *v3, int32_t v4, int32_t v5, int32_t v6) {
+    RoundMode v7 = RoundMode::CAST_ROUND;
+    unsigned v8 = 5120;
+    unsigned v9 = 0;
+    const int32_t v10 = 64;
+    const int32_t v11 = 1;
+    const int32_t v12 = 5120;
+    const int32_t v13 = 16;
+    const int64_t v14 = 6144;
+    const int64_t v15 = 4096;
+    const int64_t v16 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    Tile<
+        TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v17 = Tile<
+            TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v13, v10);
+    uint64_t v18 = (uint64_t)v16;
+    TASSIGN(v17, v18);
+    pto::Shape<1, 1, 1, 16, 64> v19 = pto::Shape<1, 1, 1, 16, 64>();
+    pto::Stride<1024, 1024, 1024, 64, 1> v20 = pto::Stride<1024, 1024, 1024, 64, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND> v21 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND>(
+            v2 + (v9 + v9 * (unsigned)v10 + v9 * (unsigned)v11), v19, v20
+        );
+    TLOAD(v17, v21);
+    Tile<
+        TileType::Vec, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v22 = Tile<
+            TileType::Vec, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v6, v10);
+    uint64_t v23 = (uint64_t)v15;
+    TASSIGN(v22, v23);
+    unsigned v24 = (unsigned)v6 * v8;
+    pto::Shape<1, 1, 1, -1, 64> v25 = pto::Shape<1, 1, 1, -1, 64>(v6);
+    pto::Stride<-1, -1, -1, 5120, 1> v26 = pto::Stride<-1, -1, -1, 5120, 1>(v24, v24, v24);
+    GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 64>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v27 =
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 64>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+            v1 + (v9 + (unsigned)v4 * (unsigned)v12 + (unsigned)v5 * (unsigned)v11), v25, v26
+        );
+    TLOAD(v22, v27);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    Tile<
+        TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v28 = Tile<
+            TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v13, v10);
+    uint64_t v29 = (uint64_t)v14;
+    TASSIGN(v28, v29);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TCVT(v28, v22, v7);
+    Tile<
+        TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v30 = Tile<
+            TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v13, v10);
+    uint64_t v31 = (uint64_t)v16;
+    TASSIGN(v30, v31);
+    pipe_barrier(PIPE_V);
+    TADD(v30, v17, v28);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 64> v32 = pto::Shape<1, 1, 1, 16, 64>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v33 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> v34 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v3 + (v9 + v9 * (unsigned)v12 + (unsigned)v5 * (unsigned)v11), v32, v33
+        );
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(v34, v30);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: current_hidden__rv_v2
+    __gm__ Tensor *current_hidden__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *current_hidden__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__rv_v2_tensor->buffer.addr) +
+        current_hidden__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: o_acc__rv_v2
+    __gm__ Tensor *o_acc__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *o_acc__rv_v2 =
+        reinterpret_cast<__gm__ float *>(o_acc__rv_v2_tensor->buffer.addr) + o_acc__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: resid1_tile__iter_v1
+    __gm__ Tensor *resid1_tile__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *resid1_tile__iter_v1 = reinterpret_cast<__gm__ float *>(resid1_tile__iter_v1_tensor->buffer.addr) +
+                                         resid1_tile__iter_v1_tensor->start_offset;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[3];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Unpack scalar: o0__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } o0__ssa_v0_conv;
+    o0__ssa_v0_conv.u64 = args[4];
+    int64_t o0__ssa_v0 = o0__ssa_v0_conv.val;
+
+    // Unpack scalar: cur_valid__ssa_v2
+    union {
+        uint64_t u64;
+        int64_t val;
+    } cur_valid__ssa_v2_conv;
+    cur_valid__ssa_v2_conv.u64 = args[5];
+    int64_t cur_valid__ssa_v2 = cur_valid__ssa_v2_conv.val;
+
+    // Forward to ptoas-generated function
+    out_proj_residual(
+        current_hidden__rv_v2, o_acc__rv_v2, resid1_tile__iter_v1, b0__idx_v0, o0__ssa_v0, cur_valid__ssa_v2
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp
new file mode 100644
index 000000000..e4f07e151
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: post_rmsnorm
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void post_rmsnorm(__gm__ float *v1, __gm__ bfloat16_t *v2, __gm__ float *v3) {
+    RoundMode v4 = RoundMode::CAST_ROUND;
+    unsigned v5 = 0;
+    const float v6 = 9.99999997E-7f;
+    const float v7 = 1.95312503E-4f;
+    const int32_t v8 = 128;
+    const int32_t v9 = 40;
+    const int32_t v10 = 0;
+    const float v11 = 0.0f;
+    const int32_t v12 = 1;
+    const int32_t v13 = 5120;
+    const int32_t v14 = 16;
+    const int64_t v15 = 576;
+    const int64_t v16 = 64;
+    const int64_t v17 = 0;
+    const int64_t v18 = 21120;
+    const int64_t v19 = 12928;
+    const int64_t v20 = 4736;
+    const int64_t v21 = 4672;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    size_t v22 = (size_t)v12;
+    size_t v23 = (size_t)v10;
+    size_t v24 = (size_t)v9;
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v25 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v12, v14);
+    uint64_t v26 = (uint64_t)v21;
+    TASSIGN(v25, v26);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    TEXPANDS(v25, v11);
+    for (size_t v27 = v23; v27 < v24; v27 += v22) {
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v28 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v29 = (uint64_t)v20;
+        TASSIGN(v28, v29);
+        pto::Shape<1, 1, 1, 16, 128> v30 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v31 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v32 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v5 + v5 * (unsigned)v13 +
+                      (unsigned)((int32_t)(uint32_t)((int32_t)v27) * (uint32_t)v8) * (unsigned)v12),
+                v30, v31
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v28, v32);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v33 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v34 = (uint64_t)v20;
+        TASSIGN(v33, v34);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        TMUL(v33, v28, v28);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v35 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v36 = (uint64_t)v19;
+        TASSIGN(v35, v36);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v37 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v12);
+        uint64_t v38 = (uint64_t)v18;
+        TASSIGN(v37, v38);
+        pipe_barrier(PIPE_V);
+        TROWSUM(v37, v33, v35);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v39 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v12, v14);
+        uint64_t v40 = (uint64_t)v18;
+        TASSIGN(v39, v40);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v41 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v12, v14);
+        uint64_t v42 = (uint64_t)v21;
+        TASSIGN(v41, v42);
+        pipe_barrier(PIPE_V);
+        TADD(v41, v25, v39);
+    }
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v43 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v12, v14);
+    uint64_t v44 = (uint64_t)v21;
+    TASSIGN(v43, v44);
+    pipe_barrier(PIPE_V);
+    TMULS(v43, v25, v7);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v45 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v12, v14);
+    uint64_t v46 = (uint64_t)v21;
+    TASSIGN(v45, v46);
+    pipe_barrier(PIPE_V);
+    TADDS(v45, v43, v6);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v47 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v12, v14);
+    uint64_t v48 = (uint64_t)v21;
+    TASSIGN(v47, v48);
+    pipe_barrier(PIPE_V);
+    TSQRT(v47, v45);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v49 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v12, v14);
+    uint64_t v50 = (uint64_t)v17;
+    TASSIGN(v49, v50);
+    pipe_barrier(PIPE_V);
+    TRECIP(v49, v47);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+    for (size_t v51 = v23; v51 < v24; v51 += v22) {
+        int32_t v52 = (int32_t)((uint32_t)((int32_t)v51) * (uint32_t)v8);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v53 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v54 = (uint64_t)v20;
+        TASSIGN(v53, v54);
+        pto::Shape<1, 1, 1, 16, 128> v55 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v56 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v57 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v5 + v5 * (unsigned)v13 + (unsigned)v52 * (unsigned)v12), v55, v56
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v53, v57);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        Tile<
+            TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v58 = Tile<
+                TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v12, v8);
+        uint64_t v59 = (uint64_t)v16;
+        TASSIGN(v58, v59);
+        pto::Shape<1, 1, 1, 1, 128> v60 = pto::Shape<1, 1, 1, 1, 128>();
+        pto::Stride<5120, 5120, 5120, 5120, 1> v61 = pto::Stride<5120, 5120, 5120, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v62 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(
+                v3 + (v5 + v5 * (unsigned)v13 + (unsigned)v52 * (unsigned)v12), v60, v61
+            );
+        TLOAD(v58, v62);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v63 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v12);
+        uint64_t v64 = (uint64_t)v17;
+        TASSIGN(v63, v64);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v65 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v66 = (uint64_t)v20;
+        TASSIGN(v65, v66);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        pipe_barrier(PIPE_V);
+        TROWEXPANDMUL(v65, v53, v63);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v67 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v68 = (uint64_t)v20;
+        TASSIGN(v67, v68);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        TCOLEXPANDMUL(v67, v65, v58);
+        Tile<
+            TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v69 = Tile<
+                TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v14, v8);
+        uint64_t v70 = (uint64_t)v15;
+        TASSIGN(v69, v70);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        TCVT(v69, v67, v4);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 16, 128> v71 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v72 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v73 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v2 + (v5 + v5 * (unsigned)v13 + (unsigned)v52 * (unsigned)v12), v71, v72
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v73, v69);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    }
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: resid1_tile__rv_v2
+    __gm__ Tensor *resid1_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *resid1_tile__rv_v2 = reinterpret_cast<__gm__ float *>(resid1_tile__rv_v2_tensor->buffer.addr) +
+                                       resid1_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: post_norm_tile__ssa_v0
+    __gm__ Tensor *post_norm_tile__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *post_norm_tile__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(post_norm_tile__ssa_v0_tensor->buffer.addr) +
+        post_norm_tile__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: post_rms_weight__ssa_v0
+    __gm__ Tensor *post_rms_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *post_rms_weight__ssa_v0 =
+        reinterpret_cast<__gm__ float *>(post_rms_weight__ssa_v0_tensor->buffer.addr) +
+        post_rms_weight__ssa_v0_tensor->start_offset;
+
+    // Forward to ptoas-generated function
+    post_rmsnorm(resid1_tile__rv_v2, post_norm_tile__ssa_v0, post_rms_weight__ssa_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp
new file mode 100644
index 000000000..5bbc66c08
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: q_pad
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void q_pad(__gm__ bfloat16_t *v1) {
+    unsigned v2 = 0;
+    RoundMode v3 = RoundMode::CAST_ROUND;
+    const int32_t v4 = 5;
+    const int32_t v5 = 16;
+    const float v6 = 0.0f;
+    const int32_t v7 = 11;
+    const int32_t v8 = 0;
+    const int32_t v9 = 1;
+    const int32_t v10 = 128;
+    const int64_t v11 = 5632;
+    const int64_t v12 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    for (size_t v13 = (size_t)v8; v13 < ((size_t)v10); v13 += (size_t)v9) {
+        Tile<
+            TileType::Vec, float, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v14 = Tile<
+                TileType::Vec, float, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v15 = (uint64_t)v12;
+        TASSIGN(v14, v15);
+        pipe_barrier(PIPE_V);
+        TEXPANDS(v14, v6);
+        Tile<
+            TileType::Vec, bfloat16_t, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v16 = Tile<
+                TileType::Vec, bfloat16_t, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v17 = (uint64_t)v11;
+        TASSIGN(v16, v17);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        TCVT(v16, v14, v3);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 11, 128> v18 = pto::Shape<1, 1, 1, 11, 128>();
+        pto::Stride<1408, 1408, 1408, 128, 1> v19 = pto::Stride<1408, 1408, 1408, 128, 1>();
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 11, 128>, pto::Stride<1408, 1408, 1408, 128, 1>, pto::Layout::ND>
+            v20 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 11, 128>, pto::Stride<1408, 1408, 1408, 128, 1>, pto::Layout::ND>(
+                v1 + (v2 +
+                      (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)v13) * (uint32_t)v5) + (uint32_t)v4) *
+                          (unsigned)v10 +
+                      v2 * (unsigned)v9),
+                v18, v19
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v20, v16);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    }
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: all_q_padded__ssa_v0
+    __gm__ Tensor *all_q_padded__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *all_q_padded__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(all_q_padded__ssa_v0_tensor->buffer.addr) +
+        all_q_padded__ssa_v0_tensor->start_offset;
+
+    // Forward to ptoas-generated function
+    q_pad(all_q_padded__ssa_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp
new file mode 100644
index 000000000..f375142f7
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: qk_norm
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void qk_norm(
+    __gm__ float *v1, __gm__ float *v2, __gm__ float *v3, __gm__ float *v4, __gm__ float *v5, __gm__ float *v6,
+    int32_t v7
+) {
+    unsigned v8 = 0;
+    const int32_t v9 = 8;
+    const float v10 = 9.99999997E-7f;
+    const float v11 = 0.0078125f;
+    const int32_t v12 = 40;
+    const int32_t v13 = 0;
+    const int32_t v14 = 1024;
+    const int32_t v15 = 128;
+    const int32_t v16 = 1;
+    const int32_t v17 = 5120;
+    const int32_t v18 = 16;
+    const int64_t v19 = 64;
+    const int64_t v20 = 0;
+    const int64_t v21 = 25152;
+    const int64_t v22 = 16960;
+    const int64_t v23 = 8768;
+    const int64_t v24 = 576;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    size_t v25 = (size_t)v16;
+    size_t v26 = (size_t)v13;
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
+    for (size_t v27 = v26; v27 < ((size_t)v12); v27 += v25) {
+        int32_t v28 = (int32_t)((uint32_t)((int32_t)v27) * (uint32_t)v15);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v29 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v30 = (uint64_t)v24;
+        TASSIGN(v29, v30);
+        pto::Shape<1, 1, 1, 16, 128> v31 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v32 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v33 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v2 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v28 * (unsigned)v16), v31, v32
+            );
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v29, v33);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v34 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v35 = (uint64_t)v23;
+        TASSIGN(v34, v35);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        TMUL(v34, v29, v29);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v36 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v37 = (uint64_t)v22;
+        TASSIGN(v36, v37);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v38 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v16);
+        uint64_t v39 = (uint64_t)v21;
+        TASSIGN(v38, v39);
+        pipe_barrier(PIPE_V);
+        TROWSUM(v38, v34, v36);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v40 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v41 = (uint64_t)v21;
+        TASSIGN(v40, v41);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v42 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v43 = (uint64_t)v20;
+        TASSIGN(v42, v43);
+        pipe_barrier(PIPE_V);
+        TMULS(v42, v40, v11);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v45 = (uint64_t)v20;
+        TASSIGN(v44, v45);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v46 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v47 = (uint64_t)v20;
+        TASSIGN(v46, v47);
+        pipe_barrier(PIPE_V);
+        TADDS(v46, v44, v10);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v48 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v49 = (uint64_t)v20;
+        TASSIGN(v48, v49);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v50 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v51 = (uint64_t)v20;
+        TASSIGN(v50, v51);
+        pipe_barrier(PIPE_V);
+        TRSQRT(v50, v48);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v52 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v16);
+        uint64_t v53 = (uint64_t)v20;
+        TASSIGN(v52, v53);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v54 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v55 = (uint64_t)v24;
+        TASSIGN(v54, v55);
+        pipe_barrier(PIPE_V);
+        TROWEXPANDMUL(v54, v29, v52);
+        Tile<
+            TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v56 = Tile<
+                TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v15);
+        uint64_t v57 = (uint64_t)v19;
+        TASSIGN(v56, v57);
+        pto::Shape<1, 1, 1, 1, 128> v58 = pto::Shape<1, 1, 1, 1, 128>();
+        pto::Stride<128, 128, 128, 128, 1> v59 = pto::Stride<128, 128, 128, 128, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v60 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                v3 + (v8 + v8 * (unsigned)v15 + v8 * (unsigned)v16), v58, v59
+            );
+        TLOAD(v56, v60);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v61 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v62 = (uint64_t)v24;
+        TASSIGN(v61, v62);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        pipe_barrier(PIPE_V);
+        TCOLEXPANDMUL(v61, v54, v56);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 16, 128> v63 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v64 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v65 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v28 * (unsigned)v16), v63, v64
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v65, v61);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    }
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
+    for (size_t v66 = v26; v66 < ((size_t)v9); v66 += v25) {
+        int32_t v67 = (int32_t)((uint32_t)((int32_t)v66) * (uint32_t)v15);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v68 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v69 = (uint64_t)v24;
+        TASSIGN(v68, v69);
+        pto::Shape<1, 1, 1, 16, 128> v70 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<16384, 16384, 16384, 1024, 1> v71 = pto::Stride<16384, 16384, 16384, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>
+            v72 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>(
+                v5 + (v8 + (unsigned)v7 * (unsigned)v14 + (unsigned)v67 * (unsigned)v16), v70, v71
+            );
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v68, v72);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v73 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v74 = (uint64_t)v23;
+        TASSIGN(v73, v74);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        TMUL(v73, v68, v68);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v75 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v76 = (uint64_t)v22;
+        TASSIGN(v75, v76);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v77 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v16);
+        uint64_t v78 = (uint64_t)v21;
+        TASSIGN(v77, v78);
+        pipe_barrier(PIPE_V);
+        TROWSUM(v77, v73, v75);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v79 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v80 = (uint64_t)v21;
+        TASSIGN(v79, v80);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v81 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v82 = (uint64_t)v20;
+        TASSIGN(v81, v82);
+        pipe_barrier(PIPE_V);
+        TMULS(v81, v79, v11);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v83 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v84 = (uint64_t)v20;
+        TASSIGN(v83, v84);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v85 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v86 = (uint64_t)v20;
+        TASSIGN(v85, v86);
+        pipe_barrier(PIPE_V);
+        TADDS(v85, v83, v10);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v87 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v88 = (uint64_t)v20;
+        TASSIGN(v87, v88);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v89 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v18);
+        uint64_t v90 = (uint64_t)v20;
+        TASSIGN(v89, v90);
+        pipe_barrier(PIPE_V);
+        TRSQRT(v89, v87);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v91 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v16);
+        uint64_t v92 = (uint64_t)v20;
+        TASSIGN(v91, v92);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v93 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v94 = (uint64_t)v24;
+        TASSIGN(v93, v94);
+        pipe_barrier(PIPE_V);
+        TROWEXPANDMUL(v93, v68, v91);
+        Tile<
+            TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v95 = Tile<
+                TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v16, v15);
+        uint64_t v96 = (uint64_t)v19;
+        TASSIGN(v95, v96);
+        pto::Shape<1, 1, 1, 1, 128> v97 = pto::Shape<1, 1, 1, 1, 128>();
+        pto::Stride<128, 128, 128, 128, 1> v98 = pto::Stride<128, 128, 128, 128, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v99 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                v6 + (v8 + v8 * (unsigned)v15 + v8 * (unsigned)v16), v97, v98
+            );
+        TLOAD(v95, v99);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3);
+        Tile<
+            TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v100 = Tile<
+                TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v18, v15);
+        uint64_t v101 = (uint64_t)v24;
+        TASSIGN(v100, v101);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3);
+        pipe_barrier(PIPE_V);
+        TCOLEXPANDMUL(v100, v93, v95);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        pto::Shape<1, 1, 1, 16, 128> v102 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<16384, 16384, 16384, 1024, 1> v103 = pto::Stride<16384, 16384, 16384, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>
+            v104 = GlobalTensor<
+                float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>(
+                v4 + (v8 + (unsigned)v7 * (unsigned)v14 + (unsigned)v67 * (unsigned)v16), v102, v103
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        TSTORE(v104, v100);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
+    }
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: q_proj_norm__iter_v1
+    __gm__ Tensor *q_proj_norm__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *q_proj_norm__iter_v1 = reinterpret_cast<__gm__ float *>(q_proj_norm__iter_v1_tensor->buffer.addr) +
+                                         q_proj_norm__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: q_proj__rv_v2
+    __gm__ Tensor *q_proj__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *q_proj__rv_v2 =
+        reinterpret_cast<__gm__ float *>(q_proj__rv_v2_tensor->buffer.addr) + q_proj__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: q_norm_weight__ssa_v0
+    __gm__ Tensor *q_norm_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *q_norm_weight__ssa_v0 = reinterpret_cast<__gm__ float *>(q_norm_weight__ssa_v0_tensor->buffer.addr) +
+                                          q_norm_weight__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: k_proj_norm__iter_v1
+    __gm__ Tensor *k_proj_norm__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ float *k_proj_norm__iter_v1 = reinterpret_cast<__gm__ float *>(k_proj_norm__iter_v1_tensor->buffer.addr) +
+                                         k_proj_norm__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: k_proj__rv_v2
+    __gm__ Tensor *k_proj__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ float *k_proj__rv_v2 =
+        reinterpret_cast<__gm__ float *>(k_proj__rv_v2_tensor->buffer.addr) + k_proj__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: k_norm_weight__ssa_v0
+    __gm__ Tensor *k_norm_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ float *k_norm_weight__ssa_v0 = reinterpret_cast<__gm__ float *>(k_norm_weight__ssa_v0_tensor->buffer.addr) +
+                                          k_norm_weight__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[6];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    qk_norm(
+        q_proj_norm__iter_v1, q_proj__rv_v2, q_norm_weight__ssa_v0, k_proj_norm__iter_v1, k_proj__rv_v2,
+        k_norm_weight__ssa_v0, b0__idx_v0
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp
new file mode 100644
index 000000000..677bee8e4
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: rmsnorm
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void rmsnorm(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4, int32_t v5) {
+    RoundMode v6 = RoundMode::CAST_ROUND;
+    unsigned v7 = 5120;
+    unsigned v8 = 0;
+    const float v9 = 9.99999997E-7f;
+    const float v10 = 1.95312503E-4f;
+    const int32_t v11 = 512;
+    const int32_t v12 = 10;
+    const int32_t v13 = 0;
+    const float v14 = 0.0f;
+    const int32_t v15 = 1;
+    const int32_t v16 = 5120;
+    const int32_t v17 = 16;
+    const int64_t v18 = 2112;
+    const int64_t v19 = 64;
+    const int64_t v20 = 0;
+    const int64_t v21 = 100480;
+    const int64_t v22 = 67712;
+    const int64_t v23 = 34944;
+    const int64_t v24 = 18560;
+    const int64_t v25 = 18496;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    size_t v26 = (size_t)v15;
+    size_t v27 = (size_t)v13;
+    size_t v28 = (size_t)v12;
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v29 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v30 = (uint64_t)v25;
+    TASSIGN(v29, v30);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    TEXPANDS(v29, v14);
+    for (size_t v31 = v27; v31 < v28; v31 += v26) {
+        Tile<
+            TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v32 = Tile<
+                TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v5, v11);
+        uint64_t v33 = (uint64_t)v24;
+        TASSIGN(v32, v33);
+        unsigned v34 = (unsigned)v5 * v7;
+        pto::Shape<1, 1, 1, -1, 512> v35 = pto::Shape<1, 1, 1, -1, 512>(v5);
+        pto::Stride<-1, -1, -1, 5120, 1> v36 = pto::Stride<-1, -1, -1, 5120, 1>(v34, v34, v34);
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 512>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v37 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 512>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+                v1 + (v8 + (unsigned)v4 * (unsigned)v16 +
+                      (unsigned)((int32_t)(uint32_t)((int32_t)v31) * (uint32_t)v11) * (unsigned)v15),
+                v35, v36
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v32, v37);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        Tile<
+            TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v38 = Tile<
+                TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v39 = (uint64_t)v23;
+        TASSIGN(v38, v39);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        TCVT(v38, v32, v6);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        Tile<
+            TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v40 = Tile<
+                TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v41 = (uint64_t)v23;
+        TASSIGN(v40, v41);
+        pipe_barrier(PIPE_V);
+        TMUL(v40, v38, v38);
+        Tile<
+            TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v42 = Tile<
+                TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v43 = (uint64_t)v22;
+        TASSIGN(v42, v43);
+        Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v15);
+        uint64_t v45 = (uint64_t)v21;
+        TASSIGN(v44, v45);
+        pipe_barrier(PIPE_V);
+        TROWSUM(v44, v40, v42);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v46 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v15, v17);
+        uint64_t v47 = (uint64_t)v21;
+        TASSIGN(v46, v47);
+        Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v48 = Tile<
+                TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v15, v17);
+        uint64_t v49 = (uint64_t)v25;
+        TASSIGN(v48, v49);
+        pipe_barrier(PIPE_V);
+        TADD(v48, v29, v46);
+    }
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v50 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v51 = (uint64_t)v25;
+    TASSIGN(v50, v51);
+    pipe_barrier(PIPE_V);
+    TMULS(v50, v29, v10);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v52 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v53 = (uint64_t)v25;
+    TASSIGN(v52, v53);
+    pipe_barrier(PIPE_V);
+    TADDS(v52, v50, v9);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v54 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v55 = (uint64_t)v25;
+    TASSIGN(v54, v55);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v56 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v57 = (uint64_t)v25;
+    TASSIGN(v56, v57);
+    pipe_barrier(PIPE_V);
+    TSQRT(v56, v54);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v58 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v59 = (uint64_t)v25;
+    TASSIGN(v58, v59);
+    Tile<
+        TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v60 = Tile<
+            TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v15, v17);
+    uint64_t v61 = (uint64_t)v20;
+    TASSIGN(v60, v61);
+    pipe_barrier(PIPE_V);
+    TRECIP(v60, v58);
+    Tile<
+        TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v62 = Tile<
+            TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v17, v15);
+    uint64_t v63 = (uint64_t)v20;
+    TASSIGN(v62, v63);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+    for (size_t v64 = v27; v64 < v28; v64 += v26) {
+        int32_t v65 = (int32_t)((uint32_t)((int32_t)v64) * (uint32_t)v11);
+        Tile<
+            TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v66 = Tile<
+                TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v5, v11);
+        uint64_t v67 = (uint64_t)v24;
+        TASSIGN(v66, v67);
+        unsigned v68 = (unsigned)v5 * v7;
+        pto::Shape<1, 1, 1, -1, 512> v69 = pto::Shape<1, 1, 1, -1, 512>(v5);
+        pto::Stride<-1, -1, -1, 5120, 1> v70 = pto::Stride<-1, -1, -1, 5120, 1>(v68, v68, v68);
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 512>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v71 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 512>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>(
+                v1 + (v8 + (unsigned)v4 * (unsigned)v16 + (unsigned)v65 * (unsigned)v15), v69, v70
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v66, v71);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        Tile<
+            TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v72 = Tile<
+                TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v73 = (uint64_t)v23;
+        TASSIGN(v72, v73);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        pipe_barrier(PIPE_V);
+        TCVT(v72, v66, v6);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        Tile<
+            TileType::Vec, float, 1, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v74 = Tile<
+                TileType::Vec, float, 1, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v15, v11);
+        uint64_t v75 = (uint64_t)v19;
+        TASSIGN(v74, v75);
+        pto::Shape<1, 1, 1, 1, 512> v76 = pto::Shape<1, 1, 1, 1, 512>();
+        pto::Stride<5120, 5120, 5120, 5120, 1> v77 = pto::Stride<5120, 5120, 5120, 5120, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 512>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v78 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 512>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(
+                v3 + (v8 + v8 * (unsigned)v16 + (unsigned)v65 * (unsigned)v15), v76, v77
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
+        TLOAD(v74, v78);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        Tile<
+            TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v79 = Tile<
+                TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v80 = (uint64_t)v23;
+        TASSIGN(v79, v80);
+        pipe_barrier(PIPE_V);
+        TROWEXPANDMUL(v79, v72, v62);
+        Tile<
+            TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v81 = Tile<
+                TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v82 = (uint64_t)v23;
+        TASSIGN(v81, v82);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        TCOLEXPANDMUL(v81, v79, v74);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
+        Tile<
+            TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v83 = Tile<
+                TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v17, v11);
+        uint64_t v84 = (uint64_t)v18;
+        TASSIGN(v83, v84);
+        pipe_barrier(PIPE_V);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        TCVT(v83, v81, v6);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 16, 512> v85 = pto::Shape<1, 1, 1, 16, 512>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v86 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v87 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v2 + (v8 + v8 * (unsigned)v16 + (unsigned)v65 * (unsigned)v15), v85, v86
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v87, v83);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    }
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: current_hidden__rv_v2
+    __gm__ Tensor *current_hidden__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *current_hidden__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__rv_v2_tensor->buffer.addr) +
+        current_hidden__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: normed_tile__ssa_v0
+    __gm__ Tensor *normed_tile__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *normed_tile__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(normed_tile__ssa_v0_tensor->buffer.addr) +
+        normed_tile__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: input_rms_weight__ssa_v0
+    __gm__ Tensor *input_rms_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *input_rms_weight__ssa_v0 =
+        reinterpret_cast<__gm__ float *>(input_rms_weight__ssa_v0_tensor->buffer.addr) +
+        input_rms_weight__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: b0__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b0__idx_v0_conv;
+    b0__idx_v0_conv.u64 = args[3];
+    int64_t b0__idx_v0 = b0__idx_v0_conv.val;
+
+    // Unpack scalar: cur_valid__ssa_v1
+    union {
+        uint64_t u64;
+        int64_t val;
+    } cur_valid__ssa_v1_conv;
+    cur_valid__ssa_v1_conv.u64 = args[4];
+    int64_t cur_valid__ssa_v1 = cur_valid__ssa_v1_conv.val;
+
+    // Forward to ptoas-generated function
+    rmsnorm(current_hidden__rv_v2, normed_tile__ssa_v0, input_rms_weight__ssa_v0, b0__idx_v0, cur_valid__ssa_v1);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp
new file mode 100644
index 000000000..54bce68c8
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: rope_kv_cache
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void rope_kv_cache(
+    __gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ bfloat16_t *v3, __gm__ float *v4, __gm__ float *v5,
+    __gm__ float *v6, __gm__ float *v7, __gm__ float *v8, __gm__ float *v9, __gm__ float *v10, int32_t v11, int32_t v12,
+    int32_t v13, int32_t v14, int32_t v15
+) {
+    unsigned v16 = 64;
+    RoundMode v17 = RoundMode::CAST_ROUND;
+    unsigned v18 = 0;
+    const int32_t v19 = 5;
+    const int32_t v20 = 256;
+    const int32_t v21 = 8;
+    const int32_t v22 = 0;
+    const int32_t v23 = 5120;
+    const int32_t v24 = 64;
+    const int32_t v25 = 1024;
+    const int32_t v26 = 16;
+    const int32_t v27 = 1;
+    const int32_t v28 = 128;
+    const int64_t v29 = 2944;
+    const int64_t v30 = 2688;
+    const int64_t v31 = 2176;
+    const int64_t v32 = 2048;
+    const int64_t v33 = 1792;
+    const int64_t v34 = 1536;
+    const int64_t v35 = 1280;
+    const int64_t v36 = 1024;
+    const int64_t v37 = 768;
+    const int64_t v38 = 512;
+    const int64_t v39 = 256;
+    const int64_t v40 = 0;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    size_t v41 = (size_t)v27;
+    Tile<
+        TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v42 = Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v27, v24);
+    uint64_t v43 = (uint64_t)v40;
+    TASSIGN(v42, v43);
+    pto::Shape<1, 1, 1, 1, 64> v44 = pto::Shape<1, 1, 1, 1, 64>();
+    pto::Stride<128, 128, 128, 128, 1> v45 = pto::Stride<128, 128, 128, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v46 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+            v5 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v44, v45
+        );
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID3);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID4);
+    TLOAD(v42, v46);
+    Tile<
+        TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v47 = Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v27, v24);
+    uint64_t v48 = (uint64_t)v39;
+    TASSIGN(v47, v48);
+    pto::Shape<1, 1, 1, 1, 64> v49 = pto::Shape<1, 1, 1, 1, 64>();
+    pto::Stride<128, 128, 128, 128, 1> v50 = pto::Stride<128, 128, 128, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v51 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+            v6 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v49, v50
+        );
+    TLOAD(v47, v51);
+    Tile<
+        TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v52 = Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v27, v24);
+    uint64_t v53 = (uint64_t)v38;
+    TASSIGN(v52, v53);
+    pto::Shape<1, 1, 1, 1, 64> v54 = pto::Shape<1, 1, 1, 1, 64>();
+    pto::Stride<128, 128, 128, 128, 1> v55 = pto::Stride<128, 128, 128, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v56 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+            v7 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v54, v55
+        );
+    TLOAD(v52, v56);
+    Tile<
+        TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v57 = Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v27, v24);
+    uint64_t v58 = (uint64_t)v37;
+    TASSIGN(v57, v58);
+    pto::Shape<1, 1, 1, 1, 64> v59 = pto::Shape<1, 1, 1, 1, 64>();
+    pto::Stride<128, 128, 128, 128, 1> v60 = pto::Stride<128, 128, 128, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v61 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+            v8 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v59, v60
+        );
+    TLOAD(v57, v61);
+    for (size_t v62 = (size_t)v11; v62 < ((size_t)((int32_t)(uint32_t)v11 + (uint32_t)v21)); v62 += v41) {
+        int32_t v63 = (int32_t)v62;
+        int32_t v64 = (int32_t)((uint32_t)v63 * (uint32_t)v28);
+        int32_t v65 =
+            (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v12 * (uint32_t)v21) +
+                                                     (uint32_t)v63) *
+                                 (uint32_t)v20) +
+                      (uint32_t)v13);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v66 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v67 = (uint64_t)v36;
+        TASSIGN(v66, v67);
+        pto::Shape<1, 1, 1, 1, 64> v68 = pto::Shape<1, 1, 1, 1, 64>();
+        pto::Stride<1024, 1024, 1024, 1024, 1> v69 = pto::Stride<1024, 1024, 1024, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND> v70 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND>(
+                v4 + (v18 + (unsigned)v14 * (unsigned)v25 + (unsigned)v64 * (unsigned)v27), v68, v69
+            );
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        TLOAD(v66, v70);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v71 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v72 = (uint64_t)v35;
+        TASSIGN(v71, v72);
+        pto::Shape<1, 1, 1, 1, 64> v73 = pto::Shape<1, 1, 1, 1, 64>();
+        pto::Stride<1024, 1024, 1024, 1024, 1> v74 = pto::Stride<1024, 1024, 1024, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND> v75 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND>(
+                v4 + (v18 + (unsigned)v14 * (unsigned)v25 +
+                      (unsigned)((int32_t)(uint32_t)v64 + (uint32_t)v24) * (unsigned)v27),
+                v73, v74
+            );
+        TLOAD(v71, v75);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v76 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v77 = (uint64_t)v34;
+        TASSIGN(v76, v77);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        TCOLEXPANDMUL(v76, v66, v42);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v78 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v79 = (uint64_t)v33;
+        TASSIGN(v78, v79);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        TCOLEXPANDMUL(v78, v71, v47);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v80 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v81 = (uint64_t)v34;
+        TASSIGN(v80, v81);
+        pipe_barrier(PIPE_V);
+        TSUB(v80, v76, v78);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v82 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v83 = (uint64_t)v35;
+        TASSIGN(v82, v83);
+        TCOLEXPANDMUL(v82, v71, v52);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v84 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v85 = (uint64_t)v36;
+        TASSIGN(v84, v85);
+        TCOLEXPANDMUL(v84, v66, v57);
+        Tile<
+            TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v86 = Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v87 = (uint64_t)v36;
+        TASSIGN(v86, v87);
+        pipe_barrier(PIPE_V);
+        TADD(v86, v82, v84);
+        Tile<
+            TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v88 = Tile<
+                TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v89 = (uint64_t)v32;
+        TASSIGN(v88, v89);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        TCVT(v88, v80, v17);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pto::Shape<1, 1, 1, 1, 64> v90 = pto::Shape<1, 1, 1, 1, 64>();
+        pto::Stride<128, 128, 128, 128, 1> v91 = pto::Stride<128, 128, 128, 128, 1>();
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v92 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                v2 + (v18 + (unsigned)v65 * (unsigned)v28 + v18 * (unsigned)v27), v90, v91
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(v92, v88);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        Tile<
+            TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v93 = Tile<
+                TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v24);
+        uint64_t v94 = (uint64_t)v32;
+        TASSIGN(v93, v94);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        TCVT(v93, v86, v17);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        pto::Shape<1, 1, 1, 1, 64> v95 = pto::Shape<1, 1, 1, 1, 64>();
+        pto::Stride<128, 128, 128, 128, 1> v96 = pto::Stride<128, 128, 128, 128, 1>();
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v97 =
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                v2 + (v18 + (unsigned)v65 * (unsigned)v28 + v16 * (unsigned)v27), v95, v96
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        TSTORE(v97, v93);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2);
+        Tile<
+            TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v98 = Tile<
+                TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v28);
+        uint64_t v99 = (uint64_t)v31;
+        TASSIGN(v98, v99);
+        pto::Shape<1, 1, 1, 1, 128> v100 = pto::Shape<1, 1, 1, 1, 128>();
+        pto::Stride<1024, 1024, 1024, 1024, 1> v101 = pto::Stride<1024, 1024, 1024, 1024, 1>();
+        GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND> v102 =
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND>(
+                v9 + (v18 + (unsigned)v14 * (unsigned)v25 + (unsigned)v64 * (unsigned)v27), v100, v101
+            );
+        TLOAD(v98, v102);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        Tile<
+            TileType::Vec, bfloat16_t, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>
+            v103 = Tile<
+                TileType::Vec, bfloat16_t, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>(v27, v28);
+        uint64_t v104 = (uint64_t)v30;
+        TASSIGN(v103, v104);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2);
+        TCVT(v103, v98, v17);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2);
+        pto::Shape<1, 1, 1, 1, 128> v105 = pto::Shape<1, 1, 1, 1, 128>();
+        pto::Stride<128, 128, 128, 128, 1> v106 = pto::Stride<128, 128, 128, 128, 1>();
+        GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>
+            v107 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                v3 + (v18 + (unsigned)v65 * (unsigned)v28 + v18 * (unsigned)v27), v105, v106
+            );
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2);
+        TSTORE(v107, v103);
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2);
+        for (size_t v108 = (size_t)v22; v108 < ((size_t)v19); v108 += v41) {
+            int32_t v109 = (int32_t)v108;
+            int32_t v110 =
+                (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v63 * (uint32_t)v19) + (uint32_t)v109) *
+                          (uint32_t)v28);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v111 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v112 = (uint64_t)v36;
+            TASSIGN(v111, v112);
+            pto::Shape<1, 1, 1, 1, 64> v113 = pto::Shape<1, 1, 1, 1, 64>();
+            pto::Stride<5120, 5120, 5120, 5120, 1> v114 = pto::Stride<5120, 5120, 5120, 5120, 1>();
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>
+                v115 = GlobalTensor<
+                    float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(
+                    v10 + (v18 + (unsigned)v14 * (unsigned)v23 + (unsigned)v110 * (unsigned)v27), v113, v114
+                );
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+            TLOAD(v111, v115);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v116 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v117 = (uint64_t)v35;
+            TASSIGN(v116, v117);
+            pto::Shape<1, 1, 1, 1, 64> v118 = pto::Shape<1, 1, 1, 1, 64>();
+            pto::Stride<5120, 5120, 5120, 5120, 1> v119 = pto::Stride<5120, 5120, 5120, 5120, 1>();
+            GlobalTensor<float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>
+                v120 = GlobalTensor<
+                    float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(
+                    v10 + (v18 + (unsigned)v14 * (unsigned)v23 +
+                           (unsigned)((int32_t)(uint32_t)v110 + (uint32_t)v24) * (unsigned)v27),
+                    v118, v119
+                );
+            TLOAD(v116, v120);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v121 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v122 = (uint64_t)v34;
+            TASSIGN(v121, v122);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3);
+            TCOLEXPANDMUL(v121, v111, v42);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v123 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v124 = (uint64_t)v33;
+            TASSIGN(v123, v124);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4);
+            TCOLEXPANDMUL(v123, v116, v47);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v125 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v126 = (uint64_t)v34;
+            TASSIGN(v125, v126);
+            pipe_barrier(PIPE_V);
+            TSUB(v125, v121, v123);
+            Tile<
+                TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v127 = Tile<
+                    TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v128 = (uint64_t)v32;
+            TASSIGN(v127, v128);
+            pipe_barrier(PIPE_V);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID3);
+            TCVT(v127, v125, v17);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v129 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v130 = (uint64_t)v35;
+            TASSIGN(v129, v130);
+            TCOLEXPANDMUL(v129, v116, v52);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v131 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v132 = (uint64_t)v36;
+            TASSIGN(v131, v132);
+            TCOLEXPANDMUL(v131, v111, v57);
+            Tile<
+                TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v133 = Tile<
+                    TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v134 = (uint64_t)v36;
+            TASSIGN(v133, v134);
+            pipe_barrier(PIPE_V);
+            TADD(v133, v129, v131);
+            Tile<
+                TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                CompactMode::Null>
+                v135 = Tile<
+                    TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>(v27, v24);
+            uint64_t v136 = (uint64_t)v29;
+            TASSIGN(v135, v136);
+            pipe_barrier(PIPE_V);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID4);
+            TCVT(v135, v133, v17);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4);
+            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+            int32_t v137 = (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v14 * (uint32_t)v28) +
+                                                (uint32_t)((int32_t)(uint32_t)v63 * (uint32_t)v26)) +
+                                     (uint32_t)v109);
+            pto::Shape<1, 1, 1, 1, 64> v138 = pto::Shape<1, 1, 1, 1, 64>();
+            pto::Stride<128, 128, 128, 128, 1> v139 = pto::Stride<128, 128, 128, 128, 1>();
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>
+                v140 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                    v1 + (v18 + (unsigned)v137 * (unsigned)v28 + v18 * (unsigned)v27), v138, v139
+                );
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3);
+            pipe_barrier(PIPE_MTE3);
+            TSTORE(v140, v127);
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID3);
+            pto::Shape<1, 1, 1, 1, 64> v141 = pto::Shape<1, 1, 1, 1, 64>();
+            pto::Stride<128, 128, 128, 128, 1> v142 = pto::Stride<128, 128, 128, 128, 1>();
+            GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>
+                v143 = GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>(
+                    v1 + (v18 + (unsigned)v137 * (unsigned)v28 + v16 * (unsigned)v27), v141, v142
+                );
+            pipe_barrier(PIPE_MTE3);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4);
+            TSTORE(v143, v135);
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID4);
+        };
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    }
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID3);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID4);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: all_q_padded__iter_v6
+    __gm__ Tensor *all_q_padded__iter_v6_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *all_q_padded__iter_v6 =
+        reinterpret_cast<__gm__ bfloat16_t *>(all_q_padded__iter_v6_tensor->buffer.addr) +
+        all_q_padded__iter_v6_tensor->start_offset;
+
+    // Unpack tensor: k_cache__iter_v5
+    __gm__ Tensor *k_cache__iter_v5_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *k_cache__iter_v5 = reinterpret_cast<__gm__ bfloat16_t *>(k_cache__iter_v5_tensor->buffer.addr) +
+                                          k_cache__iter_v5_tensor->start_offset;
+
+    // Unpack tensor: v_cache__iter_v5
+    __gm__ Tensor *v_cache__iter_v5_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *v_cache__iter_v5 = reinterpret_cast<__gm__ bfloat16_t *>(v_cache__iter_v5_tensor->buffer.addr) +
+                                          v_cache__iter_v5_tensor->start_offset;
+
+    // Unpack tensor: k_proj_norm__rv_v2
+    __gm__ Tensor *k_proj_norm__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ float *k_proj_norm__rv_v2 = reinterpret_cast<__gm__ float *>(k_proj_norm__rv_v2_tensor->buffer.addr) +
+                                       k_proj_norm__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: cos_lo__ssa_v0
+    __gm__ Tensor *cos_lo__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ float *cos_lo__ssa_v0 =
+        reinterpret_cast<__gm__ float *>(cos_lo__ssa_v0_tensor->buffer.addr) + cos_lo__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: sin_lo__ssa_v0
+    __gm__ Tensor *sin_lo__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ float *sin_lo__ssa_v0 =
+        reinterpret_cast<__gm__ float *>(sin_lo__ssa_v0_tensor->buffer.addr) + sin_lo__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: cos_hi__ssa_v0
+    __gm__ Tensor *cos_hi__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    __gm__ float *cos_hi__ssa_v0 =
+        reinterpret_cast<__gm__ float *>(cos_hi__ssa_v0_tensor->buffer.addr) + cos_hi__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: sin_hi__ssa_v0
+    __gm__ Tensor *sin_hi__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[7]);
+    __gm__ float *sin_hi__ssa_v0 =
+        reinterpret_cast<__gm__ float *>(sin_hi__ssa_v0_tensor->buffer.addr) + sin_hi__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: v_proj__rv_v2
+    __gm__ Tensor *v_proj__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[8]);
+    __gm__ float *v_proj__rv_v2 =
+        reinterpret_cast<__gm__ float *>(v_proj__rv_v2_tensor->buffer.addr) + v_proj__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: q_proj_norm__rv_v2
+    __gm__ Tensor *q_proj_norm__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[9]);
+    __gm__ float *q_proj_norm__rv_v2 = reinterpret_cast<__gm__ float *>(q_proj_norm__rv_v2_tensor->buffer.addr) +
+                                       q_proj_norm__rv_v2_tensor->start_offset;
+
+    // Unpack scalar: ki_chunk__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ki_chunk__idx_v0_conv;
+    ki_chunk__idx_v0_conv.u64 = args[10];
+    int64_t ki_chunk__idx_v0 = ki_chunk__idx_v0_conv.val;
+
+    // Unpack scalar: slot_block__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } slot_block__ssa_v0_conv;
+    slot_block__ssa_v0_conv.u64 = args[11];
+    int64_t slot_block__ssa_v0 = slot_block__ssa_v0_conv.val;
+
+    // Unpack scalar: slot_offset__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } slot_offset__ssa_v0_conv;
+    slot_offset__ssa_v0_conv.u64 = args[12];
+    int64_t slot_offset__ssa_v0 = slot_offset__ssa_v0_conv.val;
+
+    // Unpack scalar: b__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } b__idx_v0_conv;
+    b__idx_v0_conv.u64 = args[13];
+    int64_t b__idx_v0 = b__idx_v0_conv.val;
+
+    // Extract dynamic dim: KV_CACHE_ROWS_DYN
+    int64_t KV_CACHE_ROWS_DYN = static_cast<int64_t>(k_cache__iter_v5_tensor->shapes[0]);
+
+    // Forward to ptoas-generated function
+    rope_kv_cache(
+        all_q_padded__iter_v6, k_cache__iter_v5, v_cache__iter_v5, k_proj_norm__rv_v2, cos_lo__ssa_v0, sin_lo__ssa_v0,
+        cos_hi__ssa_v0, sin_hi__ssa_v0, v_proj__rv_v2, q_proj_norm__rv_v2, ki_chunk__idx_v0, slot_block__ssa_v0,
+        slot_offset__ssa_v0, b__idx_v0, KV_CACHE_ROWS_DYN
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp
new file mode 100644
index 000000000..69af06ad5
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: silu
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void silu(__gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, int32_t v4) {
+    RoundMode v5 = RoundMode::CAST_ROUND;
+    unsigned v6 = 0;
+    const float v7 = 1.0f;
+    const int32_t v8 = 17408;
+    const int32_t v9 = 1;
+    const int32_t v10 = 256;
+    const int32_t v11 = 16;
+    const int64_t v12 = 0;
+    const int64_t v13 = 57344;
+    const int64_t v14 = 40960;
+    const int64_t v15 = 24576;
+    const int64_t v16 = 8192;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v17 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v18 = (uint64_t)v16;
+    TASSIGN(v17, v18);
+    pto::Shape<1, 1, 1, 16, 256> v19 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<4096, 4096, 4096, 256, 1> v20 = pto::Stride<4096, 4096, 4096, 256, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v21 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+            v1 + (v6 + v6 * (unsigned)v10 + v6 * (unsigned)v9), v19, v20
+        );
+    TLOAD(v17, v21);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v22 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v23 = (uint64_t)v15;
+    TASSIGN(v22, v23);
+    pto::Shape<1, 1, 1, 16, 256> v24 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<4096, 4096, 4096, 256, 1> v25 = pto::Stride<4096, 4096, 4096, 256, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v26 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+            v2 + (v6 + v6 * (unsigned)v10 + v6 * (unsigned)v9), v24, v25
+        );
+    TLOAD(v22, v26);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v27 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v28 = (uint64_t)v14;
+    TASSIGN(v27, v28);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TNEG(v27, v17);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v29 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v30 = (uint64_t)v14;
+    TASSIGN(v29, v30);
+    pipe_barrier(PIPE_V);
+    TEXP(v29, v27);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v31 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v32 = (uint64_t)v14;
+    TASSIGN(v31, v32);
+    pipe_barrier(PIPE_V);
+    TADDS(v31, v29, v7);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v33 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v34 = (uint64_t)v13;
+    TASSIGN(v33, v34);
+    pipe_barrier(PIPE_V);
+    TRECIP(v33, v31);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v35 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v36 = (uint64_t)v16;
+    TASSIGN(v35, v36);
+    pipe_barrier(PIPE_V);
+    TMUL(v35, v17, v33);
+    Tile<
+        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v37 = Tile<
+            TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v38 = (uint64_t)v16;
+    TASSIGN(v37, v38);
+    pipe_barrier(PIPE_V);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+    TMUL(v37, v35, v22);
+    Tile<
+        TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+        CompactMode::Null>
+        v39 = Tile<
+            TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+            CompactMode::Null>(v11, v10);
+    uint64_t v40 = (uint64_t)v12;
+    TASSIGN(v39, v40);
+    pipe_barrier(PIPE_V);
+    TCVT(v39, v37, v5);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 256> v41 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<278528, 278528, 278528, 17408, 1> v42 = pto::Stride<278528, 278528, 278528, 17408, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>
+        v43 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>(
+            v3 + (v6 + v6 * (unsigned)v8 + (unsigned)v4 * (unsigned)v9), v41, v42
+        );
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(v43, v39);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: gate_acc__rv_v2
+    __gm__ Tensor *gate_acc__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *gate_acc__rv_v2 =
+        reinterpret_cast<__gm__ float *>(gate_acc__rv_v2_tensor->buffer.addr) + gate_acc__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: up_acc__rv_v2
+    __gm__ Tensor *up_acc__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *up_acc__rv_v2 =
+        reinterpret_cast<__gm__ float *>(up_acc__rv_v2_tensor->buffer.addr) + up_acc__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: mlp_tile__iter_v1
+    __gm__ Tensor *mlp_tile__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *mlp_tile__iter_v1 =
+        reinterpret_cast<__gm__ bfloat16_t *>(mlp_tile__iter_v1_tensor->buffer.addr) +
+        mlp_tile__iter_v1_tensor->start_offset;
+
+    // Unpack scalar: o0__ssa_v1
+    union {
+        uint64_t u64;
+        int64_t val;
+    } o0__ssa_v1_conv;
+    o0__ssa_v1_conv.u64 = args[3];
+    int64_t o0__ssa_v1 = o0__ssa_v1_conv.val;
+
+    // Forward to ptoas-generated function
+    silu(gate_acc__rv_v2, up_acc__rv_v2, mlp_tile__iter_v1, o0__ssa_v1);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp
new file mode 100644
index 000000000..e94af05e1
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: softmax
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,
+    kSetWaitMte3ToSEvent0 = 1,
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+    case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+        break;
+    case PTOAutoSyncTailMode::kBarrierAll:
+    default:
+        pipe_barrier(PIPE_ALL);
+        break;
+    }
+}
+
+static __aicore__ void softmax(
+    __gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, __gm__ float *v4, int32_t v5, int32_t v6, int32_t v7
+) {
+    RoundMode v8 = RoundMode::CAST_ROUND;
+    unsigned v9 = 0;
+    const float v10 = 0.0883883461f;
+    const int32_t v11 = 2;
+    const int32_t v12 = 16;
+    const int32_t v13 = 64;
+    const int32_t v14 = 8;
+    const int32_t v15 = 0;
+    const int32_t v16 = 1;
+    const int32_t v17 = 256;
+    const int64_t v18 = 8192;
+    const int64_t v19 = 0;
+    const int64_t v20 = 57408;
+    const int64_t v21 = 41024;
+    const int64_t v22 = 24640;
+    const int64_t v23 = 8256;
+    using T = float;
+
+#if defined(__DAV_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    size_t v24 = (size_t)v16;
+    size_t v25 = (size_t)v15;
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+    for (size_t v26 = v25; v26 < ((size_t)v14); v26 += v24) {
+        for (size_t v27 = v25; v27 < ((size_t)v13); v27 += v24) {
+            int32_t v28 = (int32_t)((uint32_t)v5 + (uint32_t)((int32_t)v27));
+            __gm__ float *v29;
+            __gm__ float *v30;
+            __gm__ bfloat16_t *v31;
+            if (v28 < v6) {
+                int32_t v32 = (int32_t)((uint32_t)v7 - (uint32_t)((int32_t)(uint32_t)v28 * (uint32_t)v17));
+                int32_t v33 = v32 < v17 ? v32 : v17;
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>
+                    v34 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                        CompactMode::Null>(v12, v33);
+                uint64_t v35 = (uint64_t)v23;
+                TASSIGN(v34, v35);
+                int32_t v36 =
+                    (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)v26) * (uint32_t)v11) +
+                                         (uint32_t)v28) *
+                              (uint32_t)v12);
+                unsigned v37 = (unsigned)v33;
+                pto::Shape<1, 1, 1, 16, -1> v38 = pto::Shape<1, 1, 1, 16, -1>(v33);
+                pto::Stride<4096, 4096, 4096, 256, 1> v39 = pto::Stride<4096, 4096, 4096, 256, 1>();
+                GlobalTensor<float, pto::Shape<1, 1, 1, 16, -1>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>
+                    v40 = GlobalTensor<
+                        float, pto::Shape<1, 1, 1, 16, -1>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+                        v4 + (v9 + (unsigned)v36 * (unsigned)v17 + v9 * (unsigned)v16), v38, v39
+                    );
+                wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+                TLOAD(v34, v40);
+                set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                    CompactMode::Null>
+                    v41 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v42 = (uint64_t)v22;
+                TASSIGN(v41, v42);
+                wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+                pipe_barrier(PIPE_V);
+                TFILLPAD(v41, v34);
+                set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                    CompactMode::Null>
+                    v43 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v44 = (uint64_t)v22;
+                TASSIGN(v43, v44);
+                pipe_barrier(PIPE_V);
+                TMULS(v43, v41, v10);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>
+                    v45 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v46 = (uint64_t)v21;
+                TASSIGN(v45, v46);
+                Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>
+                    v47 = Tile<
+                        TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                        CompactMode::Null>(v12, v16);
+                uint64_t v48 = (uint64_t)v20;
+                TASSIGN(v47, v48);
+                pipe_barrier(PIPE_V);
+                wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+                TROWMAX(v47, v43, v45);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                    CompactMode::Null>
+                    v49 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v50 = (uint64_t)v22;
+                TASSIGN(v49, v50);
+                pipe_barrier(PIPE_V);
+                TROWEXPANDSUB(v49, v43, v47);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                    CompactMode::Null>
+                    v51 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v52 = (uint64_t)v22;
+                TASSIGN(v51, v52);
+                pipe_barrier(PIPE_V);
+                TEXP(v51, v49);
+                Tile<
+                    TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                    CompactMode::Null>
+                    v53 = Tile<
+                        TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512,
+                        PadValue::Min, CompactMode::Null>(v12, v17);
+                uint64_t v54 = (uint64_t)v19;
+                TASSIGN(v53, v54);
+                pipe_barrier(PIPE_V);
+                TCVT(v53, v51, v8);
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                    CompactMode::Null>
+                    v55 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v56 = (uint64_t)v22;
+                TASSIGN(v55, v56);
+                pipe_barrier(PIPE_V);
+                TCVT(v55, v53, v8);
+                Tile<
+                    TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>
+                    v57 = Tile<
+                        TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                        CompactMode::Null>(v12, v17);
+                uint64_t v58 = (uint64_t)v21;
+                TASSIGN(v57, v58);
+                Tile<
+                    TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                    CompactMode::Null>
+                    v59 = Tile<
+                        TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null,
+                        CompactMode::Null>(v12, v16);
+                uint64_t v60 = (uint64_t)v18;
+                TASSIGN(v59, v60);
+                pipe_barrier(PIPE_V);
+                wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+                TROWSUM(v59, v55, v57);
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                pto::Shape<1, 1, 1, 16, 256> v61 = pto::Shape<1, 1, 1, 16, 256>();
+                pto::Stride<4096, 4096, 4096, 256, 1> v62 = pto::Stride<4096, 4096, 4096, 256, 1>();
+                GlobalTensor<
+                    bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>
+                    v63 = GlobalTensor<
+                        bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>,
+                        pto::Layout::ND>(v3 + (v9 + (unsigned)v36 * (unsigned)v17 + v9 * (unsigned)v16), v61, v62);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+                TSTORE(v63, v53);
+                pto::Shape<1, 1, 1, 16, 1> v64 = pto::Shape<1, 1, 1, 16, 1>();
+                pto::Stride<16, 16, 16, 1, 256> v65 = pto::Stride<16, 16, 16, 1, 256>();
+                GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v66 =
+                    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>(
+                        v2 + (v9 + (unsigned)v36 * (unsigned)v16 + v9 * (unsigned)v17), v64, v65
+                    );
+                TSTORE(v66, v47);
+                set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+                pto::Shape<1, 1, 1, 16, 1> v67 = pto::Shape<1, 1, 1, 16, 1>();
+                pto::Stride<16, 16, 16, 1, 256> v68 = pto::Stride<16, 16, 16, 1, 256>();
+                GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v69 =
+                    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 1>, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>(
+                        v1 + (v9 + (unsigned)v36 * (unsigned)v16 + v9 * (unsigned)v17), v67, v68
+                    );
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(v69, v59);
+                set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+                v29 = v1;
+                v30 = v2;
+                v31 = v3;
+            } else {
+                v29 = v1;
+                v30 = v2;
+                v31 = v3;
+            };
+        };
+    }
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+#endif  // __DAV_VEC__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: all_cur_li__iter_v1
+    __gm__ Tensor *all_cur_li__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *all_cur_li__iter_v1 = reinterpret_cast<__gm__ float *>(all_cur_li__iter_v1_tensor->buffer.addr) +
+                                        all_cur_li__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: all_cur_mi__iter_v1
+    __gm__ Tensor *all_cur_mi__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *all_cur_mi__iter_v1 = reinterpret_cast<__gm__ float *>(all_cur_mi__iter_v1_tensor->buffer.addr) +
+                                        all_cur_mi__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: all_exp_padded__iter_v1
+    __gm__ Tensor *all_exp_padded__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ bfloat16_t *all_exp_padded__iter_v1 =
+        reinterpret_cast<__gm__ bfloat16_t *>(all_exp_padded__iter_v1_tensor->buffer.addr) +
+        all_exp_padded__iter_v1_tensor->start_offset;
+
+    // Unpack tensor: all_raw_scores__rv_v2
+    __gm__ Tensor *all_raw_scores__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ float *all_raw_scores__rv_v2 = reinterpret_cast<__gm__ float *>(all_raw_scores__rv_v2_tensor->buffer.addr) +
+                                          all_raw_scores__rv_v2_tensor->start_offset;
+
+    // Unpack scalar: sb_chunk__idx_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } sb_chunk__idx_v0_conv;
+    sb_chunk__idx_v0_conv.u64 = args[4];
+    int64_t sb_chunk__idx_v0 = sb_chunk__idx_v0_conv.val;
+
+    // Unpack scalar: ctx_blocks__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } ctx_blocks__ssa_v0_conv;
+    ctx_blocks__ssa_v0_conv.u64 = args[5];
+    int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val;
+
+    // Unpack scalar: ctx_len__ssa_v0
+    union {
+        uint64_t u64;
+        int32_t val;
+    } ctx_len__ssa_v0_conv;
+    ctx_len__ssa_v0_conv.u64 = args[6];
+    int32_t ctx_len__ssa_v0 = ctx_len__ssa_v0_conv.val;
+
+    // Forward to ptoas-generated function
+    softmax(
+        all_cur_li__iter_v1, all_cur_mi__iter_v1, all_exp_padded__iter_v1, all_raw_scores__rv_v2, sb_chunk__idx_v0,
+        ctx_blocks__ssa_v0, ctx_len__ssa_v0
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp
new file mode 100644
index 000000000..c4899e663
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Orchestration Function: qwen3_decode
+// Generated by PyPTO IR Compiler
+
+#include "runtime.h"
+#include <iostream>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "pto_orchestration_api.h"
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 20,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // External tensors
+    const Tensor &ext_hidden_states = orch_args.tensor(0).ref();
+    const Tensor &ext_input_rms_weight = orch_args.tensor(1).ref();
+    const Tensor &ext_wq = orch_args.tensor(2).ref();
+    const Tensor &ext_wk = orch_args.tensor(3).ref();
+    const Tensor &ext_wv = orch_args.tensor(4).ref();
+    const Tensor &ext_q_norm_weight = orch_args.tensor(5).ref();
+    const Tensor &ext_k_norm_weight = orch_args.tensor(6).ref();
+    const Tensor &ext_seq_lens = orch_args.tensor(7).ref();
+    const Tensor &ext_block_table = orch_args.tensor(8).ref();
+    const Tensor &ext_slot_mapping = orch_args.tensor(9).ref();
+    const Tensor &ext_rope_cos = orch_args.tensor(10).ref();
+    const Tensor &ext_rope_sin = orch_args.tensor(11).ref();
+    const Tensor &ext_k_cache = orch_args.tensor(12).ref();
+    const Tensor &ext_v_cache = orch_args.tensor(13).ref();
+    const Tensor &ext_wo = orch_args.tensor(14).ref();
+    const Tensor &ext_post_rms_weight = orch_args.tensor(15).ref();
+    const Tensor &ext_w_gate = orch_args.tensor(16).ref();
+    const Tensor &ext_w_up = orch_args.tensor(17).ref();
+    const Tensor &ext_w_down = orch_args.tensor(18).ref();
+    const Tensor &ext_out = orch_args.tensor(19).ref();
+
+    PTO2_SCOPE() {
+        uint32_t current_hidden_ci_shapes[2] = {16, 5120};
+        TensorCreateInfo current_hidden_ci(current_hidden_ci_shapes, 2, DataType::BFLOAT16);
+        uint32_t next_hidden_ci_shapes[2] = {16, 5120};
+        TensorCreateInfo next_hidden_ci(next_hidden_ci_shapes, 2, DataType::BFLOAT16);
+        uint32_t q_proj_ci_shapes[2] = {16, 5120};
+        TensorCreateInfo q_proj_ci(q_proj_ci_shapes, 2, DataType::FLOAT32);
+        uint32_t k_proj_ci_shapes[2] = {16, 1024};
+        TensorCreateInfo k_proj_ci(k_proj_ci_shapes, 2, DataType::FLOAT32);
+        uint32_t v_proj_ci_shapes[2] = {16, 1024};
+        TensorCreateInfo v_proj_ci(v_proj_ci_shapes, 2, DataType::FLOAT32);
+        uint32_t q_proj_norm_ci_shapes[2] = {16, 5120};
+        TensorCreateInfo q_proj_norm_ci(q_proj_norm_ci_shapes, 2, DataType::FLOAT32);
+        uint32_t k_proj_norm_ci_shapes[2] = {16, 1024};
+        TensorCreateInfo k_proj_norm_ci(k_proj_norm_ci_shapes, 2, DataType::FLOAT32);
+        uint32_t attn_out_ci_shapes[2] = {16, 5120};
+        TensorCreateInfo attn_out_ci(attn_out_ci_shapes, 2, DataType::BFLOAT16);
+        uint32_t all_q_padded_ci_shapes[2] = {2048, 128};
+        TensorCreateInfo all_q_padded_ci(all_q_padded_ci_shapes, 2, DataType::BFLOAT16);
+        TaskOutputTensors alloc_0 = alloc_tensors(
+            current_hidden_ci, next_hidden_ci, q_proj_ci, k_proj_ci, v_proj_ci, q_proj_norm_ci, k_proj_norm_ci,
+            attn_out_ci, all_q_padded_ci
+        );
+        const Tensor &current_hidden = alloc_0.get_ref(0);
+        const Tensor &next_hidden = alloc_0.get_ref(1);
+        const Tensor &q_proj = alloc_0.get_ref(2);
+        const Tensor &k_proj = alloc_0.get_ref(3);
+        const Tensor &v_proj = alloc_0.get_ref(4);
+        const Tensor &q_proj_norm = alloc_0.get_ref(5);
+        const Tensor &k_proj_norm = alloc_0.get_ref(6);
+        const Tensor &attn_out = alloc_0.get_ref(7);
+        const Tensor &all_q_padded = alloc_0.get_ref(8);
+        int64_t user_batch = (int64_t)orch_args.tensor(0).ref().shapes[0];
+        int64_t batch_padded = (((user_batch + 15) / 16) * 16);
+        for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) {
+            PTO2_SCOPE() {
+                int64_t cur_valid = std::min<int64_t>((user_batch - b0), 16);
+
+                // Task 0: copy_hidden
+                L0TaskArgs params_t0;
+                params_t0.add_output(current_hidden);
+                params_t0.add_input(ext_hidden_states);
+                params_t0.add_scalar(b0);
+                params_t0.add_scalar(cur_valid);
+                rt_submit_aiv_task(0, params_t0);
+                const Tensor &current_hidden__rv_v4 = current_hidden;
+            }
+        }
+        for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) {
+            PTO2_SCOPE() {
+                uint32_t normed_tile_ci_shapes[2] = {16, 5120};
+                TensorCreateInfo normed_tile_ci(normed_tile_ci_shapes, 2, DataType::BFLOAT16);
+                TaskOutputTensors alloc_1 = alloc_tensors(normed_tile_ci);
+                const Tensor &normed_tile = alloc_1.get_ref(0);
+                int64_t cur_valid__ssa_v1 = std::min<int64_t>((user_batch - b0), 16);
+
+                // Task 1: rmsnorm
+                L0TaskArgs params_t1;
+                params_t1.add_input(current_hidden);
+                params_t1.add_output(normed_tile);
+                params_t1.add_input(ext_input_rms_weight);
+                params_t1.add_scalar(b0);
+                params_t1.add_scalar(cur_valid__ssa_v1);
+                rt_submit_aiv_task(1, params_t1);
+                const Tensor &normed_tile__rv_v2 = normed_tile;
+                for (int64_t ob_chunk = 0; ob_chunk < 80; ob_chunk += 4) {
+                    PTO2_SCOPE() {
+                        // Task 2: q_proj
+                        L0TaskArgs params_t2;
+                        params_t2.add_output(q_proj);
+                        params_t2.add_input(normed_tile__rv_v2);
+                        params_t2.add_input(ext_wq);
+                        params_t2.add_scalar(ob_chunk);
+                        params_t2.add_scalar(b0);
+                        rt_submit_aic_task(2, params_t2);
+                        const Tensor &q_proj__rv_v6 = q_proj;
+                    }
+                }
+                for (int64_t ob_chunk = 0; ob_chunk < 16; ob_chunk += 4) {
+                    PTO2_SCOPE() {
+                        // Task 3: kv_proj
+                        L0TaskArgs params_t3;
+                        params_t3.add_output(k_proj);
+                        params_t3.add_output(v_proj);
+                        params_t3.add_input(normed_tile__rv_v2);
+                        params_t3.add_input(ext_wk);
+                        params_t3.add_input(ext_wv);
+                        params_t3.add_scalar(ob_chunk);
+                        params_t3.add_scalar(b0);
+                        rt_submit_aic_task(3, params_t3);
+                        const Tensor &k_proj__rv_v6 = k_proj;
+                        const Tensor &v_proj__rv_v6 = v_proj;
+                    }
+                }
+            }
+        }
+        for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) {
+            PTO2_SCOPE() {
+                // Task 4: qk_norm
+                L0TaskArgs params_t4;
+                params_t4.add_output(q_proj_norm);
+                params_t4.add_input(q_proj);
+                params_t4.add_input(ext_q_norm_weight);
+                params_t4.add_output(k_proj_norm);
+                params_t4.add_input(k_proj);
+                params_t4.add_input(ext_k_norm_weight);
+                params_t4.add_scalar(b0);
+                rt_submit_aiv_task(4, params_t4);
+                const Tensor &q_proj_norm__rv_v4 = q_proj_norm;
+                const Tensor &k_proj_norm__rv_v4 = k_proj_norm;
+            }
+        }
+
+        // Task 5: q_pad
+        L0TaskArgs params_t5;
+        params_t5.add_output(all_q_padded);
+        rt_submit_aiv_task(5, params_t5);
+        const Tensor &all_q_padded__rv_v2 = all_q_padded;
+        for (int64_t b = 0; b < user_batch; b += 1) {
+            PTO2_SCOPE() {
+                uint32_t attn_row_padded_ci_shapes[2] = {1, 16384};
+                TensorCreateInfo attn_row_padded_ci(attn_row_padded_ci_shapes, 2, DataType::BFLOAT16);
+                uint32_t all_raw_scores_ci_shapes[2] = {256, 256};
+                TensorCreateInfo all_raw_scores_ci(all_raw_scores_ci_shapes, 2, DataType::FLOAT32);
+                uint32_t all_exp_padded_ci_shapes[2] = {256, 256};
+                TensorCreateInfo all_exp_padded_ci(all_exp_padded_ci_shapes, 2, DataType::BFLOAT16);
+                uint32_t all_oi_tmp_ci_shapes[2] = {256, 128};
+                TensorCreateInfo all_oi_tmp_ci(all_oi_tmp_ci_shapes, 2, DataType::FLOAT32);
+                uint32_t all_cur_mi_ci_shapes[2] = {256, 1};
+                TensorCreateInfo all_cur_mi_ci(all_cur_mi_ci_shapes, 2, DataType::FLOAT32);
+                uint32_t all_cur_li_ci_shapes[2] = {256, 1};
+                TensorCreateInfo all_cur_li_ci(all_cur_li_ci_shapes, 2, DataType::FLOAT32);
+                TaskOutputTensors alloc_2 = alloc_tensors(
+                    attn_row_padded_ci, all_raw_scores_ci, all_exp_padded_ci, all_oi_tmp_ci, all_cur_mi_ci,
+                    all_cur_li_ci
+                );
+                const Tensor &attn_row_padded = alloc_2.get_ref(0);
+                const Tensor &all_raw_scores = alloc_2.get_ref(1);
+                const Tensor &all_exp_padded = alloc_2.get_ref(2);
+                const Tensor &all_oi_tmp = alloc_2.get_ref(3);
+                const Tensor &all_cur_mi = alloc_2.get_ref(4);
+                const Tensor &all_cur_li = alloc_2.get_ref(5);
+                size_t idx_ctx_len = b;
+                int32_t ctx_len = static_cast<int32_t *>(orch_args.tensor(7).ref().data_as<void>())[idx_ctx_len];
+                int64_t pos = (static_cast<int64_t>(ctx_len) - 1);
+                int64_t ctx_blocks = ((static_cast<int64_t>(ctx_len) + 255) / 256);
+                int64_t block_table_base = (b * 2);
+                size_t idx_slot = b;
+                int32_t slot = static_cast<int32_t *>(orch_args.tensor(9).ref().data_as<void>())[idx_slot];
+                int64_t slot_block = (static_cast<int64_t>(slot) / 256);
+                int64_t slot_offset = (static_cast<int64_t>(slot) - (slot_block * 256));
+                uint32_t cos_row_shapes[2] = {1, 128};
+                uint32_t cos_row_offsets[2] = {static_cast<uint32_t>(pos), 0};
+                Tensor cos_row = ext_rope_cos.view(cos_row_shapes, cos_row_offsets);
+                uint32_t sin_row_shapes[2] = {1, 128};
+                uint32_t sin_row_offsets[2] = {static_cast<uint32_t>(pos), 0};
+                Tensor sin_row = ext_rope_sin.view(sin_row_shapes, sin_row_offsets);
+                uint32_t cos_lo_shapes[2] = {1, 64};
+                uint32_t cos_lo_offsets[2] = {0, 0};
+                Tensor cos_lo = cos_row.view(cos_lo_shapes, cos_lo_offsets);
+                uint32_t cos_hi_shapes[2] = {1, 64};
+                uint32_t cos_hi_offsets[2] = {0, 64};
+                Tensor cos_hi = cos_row.view(cos_hi_shapes, cos_hi_offsets);
+                uint32_t sin_lo_shapes[2] = {1, 64};
+                uint32_t sin_lo_offsets[2] = {0, 0};
+                Tensor sin_lo = sin_row.view(sin_lo_shapes, sin_lo_offsets);
+                uint32_t sin_hi_shapes[2] = {1, 64};
+                uint32_t sin_hi_offsets[2] = {0, 64};
+                Tensor sin_hi = sin_row.view(sin_hi_shapes, sin_hi_offsets);
+                for (int64_t ki_chunk = 0; ki_chunk < 8; ki_chunk += 8) {
+                    PTO2_SCOPE() {
+                        // Task 6: rope_kv_cache
+                        L0TaskArgs params_t6;
+                        params_t6.add_inout(all_q_padded__rv_v2);
+                        params_t6.add_output(ext_k_cache);
+                        params_t6.add_output(ext_v_cache);
+                        params_t6.add_input(k_proj_norm);
+                        params_t6.add_input(cos_lo);
+                        params_t6.add_input(sin_lo);
+                        params_t6.add_input(cos_hi);
+                        params_t6.add_input(sin_hi);
+                        params_t6.add_input(v_proj);
+                        params_t6.add_input(q_proj_norm);
+                        params_t6.add_scalar(ki_chunk);
+                        params_t6.add_scalar(slot_block);
+                        params_t6.add_scalar(slot_offset);
+                        params_t6.add_scalar(b);
+                        rt_submit_aiv_task(6, params_t6);
+                        const Tensor &all_q_padded__rv_v9 = all_q_padded__rv_v2;
+                        const Tensor &k_cache__rv_v8 = ext_k_cache;
+                        const Tensor &v_cache__rv_v8 = ext_v_cache;
+                    }
+                }
+                uint32_t attn_row_shapes[2] = {1, 5120};
+                uint32_t attn_row_offsets[2] = {static_cast<uint32_t>(b), 0};
+                Tensor attn_row = attn_out.view(attn_row_shapes, attn_row_offsets);
+                for (int64_t sb_chunk = 0; sb_chunk < ctx_blocks; sb_chunk += 64) {
+                    PTO2_SCOPE() {
+                        // Task 7: qk_matmul
+                        L0TaskArgs params_t7;
+                        params_t7.add_output(all_raw_scores);
+                        params_t7.add_input(all_q_padded__rv_v2);
+                        params_t7.add_input(ext_block_table);
+                        params_t7.add_input(ext_k_cache);
+                        params_t7.add_scalar(b);
+                        params_t7.add_scalar(sb_chunk);
+                        params_t7.add_scalar(ctx_blocks);
+                        params_t7.add_scalar(block_table_base);
+                        rt_submit_aic_task(7, params_t7);
+                        const Tensor &all_raw_scores__rv_v4 = all_raw_scores;
+                    }
+                }
+                for (int64_t sb_chunk = 0; sb_chunk < ctx_blocks; sb_chunk += 64) {
+                    PTO2_SCOPE() {
+                        // Task 8: softmax
+                        L0TaskArgs params_t8;
+                        params_t8.add_output(all_cur_li);
+                        params_t8.add_output(all_cur_mi);
+                        params_t8.add_output(all_exp_padded);
+                        params_t8.add_input(all_raw_scores);
+                        params_t8.add_scalar(sb_chunk);
+                        params_t8.add_scalar(ctx_blocks);
+                        params_t8.add_scalar(ctx_len);
+                        rt_submit_aiv_task(8, params_t8);
+                        const Tensor &all_cur_li__rv_v4 = all_cur_li;
+                        const Tensor &all_cur_mi__rv_v4 = all_cur_mi;
+                        const Tensor &all_exp_padded__rv_v4 = all_exp_padded;
+                    }
+                }
+                for (int64_t sb_chunk = 0; sb_chunk < ctx_blocks; sb_chunk += 64) {
+                    PTO2_SCOPE() {
+                        // Task 9: sv_matmul
+                        L0TaskArgs params_t9;
+                        params_t9.add_output(all_oi_tmp);
+                        params_t9.add_input(ext_block_table);
+                        params_t9.add_input(all_exp_padded);
+                        params_t9.add_input(ext_v_cache);
+                        params_t9.add_scalar(sb_chunk);
+                        params_t9.add_scalar(ctx_blocks);
+                        params_t9.add_scalar(block_table_base);
+                        rt_submit_aic_task(9, params_t9);
+                        const Tensor &all_oi_tmp__rv_v4 = all_oi_tmp;
+                    }
+                }
+
+                // Task 10: online_softmax
+                L0TaskArgs params_t10;
+                params_t10.add_output(attn_row_padded);
+                params_t10.add_input(all_oi_tmp);
+                params_t10.add_input(all_cur_mi);
+                params_t10.add_input(all_cur_li);
+                params_t10.add_scalar(ctx_blocks);
+                rt_submit_aiv_task(10, params_t10);
+                const Tensor &attn_row_padded__rv_v2 = attn_row_padded;
+
+                // Task 11: attention_writeback
+                L0TaskArgs params_t11;
+                params_t11.add_output(attn_row);
+                params_t11.add_input(attn_row_padded__rv_v2);
+                rt_submit_aiv_task(11, params_t11);
+            }
+        }
+        for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) {
+            PTO2_SCOPE() {
+                uint32_t resid1_tile_ci_shapes[2] = {16, 5120};
+                TensorCreateInfo resid1_tile_ci(resid1_tile_ci_shapes, 2, DataType::FLOAT32);
+                uint32_t post_norm_tile_ci_shapes[2] = {16, 5120};
+                TensorCreateInfo post_norm_tile_ci(post_norm_tile_ci_shapes, 2, DataType::BFLOAT16);
+                uint32_t mlp_tile_ci_shapes[2] = {16, 17408};
+                TensorCreateInfo mlp_tile_ci(mlp_tile_ci_shapes, 2, DataType::BFLOAT16);
+                TaskOutputTensors alloc_3 = alloc_tensors(resid1_tile_ci, post_norm_tile_ci, mlp_tile_ci);
+                const Tensor &resid1_tile = alloc_3.get_ref(0);
+                const Tensor &post_norm_tile = alloc_3.get_ref(1);
+                const Tensor &mlp_tile = alloc_3.get_ref(2);
+                int64_t cur_valid__ssa_v2 = std::min<int64_t>((user_batch - b0), 16);
+                for (int64_t ob = 0; ob < 80; ob += 1) {
+                    PTO2_SCOPE() {
+                        uint32_t ret0__out_ci_shapes[2] = {16, 64};
+                        TensorCreateInfo ret0__out_ci(ret0__out_ci_shapes, 2, DataType::FLOAT32);
+                        TaskOutputTensors alloc_4 = alloc_tensors(ret0__out_ci);
+                        const Tensor &ret0__out = alloc_4.get_ref(0);
+                        int64_t o0 = (ob * 64);
+
+                        // Task 12: out_proj
+                        L0TaskArgs params_t12;
+                        params_t12.add_input(attn_out);
+                        params_t12.add_input(ext_wo);
+                        params_t12.add_inout(ret0__out);
+                        params_t12.add_scalar(b0);
+                        params_t12.add_scalar(o0);
+                        rt_submit_aic_task(12, params_t12);
+                        const Tensor &o_acc = ret0__out;
+
+                        // Task 13: out_proj_residual
+                        L0TaskArgs params_t13;
+                        params_t13.add_input(current_hidden);
+                        params_t13.add_input(o_acc);
+                        params_t13.add_inout(resid1_tile);
+                        params_t13.add_scalar(b0);
+                        params_t13.add_scalar(o0);
+                        params_t13.add_scalar(cur_valid__ssa_v2);
+                        rt_submit_aiv_task(13, params_t13);
+                        const Tensor &resid1_tile__ssa_v3 = resid1_tile;
+                    }
+                }
+
+                // Task 14: post_rmsnorm
+                L0TaskArgs params_t14;
+                params_t14.add_input(resid1_tile);
+                params_t14.add_output(post_norm_tile);
+                params_t14.add_input(ext_post_rms_weight);
+                rt_submit_aiv_task(14, params_t14);
+                const Tensor &post_norm_tile__rv_v2 = post_norm_tile;
+                for (int64_t ob = 0; ob < 68; ob += 1) {
+                    PTO2_SCOPE() {
+                        uint32_t ret0__out_1_ci_shapes[2] = {16, 256};
+                        TensorCreateInfo ret0__out_1_ci(ret0__out_1_ci_shapes, 2, DataType::FLOAT32);
+                        uint32_t ret0__out_2_ci_shapes[2] = {16, 256};
+                        TensorCreateInfo ret0__out_2_ci(ret0__out_2_ci_shapes, 2, DataType::FLOAT32);
+                        TaskOutputTensors alloc_5 = alloc_tensors(ret0__out_1_ci, ret0__out_2_ci);
+                        const Tensor &ret0__out_1 = alloc_5.get_ref(0);
+                        const Tensor &ret0__out_2 = alloc_5.get_ref(1);
+                        int64_t o0__ssa_v1 = (ob * 256);
+
+                        // Task 15: gate_proj
+                        L0TaskArgs params_t15;
+                        params_t15.add_input(post_norm_tile__rv_v2);
+                        params_t15.add_input(ext_w_gate);
+                        params_t15.add_inout(ret0__out_1);
+                        params_t15.add_scalar(o0__ssa_v1);
+                        rt_submit_aic_task(15, params_t15);
+                        const Tensor &gate_acc = ret0__out_1;
+
+                        // Task 16: up_proj
+                        L0TaskArgs params_t16;
+                        params_t16.add_input(post_norm_tile__rv_v2);
+                        params_t16.add_input(ext_w_up);
+                        params_t16.add_inout(ret0__out_2);
+                        params_t16.add_scalar(o0__ssa_v1);
+                        rt_submit_aic_task(16, params_t16);
+                        const Tensor &up_acc = ret0__out_2;
+
+                        // Task 17: silu
+                        L0TaskArgs params_t17;
+                        params_t17.add_input(gate_acc);
+                        params_t17.add_input(up_acc);
+                        params_t17.add_inout(mlp_tile);
+                        params_t17.add_scalar(o0__ssa_v1);
+                        rt_submit_aiv_task(17, params_t17);
+                        const Tensor &mlp_tile__ssa_v3 = mlp_tile;
+                    }
+                }
+                for (int64_t dob = 0; dob < 40; dob += 1) {
+                    PTO2_SCOPE() {
+                        uint32_t fp32_chunk_gm_ci_shapes[2] = {16, 128};
+                        TensorCreateInfo fp32_chunk_gm_ci(fp32_chunk_gm_ci_shapes, 2, DataType::FLOAT32);
+                        TaskOutputTensors alloc_6 = alloc_tensors(fp32_chunk_gm_ci);
+                        const Tensor &fp32_chunk_gm = alloc_6.get_ref(0);
+                        int64_t d0 = (dob * 128);
+
+                        // Task 18: down_proj
+                        L0TaskArgs params_t18;
+                        params_t18.add_input(mlp_tile);
+                        params_t18.add_input(ext_w_down);
+                        params_t18.add_inout(fp32_chunk_gm);
+                        params_t18.add_scalar(d0);
+                        rt_submit_aic_task(18, params_t18);
+                        const Tensor &fp32_chunk_gm__ssa_v1 = fp32_chunk_gm;
+
+                        // Task 19: down_proj_residual
+                        L0TaskArgs params_t19;
+                        params_t19.add_input(fp32_chunk_gm__ssa_v1);
+                        params_t19.add_input(resid1_tile);
+                        params_t19.add_inout(next_hidden);
+                        params_t19.add_scalar(d0);
+                        params_t19.add_scalar(b0);
+                        rt_submit_aiv_task(19, params_t19);
+                        const Tensor &next_hidden__ssa_v5 = next_hidden;
+                    }
+                }
+            }
+        }
+        Tensor current_hidden__ssa_v8 = next_hidden;
+        for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) {
+            PTO2_SCOPE() {
+                int64_t cur_valid__ssa_v3 = std::min<int64_t>((user_batch - b0), 16);
+
+                // Task 20: copy_out
+                L0TaskArgs params_t20;
+                params_t20.add_output(ext_out);
+                params_t20.add_input(current_hidden__ssa_v8);
+                params_t20.add_scalar(b0);
+                params_t20.add_scalar(cur_valid__ssa_v3);
+                rt_submit_aiv_task(20, params_t20);
+                const Tensor &out = ext_out;
+            }
+        }
+    }
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py
new file mode 100644
index 000000000..8ef7c8b34
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Qwen3-14B single-layer decode — tensormap_and_ringbuffer SceneTestCase.
+
+A single fused decode step (21 kernels: 8 AIC + 13 AIV) covering
+RMSNorm → QKV → per-head Q/K RMS → RoPE → paged KV-cache write → paged
+attention (online softmax) → output projection + residual → post-RMSNorm
+→ SwiGLU FFN → down-proj + residual, against the production Qwen3-14B
+hidden/intermediate/head shapes (HIDDEN=5120, INTERMEDIATE=17408,
+NUM_HEADS=40 / NUM_KV_HEADS=8, HEAD_DIM=128, BLOCK_SIZE=256).
+"""
+
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, scene_test
+from simpler_setup.goldens.qwen3_14b_decode import (
+    compute_golden as _decode_golden,
+)
+from simpler_setup.goldens.qwen3_14b_decode import (
+    generate_inputs as _decode_generate_inputs,
+)
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestQwen314BDecode(SceneTestCase):
+    """Single-layer Qwen3-14B decode against a torch reference."""
+
+    # Bf16 deep-transformer drift over 21 kernels in series — paged attention
+    # plus FFN accumulate, so values O(10) settle in the ~1e-1 absolute range.
+    RTOL = 5e-2
+    ATOL = 1e-1
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/qwen3_decode.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [
+                D.IN,  # 0  hidden_states
+                D.IN,  # 1  input_rms_weight
+                D.IN,  # 2  wq
+                D.IN,  # 3  wk
+                D.IN,  # 4  wv
+                D.IN,  # 5  q_norm_weight
+                D.IN,  # 6  k_norm_weight
+                D.IN,  # 7  seq_lens
+                D.IN,  # 8  block_table
+                D.IN,  # 9  slot_mapping
+                D.IN,  # 10 rope_cos
+                D.IN,  # 11 rope_sin
+                D.INOUT,  # 12 k_cache
+                D.INOUT,  # 13 v_cache
+                D.IN,  # 14 wo
+                D.IN,  # 15 post_rms_weight
+                D.IN,  # 16 w_gate
+                D.IN,  # 17 w_up
+                D.IN,  # 18 w_down
+                D.OUT,  # 19 out
+            ],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "copy_hidden",
+                "source": "kernels/aiv/copy_hidden.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT, D.IN, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 1,
+                "name": "rmsnorm",
+                "source": "kernels/aiv/rmsnorm.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.IN, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 2,
+                "name": "q_proj",
+                "source": "kernels/aic/q_proj.cpp",
+                "core_type": "aic",
+                "signature": [D.OUT, D.IN, D.IN, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 3,
+                "name": "kv_proj",
+                "source": "kernels/aic/kv_proj.cpp",
+                "core_type": "aic",
+                "signature": [D.OUT, D.OUT, D.IN, D.IN, D.IN, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 4,
+                "name": "qk_norm",
+                "source": "kernels/aiv/qk_norm.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT, D.IN, D.IN, D.OUT, D.IN, D.IN, D.SCALAR],
+            },
+            {
+                "func_id": 5,
+                "name": "q_pad",
+                "source": "kernels/aiv/q_pad.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT],
+            },
+            {
+                "func_id": 6,
+                "name": "rope_kv_cache",
+                "source": "kernels/aiv/rope_kv_cache.cpp",
+                "core_type": "aiv",
+                "signature": [
+                    D.INOUT,
+                    D.OUT,
+                    D.OUT,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.SCALAR,
+                    D.SCALAR,
+                    D.SCALAR,
+                    D.SCALAR,
+                ],
+            },
+            {
+                "func_id": 7,
+                "name": "qk_matmul",
+                "source": "kernels/aic/qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.OUT, D.IN, D.IN, D.IN, D.SCALAR, D.SCALAR, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 8,
+                "name": "softmax",
+                "source": "kernels/aiv/softmax.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT, D.OUT, D.OUT, D.IN, D.SCALAR, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 9,
+                "name": "sv_matmul",
+                "source": "kernels/aic/sv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.OUT, D.IN, D.IN, D.IN, D.SCALAR, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 10,
+                "name": "online_softmax",
+                "source": "kernels/aiv/online_softmax.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT, D.IN, D.IN, D.IN, D.SCALAR],
+            },
+            {
+                "func_id": 11,
+                "name": "attention_writeback",
+                "source": "kernels/aiv/attention_writeback.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT, D.IN],
+            },
+            {
+                "func_id": 12,
+                "name": "out_proj",
+                "source": "kernels/aic/out_proj.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 13,
+                "name": "out_proj_residual",
+                "source": "kernels/aiv/out_proj_residual.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 14,
+                "name": "post_rmsnorm",
+                "source": "kernels/aiv/post_rmsnorm.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.IN],
+            },
+            {
+                "func_id": 15,
+                "name": "gate_proj",
+                "source": "kernels/aic/gate_proj.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR],
+            },
+            {
+                "func_id": 16,
+                "name": "up_proj",
+                "source": "kernels/aic/up_proj.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR],
+            },
+            {
+                "func_id": 17,
+                "name": "silu",
+                "source": "kernels/aiv/silu.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR],
+            },
+            {
+                "func_id": 18,
+                "name": "down_proj",
+                "source": "kernels/aic/down_proj.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR],
+            },
+            {
+                "func_id": 19,
+                "name": "down_proj_residual",
+                "source": "kernels/aiv/down_proj_residual.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.INOUT, D.SCALAR, D.SCALAR],
+            },
+            {
+                "func_id": 20,
+                "name": "copy_out",
+                "source": "kernels/aiv/copy_out.cpp",
+                "core_type": "aiv",
+                "signature": [D.OUT, D.IN, D.SCALAR, D.SCALAR],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "SmallSingle",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"user_batch": 1, "seq_len": 8},
+        },
+    ]
+
+    def generate_args(self, params):
+        return _decode_generate_inputs(params["user_batch"], params["seq_len"])
+
+    def compute_golden(self, args, params):
+        _decode_golden(args, params["user_batch"], params["seq_len"])
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/runtime_overhead_test/test_runtime_overhead.py b/examples/a2a3/fully_distributed_within_core/runtime_overhead_test/test_runtime_overhead.py
new file mode 100644
index 000000000..7367d2d08
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/runtime_overhead_test/test_runtime_overhead.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Runtime overhead benchmark for the fully_distributed_within_core runtime.
+
+Goal: isolate the cost of *on-core orchestration + claim race + scheduling*
+(everything the distributed runtime does instead of an AICPU scheduler) from
+the cost of the kernels themselves, and see how that cost scales with the
+number of physical blocks (cores).
+
+Method:
+  * Reuse the ``benchmark_bgemm`` workload (same orchestration + GEMM/ADD
+    incores) — referenced directly, not duplicated.
+  * Set ``PTO_DIST_SKIP_EXEC=1`` so the engine skips every incore kernel call
+    and treats each (sub)task as 0-cost, while keeping all ownership/completion
+    bookkeeping. The wall clock then reflects orchestration/scheduling only.
+  * Sweep ``block_dim`` (1 block = 1 AIC + 2 AIV) and report the program wall
+    clock for each, so the relative overhead across core counts is visible.
+
+a2a3sim caps ``block_dim`` at PLATFORM_MAX_BLOCKDIM = 24 (24 AIC + 48 AIV = 72
+cores); 48 *blocks* is not representable (that 48 is the AIV-core count at the
+24-block max). The default sweep is the full ramp 1..24 (``--blocks 1-24``);
+pass an explicit list/range to narrow it (e.g. ``--blocks 1,2,12,24``).
+
+Run (standalone driver produces the comparison table)::
+
+    python test_runtime_overhead.py -p a2a3sim
+    python test_runtime_overhead.py -p a2a3sim --blocks 1,12,24 --rounds 5 --tasks 480
+    python test_runtime_overhead.py -p a2a3sim --exec        # include kernel work (baseline)
+    python test_runtime_overhead.py -p a2a3sim --bind node:0,1   # pin sim threads to NUMA nodes 0,1
+    python test_runtime_overhead.py -p a2a3sim --bind cpu:0-79    # pin to an explicit CPU range
+    # Confine the AICore working set to ONE NUMA node (1:1 thread->cpu) while
+    # auxiliary threads ride the wider --bind set; needs cores=block*3 <= node size:
+    python test_runtime_overhead.py -p a2a3sim --blocks 1-13 --bind node:1,2,3 --aicore-numa 2
+
+The class is also a valid SceneTestCase (cases marked manual), so the workload
+can be golden-checked the normal way with kernels enabled::
+
+    python test_runtime_overhead.py -p a2a3sim --case Blk24 --manual only
+"""
+
+import sys
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+# The bgemm incore/orchestration sources live in the sibling example; reference
+# them so this benchmark exercises exactly that workload without duplication.
+_BGEMM = "../benchmark_bgemm/kernels"
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestRuntimeOverhead(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{_BGEMM}/orchestration/bgemm_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.IN],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "GEMM",
+                "source": f"{_BGEMM}/aic/kernel_gemm_tile.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "ADD",
+                "source": f"{_BGEMM}/aiv/kernel_tile_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.INOUT, D.IN],
+            },
+        ],
+    }
+
+    # Cases for the normal (golden-checked, kernels-on) pytest path. All manual
+    # so the benchmark never slows the default suite; the headline artifact is
+    # the standalone comparison table below.
+    _BENCH_PARAMS = {"matmul_add_task_num": 1000, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2}
+    CASES = [
+        {
+            "name": "Blk1",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 1},
+            "params": _BENCH_PARAMS,
+        },
+        {
+            "name": "Blk2",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 2},
+            "params": _BENCH_PARAMS,
+        },
+        {
+            "name": "Blk12",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 12},
+            "params": _BENCH_PARAMS,
+        },
+        {
+            "name": "Blk24",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": _BENCH_PARAMS,
+        },
+    ]
+
+    def generate_args(self, params):
+        tile_size = params["incore_data_size"]
+        incore_loop = params["incore_loop"]
+        grid_k = params["grid_k"]
+        num_groups = params["matmul_add_task_num"] // grid_k
+        A = torch.randn(num_groups, grid_k, incore_loop, tile_size, tile_size, dtype=torch.float32) * 0.01
+        B = torch.randn(num_groups, grid_k, incore_loop, tile_size, tile_size, dtype=torch.float32) * 0.01
+        C = torch.zeros(incore_loop * num_groups, tile_size, tile_size, dtype=torch.float32)
+        config = torch.tensor([tile_size, grid_k, num_groups, incore_loop], dtype=torch.int64)
+        return TaskArgsBuilder(
+            Tensor("A", A.flatten()), Tensor("B", B.flatten()), Tensor("C", C.flatten()), Tensor("config", config)
+        )
+
+    def compute_golden(self, args, params):
+        tile_size = params["incore_data_size"]
+        incore_loop = params["incore_loop"]
+        grid_k = params["grid_k"]
+        num_groups = params["matmul_add_task_num"] // grid_k
+        A = args.A.reshape(num_groups, grid_k, incore_loop, tile_size, tile_size)
+        B = args.B.reshape(num_groups, grid_k, incore_loop, tile_size, tile_size)
+        C = args.C.reshape(incore_loop * num_groups, tile_size, tile_size)
+        C[:] = 0.0
+        for group in range(num_groups):
+            for k_idx in range(grid_k):
+                for i in range(incore_loop):
+                    C[group * incore_loop + i] += torch.matmul(A[group, k_idx, i], B[group, k_idx, i])
+
+
+# ---------------------------------------------------------------------------
+# CPU-affinity (core-binding) control.
+#
+# The sim runs every AICore/AICPU "core" as a host std::thread; those threads
+# inherit the launching process's CPU-affinity mask, so binding the Python
+# process here pins the whole simulation without external numactl/taskset (and
+# without numactl --membind, whose memory pinning starved allocations and added
+# noise). Threads are created lazily inside worker.run(), so applying the mask
+# before the first run is sufficient for all of them.
+# ---------------------------------------------------------------------------
+
+
+def _parse_cpu_list(spec):
+    """Parse a CPU list like '0-3,8,10-12' into a set of ints."""
+    cpus = set()
+    for tok in spec.split(","):
+        tok = tok.strip()
+        if not tok:
+            continue
+        if "-" in tok:
+            lo, hi = (int(v) for v in tok.split("-", 1))
+            cpus.update(range(lo, hi + 1))
+        else:
+            cpus.add(int(tok))
+    return cpus
+
+
+def _node_cpus(nodes_spec):
+    """Union the online CPUs of the given NUMA node(s), e.g. '0,1'."""
+    cpus = set()
+    for node in _parse_cpu_list(nodes_spec):
+        path = f"/sys/devices/system/node/node{node}/cpulist"
+        with open(path) as f:  # noqa: PTH123
+            cpus |= _parse_cpu_list(f.read().strip())
+    return cpus
+
+
+def _apply_cpu_binding(bind):
+    """Apply a core-binding strategy to this process; return the bound cpu set.
+
+    Strategies (``--bind``):
+      * ``none``                : no pinning (sim threads float over all CPUs).
+      * ``node:<nodes>``        : pin to all CPUs of the given NUMA node(s),
+                                  e.g. ``node:0`` or ``node:0,1``.
+      * ``cpu:<list>`` / ``<list>`` : pin to an explicit CPU list/range,
+                                  e.g. ``cpu:0-79`` or ``0,1,2``.
+    """
+    import os  # noqa: PLC0415
+
+    spec = (bind or "none").strip()
+    online = os.sched_getaffinity(0) if hasattr(os, "sched_getaffinity") else None
+
+    if spec.lower() in ("", "none"):
+        cpus = online
+    elif spec.lower().startswith("node:"):
+        cpus = _node_cpus(spec[len("node:") :])
+    elif spec.lower().startswith("cpu:"):
+        cpus = _parse_cpu_list(spec[len("cpu:") :])
+    else:
+        cpus = _parse_cpu_list(spec)
+
+    if spec.lower() not in ("", "none") and cpus:
+        if not hasattr(os, "sched_setaffinity"):
+            raise RuntimeError("os.sched_setaffinity unavailable on this platform")
+        os.sched_setaffinity(0, cpus)
+        cpus = os.sched_getaffinity(0)  # echo back what the OS actually accepted
+
+    n = len(cpus) if cpus else 0
+    print(f"CPU binding: strategy='{spec}' -> {n} physical cores" + (f" {sorted(cpus)}" if cpus and n <= 32 else ""))
+    return cpus
+
+
+# ---------------------------------------------------------------------------
+# Standalone comparison driver: sweep block_dim, print a wall-clock table.
+# ---------------------------------------------------------------------------
+
+
+def _bench(platform, block_dims, params, rounds, skip_exec, warmup, device):
+    """Run the workload once per block_dim and return per-config timings (us)."""
+    import os  # noqa: PLC0415
+    import statistics  # noqa: PLC0415
+    import time  # noqa: PLC0415
+    from pathlib import Path  # noqa: PLC0415
+
+    from simpler_setup.scene_test import _build_chip_task_args, _resolve_callable_paths  # noqa: PLC0415
+
+    # Engine reads PTO_DIST_SKIP_EXEC at dist_engine_register (once per run()).
+    if skip_exec:
+        os.environ["PTO_DIST_SKIP_EXEC"] = "1"
+    else:
+        os.environ.pop("PTO_DIST_SKIP_EXEC", None)
+
+    # The standalone path skips scene_test's per-class setup, so resolve the
+    # (relative) bgemm kernel sources against this file's directory ourselves.
+    _resolve_callable_paths(TestRuntimeOverhead, Path(__file__).parent)
+
+    inst = TestRuntimeOverhead()
+    orch_sig = TestRuntimeOverhead.CALLABLE["orchestration"]["signature"]
+
+    worker = TestRuntimeOverhead._create_worker(platform, device)
+    results = []
+    try:
+        callable_obj = inst.build_callable(platform)
+        handle = worker.register(callable_obj)
+
+        for bd in block_dims:
+            cfg = inst._build_config({"aicpu_thread_num": 4, "block_dim": bd})
+
+            # Build args/chip_args once per block_dim (data content is irrelevant
+            # to orchestration/scheduling timing, and skip-exec never reads it).
+            # Hoisting it out of the timed loop keeps large --tasks sweeps fast:
+            # otherwise every round re-runs torch.randn over multi-GB tensors.
+            args = inst.generate_args(params)
+            chip_args, _ = _build_chip_task_args(args, orch_sig)
+
+            def _one_run():
+                t0 = time.perf_counter()
+                timing = worker.run(handle, chip_args, config=cfg)
+                wall_us = (time.perf_counter() - t0) * 1e6
+                dev_us = float(getattr(timing, "device_wall_us", 0.0) or 0.0)
+                host_us = float(getattr(timing, "host_wall_us", 0.0) or 0.0)
+                return wall_us, host_us, dev_us
+
+            for _ in range(warmup):
+                _one_run()
+            samples = [_one_run() for _ in range(rounds)]
+            wall = statistics.median(s[0] for s in samples)
+            host = statistics.median(s[1] for s in samples)
+            dev = statistics.median(s[2] for s in samples)
+            results.append((bd, wall, host, dev))
+            print(f"  block_dim={bd:>2} ({bd * 3:>3} cores): wall={wall / 1000:8.3f} ms  device={dev / 1000:8.3f} ms")
+    finally:
+        worker.close()
+    return results
+
+
+def _print_table(results, params, rounds, skip_exec, bind_spec="none", bind_ncores=0):
+    task_num = params["matmul_add_task_num"]
+    # bgemm submits one GEMM (1C) and one ADD (1V) per matmul-add unit.
+    total_tasks = task_num * 2
+    # The on-device orchestrator wall is the metric of interest: it is the pure
+    # on-core orchestration + claim race + scheduling cost. The host wall is
+    # dominated by fixed Python/sim-launch overhead and is shown only for context.
+    base_dev = results[0][3] if results else 0.0
+    mode = "skip-exec (orchestration/scheduling only)" if skip_exec else "with kernels"
+    # Echo the active core-binding so the table is self-describing (the bound
+    # physical-core count is the key axis when comparing pinned vs unpinned runs
+    # and over/under-subscription effects — see docs §6.3).
+    bind_str = f"unpinned ({bind_ncores} cores available)" if bind_spec in ("", "none") else (
+        f"{bind_ncores} physical cores (strategy='{bind_spec}')"
+    )
+    print()
+    print(f"Runtime overhead — fully_distributed_within_core [{mode}]")
+    print(f"workload=bgemm  matmul_add_task_num={task_num}  (~{total_tasks} tasks)  rounds={rounds} (median)")
+    print(f"cpu_bind={bind_str}")
+    print()
+    header = (
+        f"| {'blocks':>6} | {'cores':>5} | {'device (ms)':>11} | {'us/task':>8} "
+        f"| {'dev vs 1blk':>11} | {'host (ms)':>10} |"
+    )
+    sep = "|" + "-" * 8 + "|" + "-" * 7 + "|" + "-" * 13 + "|" + "-" * 10 + "|" + "-" * 13 + "|" + "-" * 12 + "|"
+    print(header)
+    print(sep)
+    for bd, wall, _host, dev in results:
+        ratio = (dev / base_dev) if base_dev > 0 else 0.0
+        us_task = dev / total_tasks if total_tasks else 0.0
+        print(
+            f"| {bd:>6} | {bd * 3:>5} | {dev / 1000:>11.3f} | {us_task:>8.2f} "
+            f"| {ratio:>10.2f}× | {wall / 1000:>10.3f} |"
+        )
+    print()
+    print("device (ms) = on-core orchestration + claim race + scheduling wall (PTO2 profiling).")
+    print("host (ms)   = Python wall incl. fixed sim-launch overhead (context only).")
+
+
+def main():
+    import argparse  # noqa: PLC0415
+
+    p = argparse.ArgumentParser(description="fully_distributed_within_core runtime-overhead benchmark")
+    p.add_argument("-p", "--platform", required=True)
+    p.add_argument("-d", "--device", type=int, default=0)
+    p.add_argument(
+        "--blocks",
+        default=None,
+        help="block_dim values: comma list and/or a-b ranges, e.g. '1,2,12,24' or '1-24' (a2a3sim max 24). "
+        "Default is platform-aware: macOS -> '1-4' (few physical cores), Linux -> '1-13'.",
+    )
+    p.add_argument("--rounds", type=int, default=5, help="timed rounds per config (median reported)")
+    p.add_argument("--warmup", type=int, default=1, help="untimed warmup rounds per config")
+    p.add_argument("--tasks", type=int, default=1000, help="matmul_add_task_num (total tasks = 2x; batch)")
+    p.add_argument("--data-size", type=int, default=128, help="incore tile shape (NxN)")
+    p.add_argument("--loop", type=int, default=4)
+    p.add_argument("--grid-k", type=int, default=2)
+    p.add_argument("--exec", action="store_true", help="actually run kernels (default: skip for overhead isolation)")
+    p.add_argument(
+        "--bind",
+        default="none",
+        help="CPU core-binding strategy: 'none' | 'node:<nodes>' (e.g. node:0,1) | "
+        "'cpu:<list>' or bare '<list>' (e.g. cpu:0-79). Pins all sim threads via "
+        "sched_setaffinity, no external numactl needed.",
+    )
+    p.add_argument(
+        "--aicore-numa",
+        type=int,
+        default=None,
+        help="Pin every AICore sim thread 1:1 into this single NUMA node (sets "
+        "PTO_SIM_AICORE_NUMA_NODE), keeping the AICore working set inside one node. "
+        "Use a node with >= cores (=block_dim*3) CPUs; combine with --bind on a few "
+        "idle nodes so auxiliary threads don't oversubscribe the AICore node.",
+    )
+    args = p.parse_args()
+
+    import os  # noqa: PLC0415
+
+    bound_cpus = _apply_cpu_binding(args.bind)
+    bind_ncores = len(bound_cpus) if bound_cpus else 0
+
+    if args.aicore_numa is not None:
+        os.environ["PTO_SIM_AICORE_NUMA_NODE"] = str(args.aicore_numa)
+        node_cpus = sorted(_node_cpus(str(args.aicore_numa)))
+        print(
+            f"AICore pinning: every AICore thread -> NUMA node {args.aicore_numa} "
+            f"({len(node_cpus)} cpus: {min(node_cpus)}..{max(node_cpus)}), 1:1 exclusive"
+        )
+    else:
+        os.environ.pop("PTO_SIM_AICORE_NUMA_NODE", None)
+
+    # Platform-aware default block sweep: macOS hosts have few physical cores
+    # (heavy oversubscription past a couple of blocks makes the wall meaningless),
+    # so default to '1-4'; Linux dev boxes default to '1-13'.
+    blocks_spec = args.blocks
+    if blocks_spec is None:
+        blocks_spec = "1-4" if sys.platform == "darwin" else "1-13"
+        print(f"--blocks not given; using platform default '{blocks_spec}' ({sys.platform}).")
+
+    block_dims = []
+    for tok in blocks_spec.split(","):
+        tok = tok.strip()
+        if not tok:
+            continue
+        if "-" in tok:
+            lo, hi = (int(v) for v in tok.split("-", 1))
+            block_dims.extend(range(lo, hi + 1))
+        else:
+            block_dims.append(int(tok))
+    params = {
+        "matmul_add_task_num": args.tasks,
+        "incore_data_size": args.data_size,
+        "incore_loop": args.loop,
+        "grid_k": args.grid_k,
+    }
+    skip_exec = not args.exec
+    print(f"Benchmarking block_dims={block_dims} on {args.platform} (skip_exec={skip_exec}) ...")
+    results = _bench(args.platform, block_dims, params, args.rounds, skip_exec, args.warmup, args.device)
+    _print_table(results, params, args.rounds, skip_exec, args.bind, bind_ncores)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp
new file mode 100644
index 000000000..8a119554d
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Element-wise Tensor Addition Kernel
+ *
+ * Implements: out[i] = src0[i] + src1[i]
+ *
+ * This kernel performs element-wise addition of two tensors. It's compiled
+ * separately as a standalone kernel and linked with the dispatcher using
+ * function pointers, demonstrating the separation pattern used in production
+ * systems where kernel binaries are loaded dynamically.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+/**
+ * Element-wise addition kernel implementation
+ *
+ * Unified signature: all arguments passed via int64_t array
+ * @param args  Argument array:
+ *              args[0] = src0 pointer (first input tensor)
+ *              args[1] = src1 pointer (second input tensor)
+ *              args[2] = out pointer (output tensor)
+ *              args[3] = size (number of elements)
+ */
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack arguments (Tensor* pointers from runtime)
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // Configuration: float, 128, 128, 128, 128
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(vRows, vCols);
+    TileData src1Tile(vRows, vCols);
+    TileData dstTile(vRows, vCols);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+
+    pipe_sync();
+}
diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp
new file mode 100644
index 000000000..8187197c4
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * No-op Kernel
+ *
+ * Empty kernel used to trigger runtime allocation for tensors passed
+ * as OUTPUT/INOUT via add_inout(). The runtime allocates HeapRing memory
+ * and writes initial values before dispatching this task; the kernel
+ * itself does not read or modify any data.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { (void)args; }
diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp
new file mode 100644
index 000000000..a0a8ed7d8
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Scalar Data Dependency Test Orchestration
+ *
+ * End-to-end test for get_tensor_data, set_tensor_data, and add_inout
+ * with runtime-created outputs and initial value support.
+ *
+ * Flow:
+ *   1. c = a + b           (kernel_add, runtime-created tensor)
+ *   2. get_tensor_data(c, {0})   → check[0] = 2.0
+ *   3. get_tensor_data(c, {100}) → check[1] = 102.0
+ *   4. scalar_tensor = add_output(TensorCreateInfo, 77.0f), submit noop
+ *   5. get_tensor_data(scalar_tensor, {0}) → check[2] = 77.0
+ *   6. add_inout(scalar_tensor) (INOUT path), submit noop
+ *   7. get_tensor_data(scalar_tensor, {0}) → check[3] = 77.0
+ *   8. check[4] = 2.0 + 77.0 = 79.0  (orchestration arithmetic)
+ *   9. set_tensor_data(scalar_tensor, {0}, 42.0), get_tensor_data → check[5] = 42.0
+ *  10. Orch set_tensor_data(d, {0}, 10.0) → kernel_add(d, a) → check[6] = 12.0
+ *  11. WAW+WAR: kernel_add reads c → set_tensor_data(c, 88.0) auto-waits → check[7] = 88.0
+ *  12. External WAR with INOUT: noop(ext_b as INOUT) → set_tensor_data(ext_b) → check[8] = 55.0
+ *  13. result = a + b      (kernel_add, external output via INOUT)
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_ADD 0
+#define FUNC_NOOP 1
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 4,  // a, b, result, check
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // External tensors from golden.py
+    const Tensor &ext_a = orch_args.tensor(0).ref();
+    const Tensor &ext_b = orch_args.tensor(1).ref();
+    const Tensor &ext_result = orch_args.tensor(2).ref();
+    const Tensor &ext_check = orch_args.tensor(3).ref();
+
+    uint32_t SIZE = orch_args.tensor(0).ref().shapes[0];
+    LOG_INFO_V0("scalar_data_test: SIZE=%u, check_size=%u", SIZE, orch_args.tensor(3).ref().shapes[0]);
+
+    uint32_t inter_shapes[1] = {SIZE};
+    TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32);
+
+    // =========================================================
+    // Step 1: c = a + b (runtime-created tensor, kernel_add)
+    // =========================================================
+    L0TaskArgs params_c;
+    params_c.add_input(ext_a);
+    params_c.add_input(ext_b);
+    params_c.add_output(inter_ci);
+    TaskOutputTensors c_outs = rt_submit_aiv_task(FUNC_ADD, params_c);
+    const Tensor &c = c_outs.get_ref(0);
+
+    // =========================================================
+    // Step 2: get_tensor_data(c, {0}) → check[0]
+    //   Tests TensorMap lookup + spin-wait for kernel completion
+    // =========================================================
+    uint32_t idx[1] = {0};
+    float c0_val = get_tensor_data<float>(c, 1, idx);
+    LOG_INFO_V0("get_tensor_data(c, {0}) = %f (expected 2.0)", static_cast<double>(c0_val));
+
+    uint32_t check_idx[1] = {0};
+    set_tensor_data(ext_check, 1, check_idx, c0_val);
+
+    // =========================================================
+    // Step 3: get_tensor_data(c, {100}) → check[1]
+    //   Tests flat offset calculation for non-zero index
+    // =========================================================
+    idx[0] = 100;
+    float c100_val = get_tensor_data<float>(c, 1, idx);
+    LOG_INFO_V0("get_tensor_data(c, {100}) = %f (expected 102.0)", static_cast<double>(c100_val));
+
+    check_idx[0] = 1;
+    set_tensor_data(ext_check, 1, check_idx, c100_val);
+
+    // =========================================================
+    // Step 4: Runtime-created scalar output with initial value
+    //   Runtime allocates HeapRing buffer, writes 77.0 to element [0]
+    // =========================================================
+    uint32_t scalar_shapes[1] = {1};
+    TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32);
+    scalar_ci.set_initial_value(77.0f);
+    TaskOutputTensors scalar_alloc_outs = alloc_tensors(scalar_ci);
+    const Tensor &scalar_tensor = scalar_alloc_outs.get_ref(0);
+
+    // =========================================================
+    // Step 5: get_tensor_data(scalar_tensor, {0}) → check[2]
+    //   Verifies initial value was written correctly
+    // =========================================================
+    idx[0] = 0;
+    float s0_val = get_tensor_data<float>(scalar_tensor, 1, idx);
+    LOG_INFO_V0("get_tensor_data(scalar_tensor, {0}) after init = %f (expected 77.0)", static_cast<double>(s0_val));
+
+    check_idx[0] = 2;
+    set_tensor_data(ext_check, 1, check_idx, s0_val);
+
+    // =========================================================
+    // Step 6: add_inout(scalar_tensor) second use → INOUT path
+    //   Buffer already exists, so the noop just registers dependency
+    // =========================================================
+    {
+        L0TaskArgs args;
+        args.add_inout(scalar_tensor);
+        rt_submit_aiv_task(FUNC_NOOP, args);
+    }
+
+    // =========================================================
+    // Step 7: get_tensor_data(scalar_tensor, {0}) → check[3]
+    //   Value should be preserved (noop kernel didn't modify it)
+    // =========================================================
+    float s1_val = get_tensor_data<float>(scalar_tensor, 1, idx);
+    LOG_INFO_V0("get_tensor_data(scalar_tensor, {0}) after 2nd noop = %f (expected 77.0)", static_cast<double>(s1_val));
+
+    check_idx[0] = 3;
+    set_tensor_data(ext_check, 1, check_idx, s1_val);
+
+    // =========================================================
+    // Step 8: set_tensor_data with orchestration-computed value → check[4]
+    //   Tests set_tensor_data write + orchestration arithmetic
+    // =========================================================
+    float combined = c0_val + s0_val;  // 2.0 + 77.0 = 79.0
+    LOG_INFO_V0(
+        "Orchestration arithmetic: %f + %f = %f", static_cast<double>(c0_val), static_cast<double>(s0_val),
+        static_cast<double>(combined)
+    );  // NOLINT(whitespace/line_length)
+
+    check_idx[0] = 4;
+    set_tensor_data(ext_check, 1, check_idx, combined);
+
+    // =========================================================
+    // Step 9: Orch set→get round-trip on internal tensor
+    //   Validates that set_tensor_data writes are visible to get_tensor_data
+    //   on the same tensor. Uses scalar_tensor (currently 77.0), overwrites to 42.0.
+    // =========================================================
+    set_tensor_data(scalar_tensor, 1, idx, 42.0f);
+    float rw_val = get_tensor_data<float>(scalar_tensor, 1, idx);
+    LOG_INFO_V0("set_tensor_data→get_tensor_data round-trip = %f (expected 42.0)", static_cast<double>(rw_val));
+
+    check_idx[0] = 5;
+    set_tensor_data(ext_check, 1, check_idx, rw_val);
+
+    // =========================================================
+    // Step 10: Orch→AICore RAW (set_tensor_data → kernel reads)
+    //   Orchestration writes d[0]=10.0 via set_tensor_data, then
+    //   kernel_add reads d as input: e[0] = d[0] + a[0] = 12.0
+    // =========================================================
+    TaskOutputTensors d_alloc_outs = alloc_tensors(inter_ci);
+    const Tensor &d = d_alloc_outs.get_ref(0);
+
+    idx[0] = 0;
+    set_tensor_data(d, 1, idx, 10.0f);
+
+    L0TaskArgs params_e;
+    params_e.add_input(d);
+    params_e.add_input(ext_a);
+    params_e.add_output(inter_ci);
+    TaskOutputTensors e_outs = rt_submit_aiv_task(FUNC_ADD, params_e);
+    const Tensor &e = e_outs.get_ref(0);
+
+    float e0_val = get_tensor_data<float>(e, 1, idx);
+    LOG_INFO_V0("Orch→AICore RAW: e[0] = %f (expected 12.0)", static_cast<double>(e0_val));
+
+    check_idx[0] = 6;
+    set_tensor_data(ext_check, 1, check_idx, e0_val);
+
+    // =========================================================
+    // Step 11: WAW + WAR on internal tensor
+    //   c was written by Step 1 (kernel_add, TensorMap has producer entry).
+    //   Submit a new kernel that reads c as INPUT (creates consumer dep).
+    //   Then set_tensor_data(c) — no manual get_tensor_data sync.
+    //   set_tensor_data internally waits for:
+    //     - WAW: producer (Step 1) COMPLETED
+    //     - WAR: consumer (this kernel) done (fanout_refcount check)
+    //
+    //   NOTE on external tensors: ext_a was read by Step 1 as INPUT,
+    //   but TensorMap has no producer entry for ext_a (only consumers).
+    //   set_tensor_data(ext_a) would NOT detect the reader — data race.
+    //   To ensure WAR safety on external tensors, use add_inout()
+    //   instead of add_input() so TensorMap tracks the access chain.
+    // =========================================================
+    {
+        L0TaskArgs args;
+        args.add_input(c);
+        args.add_input(ext_b);
+        args.add_output(inter_ci);
+        (void)rt_submit_aiv_task(FUNC_ADD, args);  // NOLINT(readability/casting)
+    }
+
+    // set_tensor_data auto-waits for producer + consumer before writing
+    idx[0] = 0;
+    set_tensor_data(c, 1, idx, 88.0f);
+    float waw_val = get_tensor_data<float>(c, 1, idx);
+    LOG_INFO_V0("WAW+WAR: set_tensor_data(c, 88.0) after consumer = %f (expected 88.0)", static_cast<double>(waw_val));
+
+    check_idx[0] = 7;
+    set_tensor_data(ext_check, 1, check_idx, waw_val);
+
+    // =========================================================
+    // Step 12: External tensor WAR — must use add_output or add_inout, not add_input
+    //
+    //   For external tensors, using add_input() does NOT create a
+    //   TensorMap entry. set_tensor_data would then write immediately
+    //   without waiting for the reader kernel — a WAR data race.
+    //
+    //   Using add_output() (or add_inout()) creates a TensorMap entry,
+    //   enabling set_tensor_data to detect the producer via TensorMap lookup
+    //   and wait for fanout_refcount (all consumers done).
+    //
+    //   Here we submit noop with ext_b as write-only output (noop doesn't
+    //   read data), then set_tensor_data overwrites ext_b[0] = 55.0.
+    //   set_tensor_data auto-waits for the noop to complete.
+    // =========================================================
+    {
+        L0TaskArgs args;
+        args.add_output(ext_b);  // write-only: creates TensorMap entry (not add_input!)
+        rt_submit_aiv_task(FUNC_NOOP, args);
+    }
+
+    idx[0] = 0;
+    set_tensor_data(ext_b, 1, idx, 55.0f);
+    float ext_war_val = get_tensor_data<float>(ext_b, 1, idx);
+    LOG_INFO_V0(
+        "External WAR (INOUT): set_tensor_data(ext_b, 55.0) = %f (expected 55.0)", static_cast<double>(ext_war_val)
+    );
+
+    check_idx[0] = 8;
+    set_tensor_data(ext_check, 1, check_idx, ext_war_val);
+
+    // Restore ext_b[0] for final result comparison
+    set_tensor_data(ext_b, 1, idx, 0.0f);
+
+    // =========================================================
+    // Step 13: result = a + b (external output via add_output, kernel_add)
+    // =========================================================
+    {
+        L0TaskArgs args;
+        args.add_input(ext_a);
+        args.add_input(ext_b);
+        args.add_output(ext_result);
+        rt_submit_aiv_task(FUNC_ADD, args);
+    }
+
+    LOG_INFO_V0("scalar_data_test: orchestration complete");
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py b/examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py
new file mode 100644
index 000000000..4cce2af1d
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Scalar data dependency test: GetTensorData, SetTensorData, add_inout.
+
+Tests orchestration-level data manipulation: scalar initialization,
+Get/Set round-trips, WAW+WAR dependency auto-wait, and external tensor WAR.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestScalarData(SceneTestCase):
+    """Scalar data dependency: Get/SetTensorData, add_inout with initial value."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/scalar_data_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/kernel_noop.cpp",
+                "core_type": "aiv",
+                "signature": [],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.arange(SIZE, dtype=torch.float32)),
+            Tensor("result", torch.zeros(SIZE, dtype=torch.float32)),
+            Tensor("check", torch.zeros(10, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        # result = a + b (computed by kernel_add)
+        args.result[:] = args.a + args.b
+
+        # check values written by orchestration via SetTensorData
+        args.check[0] = 2.0  # GetTensorData(c, {0}): c = a + b, c[0] = 2.0+0.0
+        args.check[1] = 102.0  # GetTensorData(c, {100}): c[100] = 2.0+100.0
+        args.check[2] = 77.0  # runtime-created scalar output initialized to 77.0
+        args.check[3] = 77.0  # second noop via add_inout preserves the value
+        args.check[4] = 79.0  # orchestration arithmetic: 2.0 + 77.0
+        args.check[5] = 42.0  # Orch set->get round-trip: SetTensorData then GetTensorData
+        args.check[6] = 12.0  # Orch->AICore RAW: SetTensorData(d,10.0) + kernel_add(d,a) -> 10.0+2.0
+        args.check[7] = 88.0  # WAW+WAR: kernel reads c, SetTensorData(c,88.0) auto-waits
+        args.check[8] = 55.0  # External WAR: noop(ext_b INOUT) -> SetTensorData(ext_b,55.0) auto-waits
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp
new file mode 100644
index 000000000..ef56d934e
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *result_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+
+    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+    __gm__ float *result = reinterpret_cast<__gm__ float *>(result_tensor->buffer.addr) + result_tensor->start_offset;
+
+    constexpr int kTotalRows = 128;
+    constexpr int kRows = 64;
+    constexpr int kCols = 128;
+    constexpr int kIters = kTotalRows / kRows;
+    using DynShapeDim5 = Shape<1, 1, 1, kRows, kCols>;
+    using DynStrideDim5 = Stride<1, 1, 1, kCols, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStrideDim5>;
+    using TileData = Tile<TileType::Vec, float, kRows, kCols, BLayout::RowMajor, -1, -1>;
+
+    TileData src_tile(kRows, kCols);
+    TileData result_tile(kRows, kCols);
+    TASSIGN(src_tile, 0x0);
+    TASSIGN(result_tile, 0x10000);
+
+    constexpr int kChunkElems = kRows * kCols;
+    for (int iter = 0; iter < kIters; ++iter) {
+        GlobalData src_global(src + iter * kChunkElems);
+        GlobalData result_global(result + iter * kChunkElems);
+        TLOAD(src_tile, src_global);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        TADDS(result_tile, src_tile, 1.0f);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+
+        TSTORE(result_global, result_tile);
+        set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+        wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    }
+}
diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp
new file mode 100644
index 000000000..eb8b5aeb3
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include <pto/pto-inst.hpp>
+
+#include "backend/sdma/sdma_completion_kernel.h"
+#include "platform_comm/comm_context.h"
+#include "pto_async_kernel_api.h"
+#include "tensor.h"
+
+using namespace pto;
+
+template <typename T>
+static inline __aicore__ __gm__ T *comm_remote_ptr(__gm__ CommContext *ctx, __gm__ T *local_ptr, int peer_rank) {
+    uint64_t local_base = ctx->windowsIn[ctx->rankId];
+    uint64_t offset = reinterpret_cast<uint64_t>(local_ptr) - local_base;
+    return reinterpret_cast<__gm__ T *>(ctx->windowsIn[peer_rank] + offset);
+}
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *in_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ CommContext *comm_ctx = reinterpret_cast<__gm__ CommContext *>(args[2]);
+
+    __gm__ float *local_in = reinterpret_cast<__gm__ float *>(in_tensor->buffer.addr) + in_tensor->start_offset;
+    __gm__ float *local_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int rank = static_cast<int>(comm_ctx->rankId);
+    int nranks = static_cast<int>(comm_ctx->rankNum);
+    if (nranks != 2 || comm_ctx->workSpace == 0) {
+        pipe_barrier(PIPE_ALL);
+        return;
+    }
+    int peer_rank = 1 - rank;
+
+    constexpr int kElems = 128 * 128;
+    using FlatShape = Shape<1, 1, 1, 1, kElems>;
+    using FlatStride = Stride<kElems, kElems, kElems, kElems, 1>;
+    using GlobalData = GlobalTensor<float, FlatShape, FlatStride>;
+    using ScratchTile = Tile<TileType::Vec, uint8_t, 1, SDMA_SCRATCH_ALIGNMENT>;
+
+    __gm__ float *remote_in = comm_remote_ptr(comm_ctx, local_in, peer_rank);
+    GlobalData remote_global(remote_in);
+    GlobalData local_global(local_out);
+
+    ScratchTile scratch_tile;
+    TASSIGN(scratch_tile, 0x0);
+
+    AsyncCtx async_ctx = get_async_ctx(args);
+    send_request_entry(
+        async_ctx,
+        SdmaTget(local_global, remote_global, scratch_tile, reinterpret_cast<__gm__ uint8_t *>(comm_ctx->workSpace))
+    );
+}
diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp
new file mode 100644
index 000000000..a33c96730
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+
+#include "platform_comm/comm_context.h"
+#include "pto_orchestration_api.h"
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+sdma_async_completion_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{.expected_arg_count = 4};
+}
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    return sdma_async_completion_orchestration_config(orch_args);
+}
+
+__attribute__((visibility("default"))) void sdma_async_completion_orchestration(const L2TaskArgs &orch_args) {
+    if (orch_args.tensor_count() + orch_args.scalar_count() != 4) {
+        LOG_ERROR("sdma_async_completion_demo: expected 4 args");
+        return;
+    }
+
+    const Tensor &input = orch_args.tensor(0).ref();
+    const Tensor &out = orch_args.tensor(1).ref();
+    const Tensor &result = orch_args.tensor(2).ref();
+    auto *comm_ctx = reinterpret_cast<CommContext *>(static_cast<uintptr_t>(orch_args.scalar(0)));
+
+    L0TaskArgs producer_args;
+    producer_args.add_input(input);
+    producer_args.add_output(out);
+    producer_args.add_scalar(reinterpret_cast<uint64_t>(comm_ctx));
+    rt_submit_aiv_task(0, producer_args);
+
+    L0TaskArgs consumer_args;
+    consumer_args.add_input(out);
+    consumer_args.add_output(result);
+    rt_submit_aiv_task(1, consumer_args);
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py
new file mode 100644
index 000000000..f727d3a72
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SDMA deferred completion smoke test for onboard a2a3.
+
+Each rank stages its input inside the HCCL window.  The deferred producer
+TGET_ASYNCs the peer rank's input into local ``out`` and registers the PTO
+AsyncEvent through ``defer_pto_async_event``.  The consumer depends on the
+producer output and writes ``result = out + 1``.  Correct ``out`` and
+``result`` therefore validate both the SDMA completion polling and the
+deferred-release dependency path.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+
+import pytest
+import torch
+from simpler.task_interface import (
+    ArgDirection,
+    CallConfig,
+    ChipCallable,
+    CommBufferSpec,
+    CoreCallable,
+    DataType,
+    TaskArgs,
+    Tensor,
+    TensorArgType,
+)
+from simpler.worker import Worker
+
+from simpler_setup.elf_parser import extract_text_section
+from simpler_setup.kernel_compiler import KernelCompiler
+from simpler_setup.pto_isa import ensure_pto_isa_root
+from simpler_setup.torch_interop import make_tensor_arg
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+N = 128 * 128
+DTYPE_NBYTES = 4
+
+
+def parse_device_range(spec: str) -> list[int]:
+    if "," in spec:
+        return [int(x) for x in spec.split(",") if x]
+    if "-" in spec:
+        lo, hi = (int(x) for x in spec.split("-"))
+        return list(range(lo, hi + 1))
+    return [int(spec)]
+
+
+def build_chip_callable(platform: str, pto_isa_commit: str | None, clone_protocol: str) -> ChipCallable:
+    kc = KernelCompiler(platform=platform)
+    runtime = "fully_distributed_within_core"
+    pto_isa_root = ensure_pto_isa_root(commit=pto_isa_commit, clone_protocol=clone_protocol)
+    include_dirs = kc.get_orchestration_include_dirs(runtime)
+    extra_includes = list(include_dirs) + [str(kc.project_root / "src" / "common")]
+
+    children = []
+    for func_id, rel in [
+        (0, "kernels/aiv/kernel_sdma_tget_async.cpp"),
+        (1, "kernels/aiv/kernel_consumer.cpp"),
+    ]:
+        kernel = kc.compile_incore(
+            source_path=os.path.join(HERE, rel),
+            core_type="aiv",
+            pto_isa_root=pto_isa_root,
+            extra_include_dirs=extra_includes,
+        )
+        if not platform.endswith("sim"):
+            kernel = extract_text_section(kernel)
+        children.append(
+            (
+                func_id,
+                CoreCallable.build(
+                    signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN],
+                    binary=kernel,
+                ),
+            )
+        )
+
+    orch = kc.compile_orchestration(
+        runtime_name=runtime,
+        source_path=os.path.join(HERE, "kernels/orchestration/sdma_async_completion_orch.cpp"),
+        extra_include_dirs=[str(kc.project_root / "src" / "common")],
+    )
+    return ChipCallable.build(
+        signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN],
+        func_name="sdma_async_completion_orchestration",
+        binary=orch,
+        children=children,
+    )
+
+
+def run(
+    platform: str = "a2a3",
+    device_ids: list[int] | None = None,
+    pto_isa_commit: str | None = None,
+) -> int:
+    if device_ids is None:
+        device_ids = [0, 1]
+    nranks = len(device_ids)
+    if nranks != 2:
+        raise ValueError(f"sdma_async_completion_demo needs exactly 2 devices, got {device_ids}")
+    if platform.endswith("sim"):
+        raise ValueError("sdma_async_completion_demo requires onboard a2a3 hardware")
+
+    input_nbytes = N * DTYPE_NBYTES
+    window_size = max(input_nbytes, 4 * 1024)
+
+    # `inputs` must live in shared memory: `orch.copy_to` stages each rank's
+    # data into its HCCL window from the forked chip child, which reads `src`
+    # out of its own address space.
+    inputs = [
+        torch.tensor([float(rank * 1000 + (i % 251)) / 10.0 for i in range(N)], dtype=torch.float32).share_memory_()
+        for rank in range(nranks)
+    ]
+    out = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)]
+    result = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)]
+
+    chip_callable = build_chip_callable(platform, pto_isa_commit, "https")
+    worker = Worker(
+        level=3,
+        platform=platform,
+        runtime="fully_distributed_within_core",
+        device_ids=device_ids,
+        num_sub_workers=0,
+    )
+    chip_handle = worker.register(chip_callable)
+    try:
+        worker.init()
+
+        def orch_fn(orch, _args, cfg):
+            with orch.allocate_domain(
+                name="default",
+                workers=list(range(nranks)),
+                window_size=window_size,
+                buffers=[
+                    CommBufferSpec(name="input_window", dtype="float32", count=N, nbytes=input_nbytes),
+                ],
+            ) as handle:
+                # Stage every rank's input window before submitting any kernel:
+                # each producer TGET_ASYNCs the *peer* rank's window, so all
+                # windows must hold real data before execution begins.
+                for rank in range(nranks):
+                    orch.copy_to(
+                        rank,
+                        dst=handle[rank].buffer_ptrs["input_window"],
+                        src=inputs[rank].data_ptr(),
+                        size=input_nbytes,
+                    )
+                for rank in range(nranks):
+                    domain = handle[rank]
+                    args = TaskArgs()
+                    args.add_tensor(
+                        Tensor.make(
+                            data=domain.buffer_ptrs["input_window"],
+                            shapes=(N,),
+                            dtype=DataType.FLOAT32,
+                            child_memory=True,
+                        ),
+                        TensorArgType.INPUT,
+                    )
+                    args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING)
+                    args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING)
+                    args.add_scalar(domain.device_ctx)
+                    orch.submit_next_level(chip_handle, args, cfg, worker=rank)
+
+        worker.run(orch_fn, args=None, config=CallConfig())
+
+        ok = True
+        for rank in range(nranks):
+            peer = 1 - rank
+            expected_out = inputs[peer]
+            expected_result = expected_out + 1.0
+            max_out = float(torch.max(torch.abs(out[rank] - expected_out)))
+            max_result = float(torch.max(torch.abs(result[rank] - expected_result)))
+            print(f"[sdma_async_completion_demo] rank {rank}: max_out={max_out:.3e} max_result={max_result:.3e}")
+            ok = ok and max_out <= 1e-3 and max_result <= 1e-3
+        return 0 if ok else 1
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3"])
+@pytest.mark.runtime("fully_distributed_within_core")
+@pytest.mark.device_count(2)
+def test_sdma_async_completion_demo(st_device_ids, st_platform) -> None:
+    assert run(st_platform, [int(d) for d in st_device_ids]) == 0
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-p", "--platform", default="a2a3")
+    parser.add_argument("-d", "--device", default="0-1")
+    parser.add_argument("--pto-isa-commit", default=None)
+    args = parser.parse_args()
+    return run(args.platform, parse_device_range(args.device), args.pto_isa_commit)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp
new file mode 100644
index 000000000..8a119554d
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Element-wise Tensor Addition Kernel
+ *
+ * Implements: out[i] = src0[i] + src1[i]
+ *
+ * This kernel performs element-wise addition of two tensors. It's compiled
+ * separately as a standalone kernel and linked with the dispatcher using
+ * function pointers, demonstrating the separation pattern used in production
+ * systems where kernel binaries are loaded dynamically.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+/**
+ * Element-wise addition kernel implementation
+ *
+ * Unified signature: all arguments passed via int64_t array
+ * @param args  Argument array:
+ *              args[0] = src0 pointer (first input tensor)
+ *              args[1] = src1 pointer (second input tensor)
+ *              args[2] = out pointer (output tensor)
+ *              args[3] = size (number of elements)
+ */
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack arguments (Tensor* pointers from runtime)
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // Configuration: float, 128, 128, 128, 128
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(vRows, vCols);
+    TileData src1Tile(vRows, vCols);
+    TileData dstTile(vRows, vCols);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+
+    pipe_sync();
+}
diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp
new file mode 100644
index 000000000..42ec41bcc
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Scalar Addition Kernel
+ *
+ * Implements: out[i] = src[i] + scalar
+ *
+ * This kernel adds a scalar value to each element of a tensor. It's compiled
+ * separately as a standalone kernel and linked with the dispatcher using
+ * function pointers, demonstrating the separation pattern used in production
+ * systems where kernel binaries are loaded dynamically.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"  // NOLINT(build/include_subdir)
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+/**
+ * Scalar addition kernel implementation
+ *
+ * Unified signature: all arguments passed via int64_t array
+ * @param args  Argument array:
+ *              args[0] = src pointer (input tensor)
+ *              args[1] = out pointer (output tensor)
+ *              args[2] = scalar value (as uint64_t, needs conversion to float)
+ *              args[3] = size (number of elements)
+ */
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack arguments (Tensor* pointers from runtime)
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // Convert scalar from uint64_t to float
+    float scalar = from_u64<float>(static_cast<uint64_t>(args[2]));
+
+    // Configuration: float, 128, 128, 128, 128
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
+
+    TileData srcTile(vRows, vCols);
+    TileData dstTile(vRows, vCols);
+    TASSIGN(srcTile, 0x0);
+    TASSIGN(dstTile, 0x10000);
+
+    GlobalData srcGlobal(src);
+    GlobalData dstGlobal(out);
+
+    TLOAD(srcTile, srcGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADDS(dstTile, srcTile, scalar);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+
+    pipe_sync();
+}
diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp
new file mode 100644
index 000000000..d48c63e27
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Element-wise Tensor Multiplication Kernel
+ *
+ * Implements: out[i] = src0[i] * src1[i]
+ *
+ * This kernel performs element-wise multiplication of two tensors. It's
+ * compiled separately as a standalone kernel and linked with the dispatcher
+ * using function pointers, demonstrating the separation pattern used in
+ * production systems where kernel binaries are loaded dynamically.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+/**
+ * Element-wise multiplication kernel implementation
+ *
+ * Unified signature: all arguments passed via int64_t array
+ * @param args  Argument array:
+ *              args[0] = src0 pointer (first input tensor)
+ *              args[1] = src1 pointer (second input tensor)
+ *              args[2] = out pointer (output tensor)
+ *              args[3] = size (number of elements)
+ */
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack arguments (Tensor* pointers from runtime)
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // Configuration: float, 128, 128, 128, 128
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(vRows, vCols);
+    TileData src1Tile(vRows, vCols);
+    TileData dstTile(vRows, vCols);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TMUL(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+
+    pipe_sync();
+}
diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp
new file mode 100644
index 000000000..a4b865326
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Example: aicpu_orchestration_entry (device-side orchestration)
+ *
+ * DAG structure for formula: (a + b + 1)(a + b + 2) + (a + b)
+ *   t0: c = a + b     (func_id=0, kernel_add)       [outer scope]
+ *   t1: d = c + 1     (func_id=1, kernel_add_scalar) [inner scope]
+ *   t2: e = c + 2     (func_id=1, kernel_add_scalar) [inner scope]
+ *   t3: g = d * e     (func_id=2, kernel_mul)        [inner scope]
+ *   t4: f = g + c     (func_id=0, kernel_add)        [inner scope]
+ *   Dependencies: t0->t1, t0->t2, t1->t3, t2->t3, t0->t4, t3->t4
+ *
+ * Nested scope demonstration:
+ *   - Inner scope owns t1, t2, t3, t4; intermediates d, e, g release on inner scope end
+ *   - Outer scope owns t0; c persists across inner scope for t1, t2, t4
+ *   - c flows from outer to inner scope (outer-scope tensors are visible to inner scopes)
+ *
+ * This file compiles as a standalone .so with zero runtime link dependencies.
+ * All runtime calls go through the PTO2RuntimeOps function-pointer table.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+/**
+ * Orchestration config — the executor reads these values to set up
+ * shared memory and runtime before calling aicpu_orchestration_entry.
+ */
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,
+    };
+}
+
+/**
+ * Orchestration entry — runtime is bound implicitly by the framework.
+ * The executor wraps this call in PTO2_SCOPE, so we are already inside
+ * the outer scope on entry.
+ */
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // golden shape = kernel shape, use orch_args.tensor(i).ref() directly
+    const Tensor &ext_a = orch_args.tensor(0).ref();
+    const Tensor &ext_b = orch_args.tensor(1).ref();
+    const Tensor &ext_f = orch_args.tensor(2).ref();
+
+    uint32_t SIZE = orch_args.tensor(0).ref().shapes[0];
+    LOG_INFO_V0("===============SIZE=%u", SIZE);
+
+    uint32_t inter_shapes[1] = {SIZE};
+    TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32);
+
+    // t0: c = a + b (kernel_id=0, kernel_add) [outer scope]
+    L0TaskArgs params_t0;
+    params_t0.add_input(ext_a);
+    params_t0.add_input(ext_b);
+    params_t0.add_output(inter_ci);
+    TaskOutputTensors outs_t0 = rt_submit_aiv_task(0, params_t0);  // kernel_add
+    const Tensor &c = outs_t0.get_ref(0);
+
+    // Inner scope: owns t1, t2, t3, t4; intermediates d, e, g release on scope end.
+    // c flows in from outer scope (outer-scope tensors are visible to inner scopes).
+    PTO2_SCOPE() {
+        // t1: d = c + 1 (kernel_id=1, kernel_add_scalar)
+        L0TaskArgs params_t1;
+        params_t1.add_input(c);
+        params_t1.add_output(inter_ci);
+        params_t1.add_scalar(1.0f);
+        params_t1.add_scalar(3u);
+        TaskOutputTensors outs_t1 = rt_submit_aiv_task(1, params_t1);  // kernel_add_scalar
+        const Tensor &d = outs_t1.get_ref(0);
+
+        // t2: e = c + 2 (kernel_id=1, kernel_add_scalar)
+        L0TaskArgs params_t2;
+        params_t2.add_input(c);
+        params_t2.add_output(inter_ci);
+        params_t2.add_scalar(2.0f);
+        params_t2.add_scalar(3u);
+        TaskOutputTensors outs_t2 = rt_submit_aiv_task(1, params_t2);  // kernel_add_scalar
+        const Tensor &e = outs_t2.get_ref(0);
+
+        // t3: g = d * e (kernel_id=2, kernel_mul)
+        L0TaskArgs params_t3;
+        params_t3.add_input(d);
+        params_t3.add_input(e);
+        params_t3.add_output(inter_ci);
+        params_t3.add_scalar(3u);
+        TaskOutputTensors outs_t3 = rt_submit_aiv_task(2, params_t3);  // kernel_mul
+        const Tensor &g = outs_t3.get_ref(0);
+
+        // t4: f = g + c (kernel_id=0, kernel_add)
+        L0TaskArgs params_t4;
+        params_t4.add_input(g);
+        params_t4.add_input(c);
+        params_t4.add_output(ext_f);
+        rt_submit_aiv_task(0, params_t4);  // kernel_add
+    }  // inner scope ends: releases d, e, g
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py b/examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py
new file mode 100644
index 000000000..15a92d667
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Tensormap-and-ringbuffer vector example: f = (a+b+1)*(a+b+2) + (a+b)."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="fully_distributed_within_core")
+class TestVectorExample(SceneTestCase):
+    """f = (a+b+1)*(a+b+2) + (a+b), where a=2.0, b=3.0 -> f=47.0."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index d486fe3ae..976780d7f 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -763,6 +763,30 @@ NB_MODULE(_task_interface, m) {
                 c.enable_scope_stats = v ? 1 : 0;
             }
         )
+        .def_rw("use_example_exec_time", &CallConfig::use_example_exec_time)
+        // Per-func reference durations (nanoseconds), indexed by func_id; a
+        // CALLCONFIG_MAX_EXAMPLE_FUNCS-length list. Surfaced as a Python list;
+        // shorter input is zero-padded, longer is rejected.
+        .def_prop_rw(
+            "example_exec_time_ns",
+            [](const CallConfig &c) {
+                return std::vector<int32_t>(
+                    c.example_exec_time_ns, c.example_exec_time_ns + CALLCONFIG_MAX_EXAMPLE_FUNCS
+                );
+            },
+            [](CallConfig &c, const std::vector<int32_t> &v) {
+                if (v.size() > static_cast<size_t>(CALLCONFIG_MAX_EXAMPLE_FUNCS)) {
+                    throw std::invalid_argument(
+                        "CallConfig.example_exec_time_ns length " + std::to_string(v.size()) +
+                        " exceeds CALLCONFIG_MAX_EXAMPLE_FUNCS (" + std::to_string(CALLCONFIG_MAX_EXAMPLE_FUNCS) + ")"
+                    );
+                }
+                std::memset(c.example_exec_time_ns, 0, sizeof(c.example_exec_time_ns));
+                for (size_t i = 0; i < v.size(); ++i) {
+                    c.example_exec_time_ns[i] = v[i];
+                }
+            }
+        )
         .def_prop_rw(
             "output_prefix",
             [](const CallConfig &c) -> std::string {
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index c7abcafc7..1d34f78ba 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -127,13 +127,17 @@ def my_l4_orch(orch, args, config):
 _OFF_CALLABLE = 8
 _OFF_CONFIG = 16
 # Packed CallConfig wire layout — must match call_config.h byte for byte:
-# 7 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor,
-# enable_pmu, enable_dep_gen, enable_scope_stats) + uint64 ring sizing
-# overrides (3 scalar fields + 3 x RUNTIME_ENV_RING_COUNT per-ring arrays) + 1024-byte
+# 8 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor,
+# enable_pmu, enable_dep_gen, enable_scope_stats, use_example_exec_time) + uint64
+# ring sizing overrides (3 scalar fields + 3 x RUNTIME_ENV_RING_COUNT per-ring
+# arrays) + CALLCONFIG_MAX_EXAMPLE_FUNCS int32 example_exec_time_ns + 1024-byte
 # NUL-terminated output_prefix. Log config travels separately via
 # ChipWorker.init(log_level, log_info_v) — not on per-task wire.
 _RUNTIME_ENV_UINT64_FIELD_COUNT = 3 + 3 * RUNTIME_ENV_RING_COUNT
-_CFG_FMT = struct.Struct("=iiiiiii" + ("Q" * _RUNTIME_ENV_UINT64_FIELD_COUNT) + "1024s")
+_CALLCONFIG_MAX_EXAMPLE_FUNCS = 64
+_CFG_FMT = struct.Struct(
+    "=iiiiiiii" + ("Q" * _RUNTIME_ENV_UINT64_FIELD_COUNT) + ("i" * _CALLCONFIG_MAX_EXAMPLE_FUNCS) + "1024s"
+)
 # Args region starts after CONFIG, rounded up to 8 bytes so the first
 # Tensor.data (uint64_t at OFF_ARGS+8) is 8-byte aligned, avoiding
 # SIGBUS on strict-alignment platforms (aarch64 atomics, some ARM cores).
@@ -1017,6 +1021,7 @@ def _chip_process_loop(
 
 def _read_config_from_mailbox(buf: memoryview) -> "CallConfig":
     """Reconstruct a CallConfig from the unified mailbox layout."""
+    _f = _CFG_FMT.unpack_from(buf, _OFF_CONFIG)
     (
         block_dim,
         aicpu_tn,
@@ -1025,12 +1030,16 @@ def _read_config_from_mailbox(buf: memoryview) -> "CallConfig":
         pmu,
         dep_gen,
         scope_stats,
+        use_example,
         ring_task_window,
         ring_heap,
         ring_dep_pool,
-        *ring_values,
-        prefix_bytes,
-    ) = _CFG_FMT.unpack_from(buf, _OFF_CONFIG)
+    ) = _f[:11]
+    _ring_base = 11
+    ring_values = _f[_ring_base : _ring_base + 3 * RUNTIME_ENV_RING_COUNT]
+    _ex_base = _ring_base + 3 * RUNTIME_ENV_RING_COUNT
+    example_values = _f[_ex_base : _ex_base + _CALLCONFIG_MAX_EXAMPLE_FUNCS]
+    prefix_bytes = _f[-1]
     ring_task_windows = list(ring_values[:RUNTIME_ENV_RING_COUNT])
     ring_heaps = list(ring_values[RUNTIME_ENV_RING_COUNT : 2 * RUNTIME_ENV_RING_COUNT])
     ring_dep_pools = list(ring_values[2 * RUNTIME_ENV_RING_COUNT : 3 * RUNTIME_ENV_RING_COUNT])
@@ -1042,12 +1051,14 @@ def _read_config_from_mailbox(buf: memoryview) -> "CallConfig":
     cfg.enable_pmu = pmu
     cfg.enable_dep_gen = bool(dep_gen)
     cfg.enable_scope_stats = bool(scope_stats)
+    cfg.use_example_exec_time = use_example
     cfg.runtime_env.ring_task_window = ring_task_window
     cfg.runtime_env.ring_heap = ring_heap
     cfg.runtime_env.ring_dep_pool = ring_dep_pool
     cfg.runtime_env.ring_task_windows = ring_task_windows
     cfg.runtime_env.ring_heaps = ring_heaps
     cfg.runtime_env.ring_dep_pools = ring_dep_pools
+    cfg.example_exec_time_ns = list(example_values)
     # NUL-terminated C string in a 1024-byte field.
     cfg.output_prefix = prefix_bytes.split(b"\x00", 1)[0].decode("utf-8")
     return cfg
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index 37c5621a6..bf24bd35b 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -579,6 +579,54 @@ def _dump_name_map(mapping: dict, output_path: Path) -> Path | None:
     return output_path
 
 
+def _inject_dist_swimlane_names(callable_spec: dict) -> None:
+    """Resolve incore func_ids -> names in a distributed-runtime swimlane.
+
+    The fully_distributed_within_core engine writes a Chrome-trace swimlane to
+    ``$PTO_DIST_SWIMLANE`` whose events carry only ``func_id`` (the device has
+    no incore name — ``CoreCallable`` stores none). When that capture happened
+    this run, rewrite each event's label to the kernel name from the CALLABLE
+    spec (``func_id -> name``) and stamp ``args.name`` so both Perfetto and
+    ``simpler_setup.tools.dist_swimlane_render`` show e.g. ``GEMM``/``ADD``
+    instead of ``f0``/``f1``. No-op when the env var is unset, the file is
+    absent, or the spec carries no names.
+    """
+    import json as _json  # noqa: PLC0415
+
+    path = os.environ.get("PTO_DIST_SWIMLANE")
+    if not path or not os.path.isfile(path):
+        return
+    names = _extract_name_map(callable_spec).get("callable_id_to_name", {})
+    if not names:
+        return
+    try:
+        with open(path) as f:
+            data = _json.load(f)
+    except (OSError, ValueError):
+        return
+    changed = False
+    for e in data.get("traceEvents", []):
+        if e.get("ph") != "X":
+            continue
+        args = e.get("args", {})
+        nm = names.get(str(args.get("func_id")))
+        if nm is None:
+            continue
+        args["name"] = nm
+        tid = args.get("task_id")
+        # A distributed-runtime span carries a `phase` (kernel / build / alloc /
+        # replay / drain_won). The func_id -> name map only knows the kernel
+        # label, so prefix non-kernel phases to keep e.g. a task's `build` span
+        # distinct from its `kernel` span on the same lane (both share func_id).
+        phase = args.get("phase")
+        label = nm if (phase in (None, "kernel")) else f"{phase}:{nm}"
+        e["name"] = f"{label}#{tid}" if tid is not None else label
+        changed = True
+    if changed:
+        with open(path, "w") as f:
+            _json.dump(data, f, indent=2)
+
+
 def _parse_case_selector(value: str) -> tuple[str | None, str | None]:
     """Parse one ``--case`` value into ``(class_name, case_name)``.
 
@@ -1107,6 +1155,20 @@ def _build_config(
         config.enable_pmu = enable_pmu  # 0=disabled, >0=enabled with event type
         config.enable_dep_gen = enable_dep_gen
         config.enable_scope_stats = enable_scope_stats
+        # Sim-only trace-driven replay: only fully_distributed_within_core
+        # implements it; every other runtime rejects the flag here so no other
+        # runtime needs to adapt (the C++ side leaves it a weak no-op). When on,
+        # each incore's CALLABLE example_exec_time_ns (nanoseconds) is plumbed
+        # to the runtime, which busy-waits it instead of running the real kernel.
+        if getattr(self, "_st_use_example_exec_time", 0):
+            if self._st_runtime != "fully_distributed_within_core":
+                raise NotImplementedError(
+                    "--use-example-exec-time is only supported by the "
+                    "fully_distributed_within_core runtime, not "
+                    f"{self._st_runtime!r}"
+                )
+            config.use_example_exec_time = 1
+            config.example_exec_time_ns = self._collect_example_exec_time_ns()
         # `output_prefix` is required by CallConfig::validate() whenever any
         # diagnostic flag is enabled. Caller threads it down from the per-case
         # directory built by _build_output_prefix().
@@ -1114,6 +1176,21 @@ def _build_config(
             config.output_prefix = str(output_prefix)
         return config
 
+    def _collect_example_exec_time_ns(self):
+        """Per-func reference durations (nanoseconds) keyed by func_id, read from
+        the CALLABLE ``incores`` ``example_exec_time_ns`` fields (an integer
+        nanosecond count). A func without the field stays 0 (runs for real under
+        use_example_exec_time)."""
+        incores = self.CALLABLE.get("incores", [])
+        max_fid = max((k.get("func_id", -1) for k in incores), default=-1)
+        table = [0] * (max_fid + 1)
+        for k in incores:
+            fid = k.get("func_id")
+            ns = k.get("example_exec_time_ns")
+            if ns is not None and isinstance(fid, int) and fid >= 0:
+                table[fid] = int(ns)
+        return table
+
     def _resolve_env(self):
         env = self.RUNTIME_ENV
         if not env:
@@ -1250,6 +1327,10 @@ def _run_and_validate_l2(  # noqa: PLR0913 -- threads CLI diagnostic flags + cas
         if timings:
             _log_round_timings(timings)
 
+        # If a distributed-runtime swimlane was captured (PTO_DIST_SWIMLANE),
+        # label its events with the incore function names from the spec.
+        _inject_dist_swimlane_names(self.CALLABLE)
+
     def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3 ns context
         self,
         worker,
@@ -1339,6 +1420,10 @@ def task_orch(orch, _args, _cfg, _ns=ns, _test_args=test_args, _config=config):
         if timings:
             _log_round_timings(timings)
 
+        # If a distributed-runtime swimlane was captured (PTO_DIST_SWIMLANE),
+        # label its events with the incore function names from the spec.
+        _inject_dist_swimlane_names(self.CALLABLE)
+
     # ------------------------------------------------------------------
     # pytest auto test method
     # ------------------------------------------------------------------
@@ -1372,6 +1457,16 @@ def test_run(self, st_platform, st_worker, request):
         enable_pmu = request.config.getoption("--enable-pmu", default=0)
         enable_dep_gen = self._effective_enable_dep_gen(request, warn=True)
         enable_scope_stats = request.config.getoption("--enable-scope-stats", default=False)
+        # Sim-only trace-driven replay switch; consumed in _build_config (which
+        # enforces the fully_distributed_within_core-only constraint and reads
+        # the per-func durations from CALLABLE). Stashed on the instance so it
+        # reaches _build_config without threading through every run variant.
+        self._st_use_example_exec_time = 1 if request.config.getoption("--use-example-exec-time", default=False) else 0
+        # With trace-driven replay on, incore kernels are skipped (busy-wait
+        # only) so their outputs are never computed — force-skip the golden
+        # comparison regardless of --skip-golden.
+        if self._st_use_example_exec_time:
+            skip_golden = True
         # device-log timing is cheap (PTO2_PROFILING markers, one block/round)
         # so unlike the heavy diagnostics it is NOT disabled when --rounds > 1.
         enable_device_log_timing = request.config.getoption("--enable-device-log-timing", default=False)
diff --git a/simpler_setup/tools/dist_swimlane_render.py b/simpler_setup/tools/dist_swimlane_render.py
new file mode 100644
index 000000000..9287168a4
--- /dev/null
+++ b/simpler_setup/tools/dist_swimlane_render.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""
+fully_distributed_within_core swimlane renderer.
+
+Renders the per-core execution swimlane emitted by the distributed engine
+(``dist_engine_dump_trace`` -> Chrome Trace Event JSON, written when the run is
+launched with ``PTO_DIST_SWIMLANE=<path>``) into a Gantt-style PNG. Each row is
+a physical lane (block x AIC/AIV0/AIV1); each bar is one executed (sub)task,
+colored by ``func_id`` (kernel id).
+
+This is the distributed-runtime counterpart to ``swimlane_converter`` (which
+targets the centralized scheduler's L2 records and is empty for this runtime,
+since orchestration/scheduling/execution all run on the AI cores). Perfetto
+remains the authoritative interactive view — drag the same JSON into
+https://ui.perfetto.dev/. This tool is for a quick static picture.
+
+Usage:
+    # Latest outputs/dist_swimlane/*.json (or outputs/**/*swimlane*.json):
+    python -m simpler_setup.tools.dist_swimlane_render
+    python -m simpler_setup.tools.dist_swimlane_render path/to/bgemm_swimlane.json
+    python -m simpler_setup.tools.dist_swimlane_render in.json -o out.png
+    python -m simpler_setup.tools.dist_swimlane_render in.json --names 0=GEMM,1=ADD
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+LANE_NAMES = ("AIC", "AIV0", "AIV1")
+
+# Distinct, color-blind-friendly palette indexed by func_id (wraps if exceeded).
+_PALETTE = [
+    "#2c7fb8",  # blue
+    "#de2d26",  # red
+    "#31a354",  # green
+    "#756bb1",  # purple
+    "#e6550d",  # orange
+    "#636363",  # gray
+    "#c51b8a",  # magenta
+    "#1c9099",  # teal
+]
+
+
+def _resolve_input(arg: str | None) -> Path | None:
+    """Resolve the input JSON: explicit arg, else the most recent candidate
+    under outputs/dist_swimlane/ then outputs/ (by mtime)."""
+    if arg:
+        p = Path(arg)
+        if not p.is_file():
+            print(f"Error: input file not found: {p}", file=sys.stderr)
+            return None
+        return p
+    candidates: list[Path] = []
+    d = Path("outputs/dist_swimlane")
+    if d.is_dir():
+        candidates += list(d.glob("*.json"))
+    if not candidates:
+        out = Path("outputs")
+        if out.is_dir():
+            candidates += list(out.glob("**/*swimlane*.json"))
+    if not candidates:
+        print(
+            "Error: no input given and no outputs/dist_swimlane/*.json found. "
+            "Run with PTO_DIST_SWIMLANE=<path> first, then pass that path.",
+            file=sys.stderr,
+        )
+        return None
+    return max(candidates, key=lambda p: p.stat().st_mtime)
+
+
+def _parse_names(spec: str | None) -> dict[int, str]:
+    """Parse a ``0=GEMM,1=ADD`` style func_id->name mapping."""
+    names: dict[int, str] = {}
+    if not spec:
+        return names
+    for tok in spec.split(","):
+        tok = tok.strip()
+        if not tok or "=" not in tok:
+            continue
+        k, v = tok.split("=", 1)
+        try:
+            names[int(k.strip())] = v.strip()
+        except ValueError:
+            continue
+    return names
+
+
+def _load_trace(path: Path) -> tuple[dict[tuple[int, int], str], list[dict]]:
+    """Return (lane_name_by_(pid,tid), duration_events) from a Chrome trace."""
+    data = json.loads(path.read_text())
+    events = data.get("traceEvents", [])
+    lane_names: dict[tuple[int, int], str] = {}
+    durs: list[dict] = []
+    for e in events:
+        ph = e.get("ph")
+        if ph == "M" and e.get("name") == "thread_name" and "tid" in e:
+            lane_names[(e["pid"], e["tid"])] = e.get("args", {}).get("name", f'{e["pid"]}:{e["tid"]}')
+        elif ph == "X":
+            durs.append(e)
+    return lane_names, durs
+
+
+def render(input_path: Path, output_path: Path, names: dict[int, str], title: str | None, verbose: bool) -> int:
+    try:
+        import matplotlib
+
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+        from matplotlib.patches import Patch
+    except ImportError:
+        print("Error: matplotlib is required (pip install matplotlib).", file=sys.stderr)
+        return 1
+
+    lane_names, durs = _load_trace(input_path)
+    if not durs:
+        print(f"Error: no execution (ph=X) events in {input_path}.", file=sys.stderr)
+        return 1
+
+    # Lane rows: prefer the metadata order; fall back to whatever events carry.
+    lanes = sorted(lane_names.keys()) if lane_names else sorted({(e["pid"], e["tid"]) for e in durs})
+    row = {lk: i for i, lk in enumerate(lanes)}
+
+    def label_for(lk: tuple[int, int]) -> str:
+        return lane_names.get(lk, f'{LANE_NAMES[lk[1]] if lk[1] < len(LANE_NAMES) else lk[1]} (blk{lk[0]})')
+
+    func_ids = sorted({int(e.get("args", {}).get("func_id", -1)) for e in durs})
+
+    # Resolve func_id -> name. Priority: --names (CLI) > args.name baked into the
+    # JSON (scene_test injects the incore function name when capturing) > none.
+    auto_names: dict[int, str] = {}
+    for e in durs:
+        a = e.get("args", {})
+        nm = a.get("name")
+        if nm:
+            auto_names[int(a.get("func_id", -1))] = nm
+    effective_names = {**auto_names, **names}
+
+    def color_for(fid: int) -> str:
+        if fid < 0:
+            return "#999999"
+        return _PALETTE[fid % len(_PALETTE)]
+
+    fig_h = max(2.5, 0.5 * len(lanes) + 1.0)
+    fig, ax = plt.subplots(figsize=(13, fig_h))
+    for e in durs:
+        lk = (e["pid"], e["tid"])
+        if lk not in row:
+            continue
+        fid = int(e.get("args", {}).get("func_id", -1))
+        ax.barh(row[lk], e["dur"], left=e["ts"], height=0.7, color=color_for(fid), edgecolor="white", linewidth=0.4)
+
+    ax.set_yticks(range(len(lanes)))
+    ax.set_yticklabels([label_for(lk) for lk in lanes], fontsize=9)
+    ax.invert_yaxis()
+    ax.set_xlabel("time (us, relative to run start)")
+    ax.set_title(title or f"fully_distributed_within_core per-core execution swimlane\n{input_path.name}")
+    ax.grid(axis="x", alpha=0.3)
+
+    handles = [
+        Patch(color=color_for(fid), label=effective_names.get(fid, f"func {fid}" if fid >= 0 else "unknown"))
+        for fid in func_ids
+    ]
+    ax.legend(handles=handles, loc="upper right", fontsize=9)
+
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=130)
+    plt.close(fig)
+
+    if verbose:
+        per_lane: dict[tuple[int, int], int] = {}
+        for e in durs:
+            per_lane[(e["pid"], e["tid"])] = per_lane.get((e["pid"], e["tid"]), 0) + 1
+        print("tasks per lane:")
+        for lk in lanes:
+            print(f"  {label_for(lk):28s} n={per_lane.get(lk, 0)}")
+    print(f"✓ Rendered {len(durs)} events across {len(lanes)} lanes")
+    print(f"  Input:  {input_path}")
+    print(f"  Output: {output_path}")
+    print(f"\nFor the interactive view, drag {input_path} into https://ui.perfetto.dev/")
+    return 0
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        description="Render a fully_distributed_within_core execution swimlane JSON to a PNG.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    p.add_argument(
+        "input",
+        nargs="?",
+        default=None,
+        help="Chrome-trace JSON from PTO_DIST_SWIMLANE. Default: latest outputs/dist_swimlane/*.json.",
+    )
+    p.add_argument("-o", "--output", default=None, help="Output PNG path. Default: <input>.png next to the input.")
+    p.add_argument(
+        "--names",
+        default=None,
+        help="func_id->name legend map, e.g. '0=GEMM,1=ADD'. Without it, lanes are labeled 'func N'.",
+    )
+    p.add_argument("--title", default=None, help="Override the plot title.")
+    p.add_argument("-v", "--verbose", action="store_true", help="Also print per-lane task counts.")
+    return p
+
+
+def main() -> int:
+    args = _build_parser().parse_args()
+    input_path = _resolve_input(args.input)
+    if input_path is None:
+        return 1
+    output_path = Path(args.output) if args.output else input_path.with_suffix(".png")
+    return render(input_path, output_path, _parse_names(args.names), args.title, args.verbose)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/a2a3/platform/sim/aicore/kernel.cpp b/src/a2a3/platform/sim/aicore/kernel.cpp
index b2fb5fe24..d894cf771 100644
--- a/src/a2a3/platform/sim/aicore/kernel.cpp
+++ b/src/a2a3/platform/sim/aicore/kernel.cpp
@@ -17,6 +17,8 @@
  */
 
 #include <cstdint>
+#include <cstdio>
+#include <cstdlib>
 #include <pthread.h>
 
 #include "inner_kernel.h"
@@ -38,13 +40,52 @@ static pthread_key_t g_aicore_profiling_flag_key;
 static pthread_key_t g_l2_swimlane_aicore_head_slot_key;
 static pthread_key_t g_l2_swimlane_aicore_head_key;
 static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT;
+// True once create_tls_keys() has successfully created ALL keys; gates the
+// unload-time delete so we never pthread_key_delete a stale/uncreated key.
+static bool g_tls_keys_ready = false;
+
+// All pthread keys owned by this DSO, in creation order. destroy_tls_keys()
+// rolls these back at unload so a per-run dlopen/dlclose cycle is net-zero on
+// the process-wide TLS key pool (see destroy_tls_keys()).
+static pthread_key_t *const g_all_keys[] = {
+    &g_reg_base_key,
+    &g_core_id_key,
+    &g_aicore_profiling_flag_key,
+    &g_l2_swimlane_aicore_head_slot_key,
+    &g_l2_swimlane_aicore_head_key,
+};
+constexpr int kNumTlsKeys = sizeof(g_all_keys) / sizeof(g_all_keys[0]);
 
 static void create_tls_keys() {
-    pthread_key_create(&g_reg_base_key, nullptr);
-    pthread_key_create(&g_core_id_key, nullptr);
-    pthread_key_create(&g_aicore_profiling_flag_key, nullptr);
-    pthread_key_create(&g_l2_swimlane_aicore_head_slot_key, nullptr);
-    pthread_key_create(&g_l2_swimlane_aicore_head_key, nullptr);
+    for (int i = 0; i < kNumTlsKeys; i++) {
+        if (pthread_key_create(g_all_keys[i], nullptr) != 0) {
+            // The process-wide pthread key pool (PTHREAD_KEYS_MAX, 1024) is
+            // exhausted. Roll back what we created and fail loudly: silently
+            // leaving a key at 0 makes sim_get_reg_base() return NULL and
+            // crashes write_reg() on a NULL register base (hard-to-debug
+            // SIGSEGV). With destroy_tls_keys() reclaiming keys on unload this
+            // path should never be hit.
+            for (int j = 0; j < i; j++) pthread_key_delete(*g_all_keys[j]);
+            fprintf(stderr, "[aicore_sim] FATAL: pthread_key_create failed at key %d/%d — TLS key pool exhausted\n", i,
+                    kNumTlsKeys);
+            abort();
+        }
+    }
+    g_tls_keys_ready = true;
+}
+
+// Release this DSO's pthread TLS keys when it is unloaded (dlclose). The AICore
+// kernel .so is dlopen/dlclose'd once per run (device_runner.cpp reloads it
+// because the kernel binary can vary per case), and glibc does NOT reclaim a
+// DSO's pthread keys on unload. Without this, every run leaked these keys and
+// after ~PTHREAD_KEYS_MAX/kNumTlsKeys runs pthread_key_create() began failing
+// (EAGAIN), leaving the keys at 0 → sim_get_reg_base() == NULL → write_reg()
+// NULL-deref SIGSEGV mid-sweep. All AICore worker threads are joined before the
+// DSO is dlclose'd, so deleting the keys here is race-free.
+__attribute__((destructor)) static void destroy_tls_keys() {
+    if (!g_tls_keys_ready) return;
+    for (int i = 0; i < kNumTlsKeys; i++) pthread_key_delete(*g_all_keys[i]);
+    g_tls_keys_ready = false;
 }
 
 volatile uint8_t *sim_get_reg_base() { return static_cast<volatile uint8_t *>(pthread_getspecific(g_reg_base_key)); }
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1c384569e..97755d74c 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -16,15 +16,22 @@
 
 #include "device_runner.h"
 
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include <dlfcn.h>
+#include <sched.h>
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <unistd.h>
 
 #include <atomic>
+#include <cerrno>
 #include <chrono>
 #include <cstdio>
 #include <cstring>
+#include <fstream>
 #include <functional>
 #include <string>
 #include <vector>
@@ -54,6 +61,60 @@ extern "C" __attribute__((weak, visibility("hidden"))) int dep_gen_replay_emit_d
     return -1;
 }
 
+// --- AICore single-NUMA pinning (opt-in via PTO_SIM_AICORE_NUMA_NODE) --------
+// When the env var names a NUMA node, each AICore sim thread `i` is pinned 1:1
+// to the i-th CPU of that node. This keeps the entire AICore working set inside
+// one NUMA node (no cross-node cursor/claim contention) and gives every core
+// exclusive use of a physical CPU. Auxiliary threads (AICPU over-launch,
+// collectors, host) are deliberately left unpinned so they don't oversubscribe
+// the node. Returns the node's CPU list (empty if unset/invalid). The node must
+// hold at least `num_aicore` CPUs for strict single-core-per-thread placement.
+static std::vector<int> sim_aicore_pin_cpus(int &out_node) {
+    out_node = -1;
+    const char *env = std::getenv("PTO_SIM_AICORE_NUMA_NODE");
+    if (env == nullptr || env[0] == '\0') {
+        return {};
+    }
+    int node = std::atoi(env);
+    char path[160];
+    std::snprintf(path, sizeof(path), "/sys/devices/system/node/node%d/cpulist", node);
+    std::ifstream f(path);
+    if (!f.good()) {
+        LOG_ERROR("PTO_SIM_AICORE_NUMA_NODE=%d: cannot open %s; AICore pinning disabled", node, path);
+        return {};
+    }
+    std::string line;
+    std::getline(f, line);
+    std::vector<int> cpus;
+    size_t i = 0;
+    while (i < line.size()) {
+        if (line[i] < '0' || line[i] > '9') {
+            i++;
+            continue;
+        }
+        int a = 0;
+        while (i < line.size() && line[i] >= '0' && line[i] <= '9') {
+            a = a * 10 + (line[i] - '0');
+            i++;
+        }
+        if (i < line.size() && line[i] == '-') {
+            i++;
+            int b = 0;
+            while (i < line.size() && line[i] >= '0' && line[i] <= '9') {
+                b = b * 10 + (line[i] - '0');
+                i++;
+            }
+            for (int c = a; c <= b; c++) {
+                cpus.push_back(c);
+            }
+        } else {
+            cpus.push_back(a);
+        }
+    }
+    out_node = node;
+    return cpus;
+}
+
 DeviceRunner::~DeviceRunner() { finalize(); }
 
 int DeviceRunner::ensure_binaries_loaded() {
@@ -457,11 +518,54 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     }
 
     LOG_INFO_V0("Launching %d AICore thread(s)", num_aicore);
+    const bool pin_verbose = std::getenv("PTO_SIM_AICORE_PIN_VERBOSE") != nullptr;
+    int aicore_pin_node = -1;
+    std::vector<int> aicore_pin_cpus = sim_aicore_pin_cpus(aicore_pin_node);
+    if (!aicore_pin_cpus.empty()) {
+        if (num_aicore > static_cast<int>(aicore_pin_cpus.size())) {
+            LOG_WARN(
+                "AICore pin: num_aicore=%d exceeds NUMA node %d cpu count=%zu; threads will wrap around the node "
+                "(no longer one-core-per-thread). Restrict block_dim so cores<=node size for strict placement.",
+                num_aicore, aicore_pin_node, aicore_pin_cpus.size()
+            );
+        }
+        LOG_INFO_V0(
+            "AICore pin: confining %d AICore thread(s) to NUMA node %d (cpus %d..%d, 1:1)", num_aicore,
+            aicore_pin_node, aicore_pin_cpus.front(), aicore_pin_cpus.back()
+        );
+        if (pin_verbose) {
+            std::fprintf(
+                stderr, "[aicore-pin] confining %d AICore thread(s) to NUMA node %d (cpus %d..%d, 1:1)\n",
+                num_aicore, aicore_pin_node, aicore_pin_cpus.front(), aicore_pin_cpus.back()
+            );
+        }
+    }
     std::vector<std::thread> aicore_threads;
     for (int i = 0; i < num_aicore; i++) {
         CoreType core_type = runtime.workers[i].core_type;
         uint32_t physical_core_id = static_cast<uint32_t>(i);
-        aicore_threads.push_back(create_thread([this, &runtime, i, core_type, physical_core_id]() {
+        int target_cpu = aicore_pin_cpus.empty() ? -1 : aicore_pin_cpus[i % aicore_pin_cpus.size()];
+        aicore_threads.push_back(create_thread([this, &runtime, i, core_type, physical_core_id, target_cpu,
+                                                pin_verbose]() {
+            if (target_cpu >= 0) {
+#if defined(__linux__)
+                cpu_set_t set;
+                CPU_ZERO(&set);
+                CPU_SET(target_cpu, &set);
+                if (sched_setaffinity(0, sizeof(set), &set) != 0) {
+                    LOG_ERROR("AICore thread %d: sched_setaffinity(cpu=%d) failed (errno=%d)", i, target_cpu, errno);
+                } else if (pin_verbose) {
+                    std::fprintf(
+                        stderr, "[aicore-pin] thread %d -> cpu %d (running on cpu %d)\n", i, target_cpu,
+                        sched_getcpu()
+                    );
+                }
+#else
+                // CPU affinity (sched_setaffinity/cpu_set_t/sched_getcpu) is Linux-only;
+                // on other hosts (e.g. macOS) AICore pinning is a no-op.
+                (void)pin_verbose;
+#endif
+            }
             aicore_execute_func_(
                 &runtime, i, core_type, physical_core_id, kernel_args_.regs, kernel_args_.enable_profiling_flag,
                 kernel_args_.l2_swimlane_aicore_rotation_table
diff --git a/src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp b/src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp
new file mode 100644
index 000000000..21fb63133
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include "aicore/aicore.h"
+#include "aicore/aicore_profiling_state.h"
+#include "aicore/l2_swimlane_collector_aicore.h"
+#include "aicore/pmu_collector_aicore.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/platform_config.h"  // Register-based communication
+#include "pto2_dispatch_payload.h"
+#include "runtime.h"
+
+/**
+ * AICore main execution loop
+ *
+ * Implements the AICPU-AICore register-based dispatch protocol:
+ * 1. Wait for AICPU ready signal via handshake buffer
+ * 2. Report physical core ID and core type, signal AICore ready
+ * 3. Cache per-core PTO2DispatchPayload pointer from hank->task
+ * 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal
+ *
+ * AICPU writes &s_payload_per_core[i] to hank->task before setting
+ * aicpu_ready=1. AICore caches this pointer and reads function_bin_addr +
+ * args pointer from it on each dispatch. reg_val is a monotonically
+ * increasing task ID used only for dispatch signaling and ACK/FIN protocol.
+ *
+ * Profiling state (enable flag, L2 swimlane rotation channel) is published into the platform
+ * via set_aicore_profiling_flag / set_aicore_l2_swimlane_ring at kernel entry —
+ * this routine reads it through the matching getters, so neither Handshake
+ * nor this signature carry profiling fields.
+ *
+ * @param runtime Pointer to Runtime in global memory
+ * @param block_idx Block index (core ID)
+ * @param core_type Core type (AIC or AIV)
+ */
+__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type) {
+    __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[block_idx]);
+
+    // Phase 1: Wait for AICPU initialization signal
+    while (my_hank->aicpu_ready == 0) {
+        dcci(my_hank, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
+    }
+
+    // Phase 2: Report physical core ID, signal ready
+    my_hank->physical_core_id = get_physical_core_id();
+    OUT_OF_ORDER_STORE_BARRIER();
+    my_hank->aicore_regs_ready = 1;
+    dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT);
+    while (my_hank->aicpu_regs_ready == 0) {
+        dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
+    }
+    // Report initial idle status via register
+    write_reg(RegId::COND, AICORE_IDLE_VALUE);
+
+    // Phase 3: Report core type, signal ready
+    my_hank->core_type = core_type;
+    OUT_OF_ORDER_STORE_BARRIER();
+    my_hank->aicore_done = block_idx + 1;  // Signal ready (use block_idx + 1 to avoid 0)
+
+    dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);
+
+    // ===========================================================================
+    // fully_distributed_within_core: run orchestration + scheduling + execution
+    // ON this AICore worker (SPMD). Instead of polling DATA_MAIN_BASE for
+    // AICPU-dispatched tasks, each worker invokes the distributed engine entry
+    // (compiled into the AICPU .so, but executed here on the AICore thread so
+    // kernels run with this thread's sim TLS in place). The engine replays the
+    // orchestration submit stream, claims/builds the tasks it wins, and executes
+    // them; on return it has set this worker's completion flags. The worker then
+    // honors the existing teardown protocol (wait for EXIT, ack EXITED) so the
+    // AICPU scheduler/shutdown path is reused unchanged.
+    // See runtime/dist_engine.* and docs/fully_distributed_within_core.md.
+    // ===========================================================================
+    while (runtime->dist.go == 0) {
+        dcci(&runtime->dist, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
+    }
+    {
+        DistCoreMainFn core_main = reinterpret_cast<DistCoreMainFn>(runtime->dist.core_main_fn);
+        if (core_main != nullptr) {
+            core_main(runtime, block_idx, static_cast<int>(core_type));
+        } else {
+            __atomic_add_fetch(&runtime->dist.done_count, 1, __ATOMIC_ACQ_REL);
+        }
+    }
+
+    // Teardown: wait for the AICPU EXIT signal on DATA_MAIN_BASE and ack.
+    while (true) {
+        uint32_t reg_val = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
+        if (reg_val == AICORE_EXIT_SIGNAL) {
+            write_reg(RegId::COND, AICORE_EXITED_VALUE);
+            break;
+        }
+        SPIN_WAIT_HINT();
+    }
+    dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp
new file mode 100644
index 000000000..6925541e7
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp
@@ -0,0 +1,876 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include <dlfcn.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#ifdef __linux__
+#include <sys/mman.h>
+#endif
+
+#include "aicpu/device_time.h"
+#include "aicpu/orch_so_file.h"
+#include "callable_protocol.h"
+#include "pto2_dispatch_payload.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE)
+#include "pto_runtime2.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// fully_distributed_within_core engine — orchestration + scheduling + execution
+// run on the AICore workers; this AICPU thread only wires the engine and waits.
+#include "dist_engine.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/scope_stats_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/unified_log.h"
+
+// Register-based communication
+#include "aicpu/platform_regs.h"
+#include "common/platform_config.h"
+
+// Core type definitions
+#include "common/core_type.h"
+
+// CoreCallable for resolved dispatch address
+#include "callable.h"
+
+// Scheduler data structures (CoreExecState, CoreTracker, etc.)
+#include "scheduler/scheduler_types.h"
+
+// Scheduler context class
+#include "scheduler/scheduler_context.h"
+
+// Device orchestration function signature (loaded via dlopen).
+// The executor binds the current thread's PTO2Runtime into orchestration TLS
+// before calling the user entry.
+typedef void (*DeviceOrchestrationFunc)(const L2TaskArgs &orch_args);
+typedef void (*DeviceOrchestrationBindRuntimeFunc)(PTO2Runtime *rt);
+
+// Config function exported by orchestration .so
+typedef PTO2OrchestrationConfig (*DeviceOrchestrationConfigFunc)(const L2TaskArgs &orch_args);
+
+// From orchestration/common.cpp linked into this DSO — updates g_current_runtime here (distinct from
+// framework_bind_runtime in the dlopen'd libdevice_orch_*.so).
+extern "C" void framework_bind_runtime(PTO2Runtime *rt);
+
+constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
+constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";
+
+static int32_t read_pto2_runtime_status(Runtime *runtime) {
+    if (runtime == nullptr) {
+        return 0;
+    }
+
+    void *sm = runtime->get_gm_sm_ptr();
+    if (sm == nullptr) {
+        return 0;
+    }
+
+    auto *header = static_cast<PTO2SharedMemoryHeader *>(sm);
+    int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire);
+    int32_t sched_error_code = header->sched_error_code.load(std::memory_order_acquire);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);
+}
+
+static PTO2Runtime *rt{nullptr};
+
+// Per-callable_id orchestration SO table. The executor dispatches
+// `orch_so_table_[active_callable_id_]` (created on first sighting of
+// that callable_id, kept warm across runs).
+// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
+// (mailbox uint32 callable_id, register() returns small ints) and is shared
+// with the host bounds check in DeviceRunner::register_callable —
+// see src/common/task_interface/callable_protocol.h.
+
+struct OrchSoEntry {
+    bool in_use{false};
+    void *handle{nullptr};
+    char path[256]{};
+    DeviceOrchestrationFunc func{nullptr};
+    DeviceOrchestrationBindRuntimeFunc bind{nullptr};
+    DeviceOrchestrationConfigFunc config_func{nullptr};
+};
+
+struct AicpuExecutor {
+    int32_t sched_thread_num_;
+    bool orch_to_sched_{false};
+
+    // ===== Thread management state =====
+    std::atomic<int32_t> thread_idx_{0};
+    std::atomic<bool> initialized_{false};
+    std::atomic<bool> init_done_{false};
+    std::atomic<bool> init_failed_{false};
+    std::atomic<bool> finished_{false};
+
+    int32_t aicpu_thread_num_{0};
+
+    // ===== Task queue state (managed by scheduler ready queues) =====
+
+    std::atomic<int32_t> finished_count_{0};
+    std::atomic<bool> runtime_init_ready_{false};
+
+    // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox
+    // sub-regions (created in runtime_create_from_sm, released in runtime_destroy).
+    // Default-constructed: libc-backed backend, no ctx.
+    DeviceArena runtime_arena_;
+
+    // Entry-arg L2TaskArgs built (via create_from_chip_args) from get_orch_args()
+    // before scheduler init; consumed by the (*p_func)(orch_args_cached_) below.
+    L2TaskArgs orch_args_cached_;
+
+    // Per-callable_id table. Single orch thread today, so first-write/read
+    // race is not possible; if multiple orch threads are ever introduced,
+    // guard the in_use=false→true transition with a mutex.
+    OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
+
+    // ===== Scheduler context (owns all dispatch/completion/drain state) =====
+    SchedulerContext sched_ctx_;
+
+    // ===== Methods =====
+    int32_t init(Runtime *runtime);
+    int32_t run(Runtime *runtime);
+    void deinit(Runtime *runtime);
+
+    ~AicpuExecutor() {
+        // Process-wide teardown (the single static instance dies here). Every
+        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
+        // alive across runs for cache-hit reuse.
+        for (auto &e : orch_so_table_) {
+            if (!e.in_use) continue;
+            if (e.handle != nullptr) dlclose(e.handle);
+            if (e.path[0] != '\0') unlink(e.path);
+            e = OrchSoEntry{};
+        }
+    }
+};
+
+static AicpuExecutor g_aicpu_executor;
+
+// ===== AicpuExecutor Method Implementations =====
+
+int32_t AicpuExecutor::init(Runtime *runtime) {
+    bool expected = false;
+    if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) {
+        return 0;
+    }
+
+    LOG_INFO_V0("AicpuExecutor: Initializing");
+
+    if (runtime == nullptr) {
+        LOG_ERROR("runtime is nullptr");
+        init_failed_.store(true, std::memory_order_release);
+        return -1;
+    }
+
+    // Read execution parameters from runtime. The 0 → 1 fixup runs before the
+    // sched_thread_num_ derivation so a zero input doesn't leave the scheduler
+    // count at -1.
+    aicpu_thread_num_ = runtime->aicpu_thread_num;
+    if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
+    sched_thread_num_ = aicpu_thread_num_ - 1;
+    orch_to_sched_ = runtime->orch_to_sched;
+
+    if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
+        LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_);
+        init_failed_.store(true, std::memory_order_release);
+        return -1;
+    }
+
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+        init_failed_.store(true, std::memory_order_release);
+        return -1;
+    }
+
+    finished_count_.store(0, std::memory_order_release);
+
+    init_done_.store(true, std::memory_order_release);
+    LOG_INFO_V0("AicpuExecutor: Init complete");
+    return 0;
+}
+
+/**
+ * Shutdown AICore - Send exit signal via registers to all AICore kernels
+ */
+int32_t AicpuExecutor::run(Runtime *runtime) {
+    int32_t thread_idx = thread_idx_++;
+    int32_t run_rc = 0;
+    LOG_INFO_V0("Thread %d: Start", thread_idx);
+
+    // Orchestrator check
+    if (thread_idx >= sched_thread_num_) {
+#if PTO2_PROFILING
+        uint64_t orch_cycle_start = 0;
+        int32_t pto2_submitted_tasks = -1;
+#endif
+        // Orchestrator thread: load + run the device orchestration SO. The braces
+        // scope the per-callable dlopen / SO-table locals to this block.
+        {
+            // Per-callable_id dispatch: the orch SO state lives in
+            // `orch_so_table_[callable_id]` keyed by registration order;
+            // reload is governed by `register_new_callable_id_`.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+                LOG_ERROR(
+                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+                );
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            void **p_handle = &orch_so_table_[callable_id].handle;
+            char *p_path = orch_so_table_[callable_id].path;
+            DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
+            DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
+            DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
+            const bool reload_so = runtime->register_new_callable_id();
+
+            if (reload_so) {
+                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+                if (*p_handle != nullptr) {
+                    dlclose(*p_handle);
+                    *p_handle = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    if (p_path[0] != '\0') {
+                        // Unlink the old file so the new open() lands on a
+                        // fresh inode — protects against SIGBUS / ETXTBSY when
+                        // the kernel still has the old mapping pinned.
+                        unlink(p_path);
+                        p_path[0] = '\0';
+                    }
+                }
+
+                const void *so_data = reinterpret_cast<const void *>(runtime->get_dev_orch_so_addr());
+                size_t so_size = runtime->get_dev_orch_so_size();
+
+                if (so_data == nullptr || so_size == 0) {
+                    LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+
+                // Try multiple paths that may allow execution on AICPU.
+                char so_path[256];
+                bool file_created = false;
+                const char *candidate_dirs[] = {
+                    "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"
+                };
+                const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
+
+                for (int32_t i = 0; i < num_candidates && !file_created; i++) {
+                    int32_t fd = create_orch_so_file(
+                        candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path)
+                    );
+                    if (fd < 0) {
+                        LOG_INFO_V0(
+                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
+                        );
+                        continue;
+                    }
+                    ssize_t written = write(fd, so_data, so_size);
+                    close(fd);
+                    if (written != static_cast<ssize_t>(so_size)) {
+                        LOG_INFO_V0(
+                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno
+                        );
+                        unlink(so_path);
+                        continue;
+                    }
+                    file_created = true;
+                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size);
+                }
+
+                if (!file_created) {
+                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+
+                dlerror();
+                void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
+                const char *dlopen_err = dlerror();
+                if (handle == nullptr) {
+                    LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown");
+                    unlink(so_path);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
+
+                // Unlink the on-disk SO immediately: dlopen has already mmap'd
+                // the image, so the kernel keeps the inode alive until the
+                // matching dlclose / process exit. This prevents stale
+                // libdevice_orch_<pid>_<cid>.so files from accumulating in
+                // /tmp when child processes exit via os._exit(0), which skips
+                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+                unlink(so_path);
+
+                const char *entry_symbol = runtime->get_device_orch_func_name();
+                if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
+                    entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
+                }
+                const char *config_symbol = runtime->get_device_orch_config_name();
+                if (config_symbol == nullptr || config_symbol[0] == '\0') {
+                    config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
+                }
+
+                dlerror();
+                DeviceOrchestrationFunc orch_func =
+                    reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+                const char *entry_dlsym_error = dlerror();
+                if (entry_dlsym_error != nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
+                    );
+                    dlclose(handle);
+                    unlink(so_path);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+                if (orch_func == nullptr) {
+                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
+                    dlclose(handle);
+                    unlink(so_path);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+
+                dlerror();
+                auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
+                const char *config_dlsym_error = dlerror();
+                if (config_dlsym_error != nullptr || config_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
+                        config_dlsym_error ? config_dlsym_error : "NULL function pointer"
+                    );
+                    config_func = nullptr;
+                }
+
+                dlerror();
+                auto bind_runtime_func =
+                    reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
+                const char *bind_runtime_error = dlerror();
+                if (bind_runtime_error != nullptr) {
+                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error);
+                    bind_runtime_func = nullptr;
+                }
+
+                *p_handle = handle;
+                *p_func = orch_func;
+                *p_bind = bind_runtime_func;
+                *p_config_func = config_func;
+                snprintf(p_path, 256, "%s", so_path);
+                orch_so_table_[callable_id].in_use = true;
+            } else {
+                LOG_INFO_V0(
+                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
+                );
+                if (*p_handle == nullptr || *p_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
+                        callable_id
+                    );
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+            }
+
+            // Build the entry-arg once per run; both the config call below and
+            // the orchestration entry (consumed at orch_args_cached_) use it.
+            orch_args_cached_.create_from_chip_args(runtime->get_orch_args());
+
+            // Validate arg count on every run (reload or cache hit).
+            if (*p_config_func != nullptr) {
+                PTO2OrchestrationConfig cfg = (*p_config_func)(orch_args_cached_);
+                LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
+                if (cfg.expected_arg_count > 0) {
+                    const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
+                    int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
+                    if (actual_arg_count < cfg.expected_arg_count) {
+                        LOG_ERROR(
+                            "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count,
+                            cfg.expected_arg_count
+                        );
+                        // Clean up cached state so a subsequent run does a full reload.
+                        if (*p_handle != nullptr) {
+                            dlclose(*p_handle);
+                            *p_handle = nullptr;
+                        }
+                        if (p_path[0] != '\0') {
+                            unlink(p_path);
+                            p_path[0] = '\0';
+                        }
+                        *p_func = nullptr;
+                        *p_bind = nullptr;
+                        *p_config_func = nullptr;
+                        orch_so_table_[callable_id].in_use = false;
+                        // Unblock scheduler threads before returning so they don't spin forever.
+                        runtime_init_ready_.store(true, std::memory_order_release);
+                        return -1;
+                    }
+                }
+            } else {
+                LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx);
+            }
+
+            // sm_handle / rt are bound to *this* run's memory and must be
+            // (re)created every run, regardless of whether the SO itself was
+            // reused above.
+            const ChipStorageTaskArgs &args = runtime->get_orch_args();
+            int32_t arg_count = args.tensor_count() + args.scalar_count();
+            LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count);
+            for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
+                const Tensor &t = args.tensor(i);
+                LOG_INFO_V0(
+                    "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i,
+                    static_cast<uint64_t>(t.buffer.addr), t.ndims, static_cast<unsigned>(t.dtype)
+                );
+            }
+            for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
+                LOG_INFO_V0(
+                    "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i,
+                    static_cast<uint64_t>(args.scalar(i))
+                );
+            }
+
+            void *sm_ptr = runtime->get_gm_sm_ptr();
+
+            // Prebuilt-arena fast path. Host has pre-populated the entire
+            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
+            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
+            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
+            // wire arena-internal pointers to their device addresses, reset
+            // the SM, and finalize the few device-only fields the host could
+            // not know at image-build time.
+            void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+            size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+            if (prebuilt_arena == nullptr) {
+                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+            rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+
+            // Wire every arena-internal pointer field (host wrote host-mirror
+            // addresses; we overwrite them with device addresses).
+            runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+            uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes);
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+                LOG_INFO_V0(
+                    "Thread %d: Ring %d sizes: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%d", thread_idx, r,
+                    rt->prebuilt_layout.task_window_sizes[r], rt->prebuilt_layout.heap_sizes[r],
+                    rt->prebuilt_layout.dep_pool_capacities[r]
+                );
+            }
+
+            // Reset SM state. setup_pointers + init_header_per_ring restore
+            // ring flow-control counters, layout metadata, error flags, and
+            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
+            // fanin_count/active_mask zero — previously done inside
+            // RingSchedState::init).
+            memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+            if (!rt->sm_handle->init_per_ring(
+                    sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes, rt->prebuilt_layout.heap_sizes
+                )) {
+                LOG_ERROR("Thread %d: sm_handle->init_per_ring failed", thread_idx);
+                rt = nullptr;
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+
+            // AICore completion mailbox lives in the arena; reset it each
+            // boot so stale completion notifications from a previous run do
+            // not leak.
+            memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+
+            // Fill ops / core counts (host can't resolve s_runtime_ops's
+            // device address nor know the SchedulerContext's core fan-out).
+            runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
+#if PTO2_PROFILING
+            rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level();
+            {
+                auto &orch = rt->orchestrator;
+                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+                    auto &alloc = orch.rings[r].task_allocator;
+                    scope_stats_set_ring_capacity(
+                        r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacities[r]
+                    );
+                }
+                scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity());
+            }
+#endif
+
+            // With multi-ring, slot_states are per-ring inside the scheduler.
+            runtime->set_slot_states_ptr(nullptr);
+
+            // Wire scheduler context to the newly created PTO2Runtime before
+            // releasing scheduler threads from runtime_init_ready_.
+            sched_ctx_.bind_runtime(rt);
+
+            runtime_init_ready_.store(true, std::memory_order_release);
+
+            // Wait for scheduler's one-time init to complete
+            sched_ctx_.wait_pto2_init_complete();
+
+#if PTO2_PROFILING
+            if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) {
+                l2_swimlane_aicpu_set_orch_thread_idx(thread_idx);
+            }
+#endif
+
+            // dep_gen plugs into the orchestrator thread (single-instance subsystem):
+            // set the per-thread queue index and pop the initial buffer before any
+            // submit_task can fire inside orch_func_.
+            if (is_dep_gen_enabled()) {
+                dep_gen_aicpu_set_orch_thread_idx(thread_idx);
+                dep_gen_aicpu_init();
+            }
+
+#if PTO2_PROFILING
+            // scope_stats streams scope_end records off the orchestrator thread:
+            // record the per-thread ready_queue index. No-op (writer shared
+            // state null) when scope_stats is disabled; the current buffer is
+            // popped lazily on the first scope_end append.
+            scope_stats_aicpu_set_orch_thread_idx(thread_idx);
+#endif
+
+#if PTO2_PROFILING
+            orch_cycle_start = get_sys_cnt_aicpu();
+#endif
+            framework_bind_runtime(rt);
+            if (*p_bind != nullptr) {
+                (*p_bind)(rt);
+            }
+
+            // ---- fully_distributed_within_core handoff ----
+            // Instead of running orchestration here, wire the distributed engine
+            // (resets cursors/flags/heap, points rt->ops at the distributed
+            // submit path) and hand the per-core entry off to the AICore worker
+            // threads, which replay orchestration in SPMD fashion and execute
+            // the tasks they win. This AICPU thread then waits for all workers.
+            // See runtime/dist_engine.* and docs/fully_distributed_within_core.md.
+            {
+                const int32_t num_workers = runtime->worker_count;
+                // dist_engine_register repoints rt->ops at the distributed submit
+                // table for the duration of the on-core orchestration replay. Save
+                // the centralized ops so the scheduler-handoff calls below
+                // (rt_orchestration_done / on_orchestration_done) work unchanged.
+                const PTO2RuntimeOps *saved_ops = rt->ops;
+                void *core_main =
+                    dist_engine_register(rt, *p_func, &orch_args_cached_, num_workers, runtime);
+                runtime->dist.core_main_fn = reinterpret_cast<uint64_t>(core_main);
+                runtime->dist.num_workers = num_workers;
+                __atomic_store_n(&runtime->dist.done_count, 0, __ATOMIC_RELEASE);
+                __atomic_store_n(&runtime->dist.go, 1u, __ATOMIC_RELEASE);
+                const bool dist_trace = (getenv("PTO_DIST_TRACE") != nullptr);
+                if (dist_trace)
+                    fprintf(stderr, "[dist] Thread %d: engine wired, %d workers launched\n", thread_idx, num_workers);
+                while (__atomic_load_n(&runtime->dist.done_count, __ATOMIC_ACQUIRE) < num_workers) {
+                    SPIN_WAIT_HINT();
+                }
+                // All workers done (single-threaded here): emit the per-core
+                // execution swimlane if PTO_DIST_SWIMLANE is set (else no-op).
+                dist_engine_dump_trace();
+                rt->ops = saved_ops;
+                if (dist_trace)
+                    fprintf(stderr, "[dist] Thread %d: all %d distributed workers finished\n", thread_idx, num_workers);
+            }
+
+            // Flush the (potentially partially-filled) DepGenBuffer so the host
+            // collector can pick it up before this orchestrator thread joins.
+            if (is_dep_gen_enabled()) {
+                dep_gen_aicpu_flush();
+            }
+#if PTO2_PROFILING
+            // Push the partially-filled scope_stats buffer so the host gets the
+            // final scope_end records. Idempotent / no-op when disabled.
+            scope_stats_aicpu_flush_buffers();
+#endif
+#if PTO2_PROFILING
+            uint64_t orch_cycle_end = get_sys_cnt_aicpu();
+            (void)orch_cycle_end;
+#endif
+
+            // Print orchestrator profiling data
+#if PTO2_ORCH_PROFILING
+            PTO2OrchProfilingData p = orchestrator_get_profiling();
+            uint64_t total =
+                p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
+            if (total == 0) total = 1;  // avoid div-by-zero
+            LOG_INFO_V9(
+                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx,
+                static_cast<int64_t>(p.submit_count), cycles_to_us(total)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
+                thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
+                cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
+                static_cast<uint64_t>(p.alloc_atomic_count)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle),
+                p.sync_cycle * 100.0 / total
+            );
+            LOG_INFO_V9(
+                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle),
+                p.lookup_cycle * 100.0 / total
+            );
+            LOG_INFO_V9(
+                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle),
+                p.insert_cycle * 100.0 / total
+            );
+            LOG_INFO_V9(
+                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+                cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", thread_idx,
+                cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
+                cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   avg/task       : %.3fus", thread_idx,
+                p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
+            );
+
+#if PTO2_TENSORMAP_PROFILING
+            PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
+            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx);
+            LOG_INFO_V9(
+                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx,
+                static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx,
+                static_cast<uint64_t>(tp.lookup_chain_total),
+                tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
+                tp.lookup_chain_max
+            );
+            LOG_INFO_V9(
+                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx,
+                static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
+                tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
+            );
+#endif
+#endif  // PTO2_ORCH_PROFILING
+
+            // Latch task count from PTO2 shared memory to hand off to the
+            // scheduler. The orchestrator's run window (start_time / end_time /
+            // submit_count) is no longer published to shared memory — the
+            // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
+            // below carries the same envelope info for debugging, and
+            // host-side swimlane derives per-phase timing from the per-event
+            // L2SwimlaneAicpuSchedPhaseRecord[] + L2SwimlaneAicpuOrchPhaseRecord[]
+            // streams that already cover everything inside submit_task().
+            int32_t total_tasks = 0;
+            if (rt->orchestrator.sm_header) {
+                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+                    total_tasks +=
+                        rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+                }
+            }
+
+#if PTO2_PROFILING
+            pto2_submitted_tasks = total_tasks;
+#endif
+
+            // Signal completion to the orchestrator state machine
+            rt_orchestration_done(rt);
+
+            sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks);
+        }
+#if PTO2_PROFILING
+        uint64_t orch_end_ts = get_sys_cnt_aicpu();
+        LOG_INFO_V9(
+            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx,
+            static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
+            cycles_to_us(orch_end_ts - orch_cycle_start)
+        );
+        if (pto2_submitted_tasks >= 0) {
+            LOG_INFO_V9(
+                "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks,
+                sched_ctx_.completed_tasks_count()
+            );
+        }
+#endif
+        LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
+    }
+
+    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
+    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+        // Device orchestration: wait for the primary orchestrator to initialize the SM header
+        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
+        }
+        if (rt == nullptr) {
+            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
+        } else {
+            sched_ctx_.bind_runtime(rt);
+            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx);
+            if (completed < 0) {
+                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed);
+                run_rc = completed;
+            } else {
+                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
+            }
+        }
+    }
+
+    // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
+    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
+    // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
+    int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
+    if (shutdown_rc != 0 && run_rc == 0) {
+        run_rc = shutdown_rc;
+    }
+
+    LOG_INFO_V0("Thread %d: Completed", thread_idx);
+
+    // Check if this is the last thread to finish
+    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
+    if (prev_finished + 1 == aicpu_thread_num_) {
+        finished_.store(true, std::memory_order_release);
+        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
+        // always tear them down here, but we keep the per-cid orch SO entries
+        // alive for the next run's cache-hit reuse (see run() reload_so branch).
+        if (rt != nullptr) {
+            // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            framework_bind_runtime(nullptr);
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+                DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
+                if (bind != nullptr) {
+                    bind(nullptr);
+                }
+            }
+            runtime_destroy(rt, runtime_arena_);
+            rt = nullptr;
+        }
+    }
+
+    return run_rc;
+}
+
+void AicpuExecutor::deinit(Runtime *runtime) {
+    // 1. Invalidate AICPU cache for Runtime address range.
+    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
+    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
+    cache_invalidate_range(runtime, sizeof(Runtime));
+
+    // Reset all SchedulerContext-owned state in one place.
+    sched_ctx_.deinit();
+
+    finished_count_.store(0, std::memory_order_release);
+    runtime_init_ready_.store(false, std::memory_order_release);
+
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+
+    orch_args_cached_.reset();
+    // orch_so_table_ entries are intentionally preserved across deinit: the
+    // next run reuses cached handles when register_new_callable_id() returns
+    // false. The destructor releases them at process teardown.
+
+    // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
+    rt = nullptr;
+
+    // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
+    dep_gen_aicpu_finalize();
+
+    LOG_INFO_V0("DeInit: Runtime execution state reset");
+
+    initialized_.store(false, std::memory_order_release);
+    init_done_.store(false, std::memory_order_release);
+    init_failed_.store(false, std::memory_order_release);
+    thread_idx_.store(0, std::memory_order_release);
+    finished_.store(false, std::memory_order_release);
+
+    LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
+}
+
+// ===== Public Entry Point =====
+
+/**
+ * aicpu_execute - Main AICPU kernel execution entry point
+ *
+ * This is called by DynTileFwkBackendKernelServer in kernel.cpp.
+ * Orchestrates the complete task runtime execution:
+ * 1. Initialize executor (thread-safe, first thread only)
+ * 2. Wait for initialization to complete
+ * 3. Execute tasks on managed cores
+ * 4. Cleanup when last thread finishes
+ *
+ * @param runtime Pointer to Runtime structure
+ * @return 0 on success, non-zero on error
+ */
+extern "C" int32_t aicpu_execute(Runtime *runtime) {
+    if (runtime == nullptr) {
+        LOG_ERROR("%s", "Invalid argument: null Runtime pointer");
+        return -1;
+    }
+
+    LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
+
+    g_aicpu_executor.init(runtime);
+
+    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
+        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) {
+            LOG_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution");
+            return -1;
+        }
+    }
+
+    int32_t rc = g_aicpu_executor.run(runtime);
+    if (rc != 0) {
+        LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
+    }
+
+    int32_t runtime_rc = read_pto2_runtime_status(runtime);
+
+    // Last thread cleans up
+    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
+        LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
+        g_aicpu_executor.deinit(runtime);
+    }
+
+    if (runtime_rc != 0) {
+        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
+        return runtime_rc;
+    }
+
+    if (rc != 0) {
+        return rc;
+    }
+
+    LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
+    return 0;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/build_config.py b/src/a2a3/runtime/fully_distributed_within_core/build_config.py
new file mode 100644
index 000000000..da34f14f9
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/build_config.py
@@ -0,0 +1,32 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+# fully_distributed_within_core runtime build configuration
+# All paths are relative to this file's directory (src/runtime/fully_distributed_within_core/)
+#
+# Goal: orchestration + scheduling + execution run on the AI cores themselves in
+# SPMD fashion, removing AICPU from orchestration/scheduling. See the design spec:
+#   docs/fully_distributed_within_core.md
+#
+# This tree is currently re-based on the tensormap_and_ringbuffer runtime so it
+# is discoverable and compiles; it reuses TensorMap, MixedKernels/ActiveMask,
+# L0TaskArgs, the pto_orchestration_api submit API, and kernel-address
+# resolution. The distributed model (claim race + per-core TensorMap + private
+# task ring + global completion-flag ring) is layered on incrementally per the
+# spec; the AICPU is reduced to an init/teardown stub.
+#
+# The "orchestration" directory contains source files compiled into both
+# runtime targets AND the orchestration .so (e.g., tensor methods needed
+# by the Tensor constructor's validation logic).
+
+BUILD_CONFIG = {
+    "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]},
+    "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]},
+    "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]},
+    "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]},
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/common/intrinsic.h b/src/a2a3/runtime/fully_distributed_within_core/common/intrinsic.h
new file mode 100644
index 000000000..768e6a612
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/common/intrinsic.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file intrinsic.h
+ * @brief SPMD execution context for AICore user kernels
+ *
+ * Topology data exposed to user kernels has two distinct lifetimes:
+ *
+ *   1. Global topology (per-core, fixed after runtime init):
+ *      - sub_block_id : identifies the AIV lane within a cluster
+ *        (0 = AIV0/left, 1 = AIV1/right).  Initialized once at runtime
+ *        startup based on each core's cluster position; never changes.
+ *        Only meaningful for AIV kernels in MIX tasks.
+ *
+ *   2. Local per-dispatch context (changes each dispatch):
+ *      - block_idx : which logical block the current worker is executing
+ *      - block_num : total number of blocks in this task (= block_dim)
+ *      Written by build_payload() before each dispatch.
+ *
+ * Both categories are injected via two pointer slots appended at the tail
+ * of the kernel args[] array:
+ *
+ *   args layout:
+ *     [0 .. tensor_count-1]                 = tensor GM pointers
+ *     [tensor_count .. +scalar_count-1]     = scalar values
+ *     ...
+ *     [SPMD_LOCAL_CONTEXT_INDEX]            = (uint64_t)&LocalContext   (per-dispatch)
+ *     [SPMD_GLOBAL_CONTEXT_INDEX]           = (uint64_t)&GlobalContext  (per-core)
+ *
+ * The suffix positions are compile-time constants and do not depend on the
+ * runtime tensor_count or scalar_count.
+ *
+ * Include this header in AICore kernel source files to use the Get* accessors.
+ * Do NOT depend on the raw index constants; always use the accessor functions.
+ *
+ * On CCEC (real hardware), __gm__ and __aicore__ must be defined before
+ * including this header (e.g. via <pto/pto-inst.hpp> or manual #define).
+ * The #ifndef guards below provide fallbacks for non-kernel builds
+ * (AICPU, HOST) where these qualifiers are not needed.
+ *
+ * IMPORTANT — do NOT mix these with the CCE built-in topology intrinsics
+ * (`get_subblockid()`, `get_block_idx()`, `get_block_num()` declared in
+ * `kernel_operator.h` / tikcfw). Those intrinsics read AICore hardware
+ * registers that simpler's tensormap_and_ringbuffer runtime does NOT
+ * program. Specifically:
+ *
+ *   - CCE `get_subblockid()` returns whatever stale value the AICore
+ *     sub-block register holds — under simpler's MIX dispatch it is 0
+ *     for BOTH AIV0 and AIV1 of every cluster, so a kernel that uses
+ *     it to partition heads will silently have AIV1 redo AIV0's work
+ *     and the AIV1 share of the output is never written. This is the
+ *     exact failure mode that produced the partial-zero output in
+ *     issue #900 (PR #899 spmd_paged_attention_highperf); the kernel
+ *     compiled, ran without error, and produced wrong output. Use
+ *     `get_sub_block_id(args)` instead, which reads from the runtime's
+ *     `GlobalContext.sub_block_id` that the scheduler initializes per
+ *     AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`.
+ *
+ *   - `get_block_idx()` and `get_block_num()` are not redirected to
+ *     simpler's LocalContext either — use the `(args)` variants below
+ *     so the values reflect simpler's logical block_dim (which can
+ *     differ from `RUNTIME_CONFIG.block_dim`, the physical core count).
+ *
+ * If you are porting a kernel originally written for native CANN dispatch
+ * (AscendC, ascend-transformer-boost, etc.), every reference to those
+ * three CCE intrinsics needs to be rewritten against this header. See
+ * `docs/aicore-kernel-programming.md` for the full author contract,
+ * porting checklist, and the worked example from PR #899 / issue #900.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_task_id.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+
+/** Number of extra pointer slots appended to the args[] tail (LocalContext + GlobalContext). */
+static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2;
+
+/**
+ * Args[] suffix indices for context pointers.
+ * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16).
+ * Users should not depend on these values; use the Get* functions below.
+ */
+static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48;
+static constexpr int32_t SPMD_GLOBAL_CONTEXT_INDEX = 49;
+static constexpr int32_t PAYLOAD_LOCAL_CONTEXT_INDEX = SPMD_LOCAL_CONTEXT_INDEX;
+static constexpr int32_t PAYLOAD_GLOBAL_CONTEXT_INDEX = SPMD_GLOBAL_CONTEXT_INDEX;
+
+/**
+ * Per-core global context, stored in PTO2DispatchPayload.
+ * Initialized once at runtime startup (init_global_context) based on each
+ * core's cluster position.  Never modified after initialization.
+ */
+struct GlobalContext {
+    // AIV lane within cluster: 0=AIV0(left), 1=AIV1(right).
+    // Used by AIV to select the correct intra-cluster hw instruction.
+    // Not meaningful for AIC kernels or single-AIV tasks.
+    int32_t sub_block_id;
+};
+
+struct AsyncCtx {
+    volatile __gm__ uint32_t *completion_count;
+    volatile __gm__ int32_t *completion_error_code;
+    volatile __gm__ DeferredCompletionEntry *completion_entries;
+    uint32_t completion_capacity;
+    PTO2TaskId task_token;
+
+    static inline AsyncCtx make(PTO2TaskId task_token, volatile __gm__ DeferredCompletionSlab *buffer) {
+        AsyncCtx ctx{};
+        ctx.task_token = task_token;
+        if (buffer == nullptr) {
+            ctx.task_token = PTO2TaskId::invalid();
+            return ctx;
+        }
+        ctx.completion_count = &buffer->count;
+        ctx.completion_error_code = &buffer->error_code;
+        ctx.completion_entries = &buffer->entries[0];
+        ctx.completion_capacity = MAX_COMPLETIONS_PER_TASK;
+        return ctx;
+    }
+};
+
+/**
+ * Per-dispatch local context, stored in PTO2DispatchPayload.
+ * Written by build_payload() before each dispatch. Different blocks of the
+ * same task receive different block_idx values but the same block_num.
+ *
+ */
+struct LocalContext {
+    int32_t block_idx;  // Logical block index within the task [0, block_num)
+    int32_t block_num;  // How many logical blocks this task requires.
+                        // Currently fixed to 1 (block_dim > 1 not yet implemented).
+                        // NOT the same as RUNTIME_CONFIG.block_dim in kernel_config.py,
+                        // which controls how many physical cores the runtime launches.
+    AsyncCtx async_ctx;
+};
+
+/**
+ * Return the AIV lane index within the cluster.
+ * In a MIX 1C2V task: AIV0(left)=0, AIV1(right)=1.
+ *
+ * This value is only meaningful for AIV kernels in MIX tasks.  It tells
+ * the AIV whether it is the left lane or the right lane within the cluster,
+ * which determines the correct hardware instruction for intra-cluster
+ * communication.
+ *
+ * AIC kernels should NOT call this function.
+ * Single-AIV tasks have no intra-cluster communication, so sub_block_id
+ * has no meaning and should not be used.
+ */
+static __aicore__ inline int32_t get_sub_block_id(__gm__ int64_t *args) {
+    __gm__ GlobalContext *ctx =
+        reinterpret_cast<__gm__ GlobalContext *>(static_cast<uint64_t>(args[SPMD_GLOBAL_CONTEXT_INDEX]));
+    return ctx->sub_block_id;
+}
+
+/**
+ * Return the logical block index assigned to the current worker.
+ * Range: [0, get_block_num(args)).
+ * Within the same task, different blocks receive different indices.
+ */
+static __aicore__ inline int32_t get_block_idx(__gm__ int64_t *args) {
+    __gm__ LocalContext *ctx =
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uint64_t>(args[SPMD_LOCAL_CONTEXT_INDEX]));
+    return ctx->block_idx;
+}
+
+/**
+ * Return how many logical blocks the current task requires.
+ * All blocks of the same task see the same value.
+ * Currently always returns 1 (block_dim>1 not yet implemented).
+ *
+ * Note: this is NOT the same as RUNTIME_CONFIG.block_dim in
+ * kernel_config.py, which controls how many physical cores are launched.
+ */
+static __aicore__ inline int32_t get_block_num(__gm__ int64_t *args) {
+    __gm__ LocalContext *ctx =
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uint64_t>(args[SPMD_LOCAL_CONTEXT_INDEX]));
+    return ctx->block_num;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h b/src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h
new file mode 100644
index 000000000..5d33fe18d
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime Status Helpers
+ *
+ * Shared error-code contract used inside the tensormap_and_ringbuffer runtime.
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
+
+#include <stdint.h>
+
+// Orchestrator errors (1-99): detected in orchestrator thread
+#define PTO2_ERROR_NONE 0  // Explicitly means "no error"; it is not an "unknown/unspecified" error code.
+#define PTO2_ERROR_SCOPE_DEADLOCK 1
+#define PTO2_ERROR_HEAP_RING_DEADLOCK 2
+#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3
+#define PTO2_ERROR_DEP_POOL_OVERFLOW 4
+#define PTO2_ERROR_INVALID_ARGS 5         // Arg construction error (invalid args)
+#define PTO2_ERROR_DEPENDENCY_OVERFLOW 6  // Too many unique fanin dependencies for one task
+#define PTO2_ERROR_REQUIRE_SYNC_START_INVALID 7
+#define PTO2_ERROR_TENSOR_WAIT_TIMEOUT 8
+#define PTO2_ERROR_EXPLICIT_ORCH_FATAL 9
+#define PTO2_ERROR_SCOPE_TASKS_OVERFLOW 10  // scope_tasks buffer saturated (all rings full)
+
+// Scheduler errors (100+): detected in scheduler threads
+#define PTO2_ERROR_SCHEDULER_TIMEOUT 100
+#define PTO2_ERROR_ASYNC_COMPLETION_INVALID 101
+#define PTO2_ERROR_ASYNC_WAIT_OVERFLOW 102
+#define PTO2_ERROR_ASYNC_REGISTRATION_FAILED 103
+
+static inline int32_t runtime_status_from_error_codes(int32_t orch_error_code, int32_t sched_error_code) {
+    if (orch_error_code != PTO2_ERROR_NONE) {
+        return orch_error_code < 0 ? orch_error_code : -orch_error_code;
+    }
+    if (sched_error_code != PTO2_ERROR_NONE) {
+        return sched_error_code < 0 ? sched_error_code : -sched_error_code;
+    }
+    return 0;
+}
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md b/src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md
new file mode 100644
index 000000000..0de4f96ba
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md
@@ -0,0 +1,330 @@
+# Multi-Ring Buffer Architecture
+
+> Extension to the PTO2 runtime. For the base architecture, see [RUNTIME_LOGIC.md](RUNTIME_LOGIC.md).
+
+## 1. Problem
+
+The single-ring design uses one `last_task_alive` watermark shared by HeapRing, TaskRing, and DepPool. When tasks from an inner scope (e.g., per-block iteration) complete, their resources cannot be reclaimed until **all** prior tasks — including those from the outer scope — also complete. This wastes ring capacity and can trigger deadlocks when ring sizes are small.
+
+## 2. Solution
+
+Split HeapRing, TaskRing, and DepPool into arrays of `PTO2_MAX_RING_DEPTH` (4) independent instances. Each scope depth maps to its own ring, with an independent `last_task_alive` watermark.
+
+```text
+Scope depth 0  ──►  rings[0] = { HeapRing, TaskRing, DepPool }
+Scope depth 1  ──►  rings[1] = { HeapRing, TaskRing, DepPool }
+Scope depth 2  ──►  rings[2] = { HeapRing, TaskRing, DepPool }
+Scope depth ≥3 ──►  rings[3] = { HeapRing, TaskRing, DepPool }  (clamped)
+```
+
+Inner-scope tasks can now be reclaimed independently without waiting for outer-scope tasks to complete.
+
+## 3. Task ID Encoding
+
+Task IDs are widened from 32-bit to 64-bit to carry the ring identity:
+
+```text
+task_id.raw = (ring_id << 32) | local_id
+```
+
+`PTO2TaskId` exposes direct accessors in `pto_runtime2_types.h`:
+
+| API | Purpose |
+| --- | ------- |
+| `PTO2TaskId::make(ring_id, local_id)` | Compose a 64-bit task ID (`PTO2TaskId`) |
+| `task_id.ring()` | Extract `ring_id` (bits 63-32) |
+| `task_id.local()` | Extract `local_id` (bits 31-0) |
+| `task_id.raw` | Access the packed 64-bit encoding |
+
+Type changes:
+
+| Field | Before | After |
+| ----- | ------ | ----- |
+| `PTO2TaskDescriptor.task_id` | `int32_t` | `PTO2TaskId` |
+| `PTO2TensorMapEntry.producer_task_id` | `int32_t` | `PTO2TaskId` |
+| `PTO2TaskSlotState.ring_id` | N/A | `uint8_t` (new, denormalized for fast access) |
+
+## 4. Data Structures
+
+### 4.1 PTO2RingSet (new)
+
+Bundles the three per-ring resources into a single aggregate (`pto_ring_buffer.h`):
+
+```cpp
+struct PTO2RingSet {
+    PTO2HeapRing   heap_ring;
+    PTO2TaskRing   task_ring;
+    PTO2FaninPool fanin_pool;
+};
+```
+
+### 4.2 PTO2OrchestratorState (modified)
+
+```cpp
+// Before: single ring
+PTO2HeapRing heap_ring;
+PTO2TaskRing task_ring;
+PTO2DepListPool dep_pool;
+
+// After: per-ring array (dep_pool moved to scheduler, see §4.5)
+PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
+```
+
+Ring selection: `current_ring_id() = min(scope_stack_top, PTO2_MAX_RING_DEPTH - 1)`.
+
+### 4.3 PTO2SharedMemoryHeader (modified)
+
+Per-ring flow control and per-ring layout info are grouped together:
+
+```cpp
+struct PTO2RingFlowControl {
+    std::atomic<int32_t> current_task_index;  // task ring head
+    std::atomic<int32_t> last_task_alive;     // task ring tail
+    std::atomic<uint64_t> heap_top;           // heap alloc pointer
+    std::atomic<uint64_t> heap_tail;          // heap reclaim pointer
+};
+
+struct alignas(64) PTO2SharedMemoryRingHeader {
+    PTO2RingFlowControl fc;
+
+    // Layout metadata (set once at init)
+    uint64_t task_window_size;
+    int32_t task_window_mask;       // task_window_size - 1
+    uint64_t heap_size;
+    uint64_t task_descriptors_offset;
+
+    // Per-ring data pointers (host-side, set by setup_pointers)
+    PTO2TaskDescriptor *task_descriptors;
+    PTO2TaskPayload *task_payloads;
+    PTO2TaskSlotState *slot_states;
+
+    // Accessors (slot = local_id & task_window_mask)
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot);
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id);
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot);
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id);
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot);
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id);
+};
+
+// In header:
+PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
+```
+
+Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving watermark writes within the same ring. `FaninPool`/`DepListPool` `reclaim`/`ensure_space` take `PTO2SharedMemoryRingHeader&` directly (no `ring_id` or `fc` parameters).
+
+### 4.4 PTO2SharedMemoryHandle (lifecycle-only)
+
+Slimmed to lifecycle management only. Per-ring data pointers now live in `PTO2SharedMemoryRingHeader` (§4.3). Runtime components (orchestrator, scheduler) store `PTO2SharedMemoryHeader*` directly, eliminating one indirection on every per-ring access.
+
+```cpp
+struct PTO2SharedMemoryHandle {
+    void *sm_base;
+    uint64_t sm_size;
+    PTO2SharedMemoryHeader *header;
+    bool is_owner;
+};
+```
+
+### 4.5 PTO2SchedulerState (modified)
+
+```cpp
+struct RingSchedState {
+    // Cache Line 0: ring pointer (read-only) + hot path (read-write)
+    PTO2SharedMemoryRingHeader *ring;  // direct pointer, no indirection
+    int32_t last_task_alive;
+    std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+    // Cache Line 1+: Thread 0 only (wiring dep_pool, cache-isolated)
+    alignas(64) PTO2DepListPool dep_pool;
+};
+
+RingSchedState ring_sched_states[PTO2_MAX_RING_DEPTH];
+PTO2SpscQueue wiring_queue;  // global SPSC queue: orchestrator pushes, scheduler thread 0 drains
+```
+
+`slot_states`, `task_window_size`, and `task_window_mask` are no longer duplicated — callers access them via `ring->get_slot_state_by_*()` and other ring header accessors. The ring pointer shares cache line 0 with `last_task_alive` and `advance_lock`.
+
+### 4.6 PTO2TensorMap (modified)
+
+```cpp
+PTO2TensorMapEntry** task_entry_heads[PTO2_MAX_RING_DEPTH];
+int64_t last_task_alives[PTO2_MAX_RING_DEPTH];
+```
+
+Entry validity checks and `cleanup_retired` operate per-ring:
+
+```cpp
+bool entry_valid(const PTO2TensorMapEntry& e) {
+    int32_t ring = e.producer_task_id.ring();
+    int32_t local = e.producer_task_id.local();
+    return local >= last_task_alives[ring];
+}
+```
+
+### 4.7 Unchanged Structures
+
+| Structure | Reason |
+| --------- | ------ |
+| `PTO2DepListEntry` | Stores `PTO2TaskSlotState*` pointer — naturally crosses ring boundaries |
+| `PTO2TaskPayload` | `fanin_slot_states[]` are pointers — no ring coupling |
+| `PTO2ReadyQueue` | Global ready queues shared across all rings (tasks ready to dispatch regardless of origin ring) |
+| `PTO2DispatchPayload` | Built per-dispatch, no ring state needed |
+
+## 5. Reclamation
+
+### 5.1 Per-Ring Watermark Advancement
+
+Each ring's `last_task_alive` advances independently:
+
+```text
+advance_ring_pointers(ring_id):  // protected by per-ring advance_lock
+    la = ring->fc.last_task_alive
+    while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED:
+        reset slot for reuse
+        la++
+    sync_to_sm()  // release-store last_task_alive
+```
+
+Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving heap_tail writes within the same ring.
+
+### 5.2 Cross-Ring Dependencies
+
+Dependency edges use `PTO2TaskSlotState*` pointers, which naturally span rings:
+
+- Ring 1 task depends on ring 0 producer → ring 0's `fanout_head` linked list contains a ring 1 `PTO2TaskSlotState*`
+- When ring 0 task completes, it walks its fanout list and decrements ring 1 consumers' `fanin_refcount`
+- No special cross-ring logic needed — pointer-based design is ring-agnostic
+
+### 5.3 DepPool Reclamation
+
+DepPool is exclusively managed by scheduler thread 0 (allocation during wiring, reclamation during watermark advancement):
+
+```text
+// Called by scheduler thread 0 during wiring_queue drain:
+dep_pool_reclaim(ring_id):
+    la = ring->fc.last_task_alive
+    newest_consumed = la - 1
+    mark = ring->get_slot_state_by_task_id(newest_consumed).dep_pool_mark
+    if mark > 0:
+        ring_sched_states[ring_id].dep_pool.advance_tail(mark)
+```
+
+Note: dep entries from ring N's pool may appear in ring M's fanout lists. Reclamation is safe because the entries are accessed during fanout traversal (completion time), which always happens before the consumer task — and therefore the dep entry — becomes eligible for reclamation.
+
+## 6. AICPU Register Protocol Fix
+
+The AICore dispatch protocol uses 32-bit registers. With multi-ring, `task_id` truncation to 32-bit loses the `ring_id`, causing collisions:
+
+```text
+Ring 0, local_id=0  →  DATA_MAIN_BASE = 0 + 1 = 1
+Ring 1, local_id=0  →  DATA_MAIN_BASE = 0 + 1 = 1  (collision!)
+```
+
+AICore uses `last_reg_val` to detect new dispatches — identical values cause skipped tasks and false completions from stale COND registers.
+
+**Fix**: Per-core monotonic dispatch counter `s_dispatch_seq[core_id]` replaces `task_id` in register writes, guaranteeing unique `DATA_MAIN_BASE` values per core regardless of ring origin.
+
+## 7. Configuration
+
+### 7.1 Compile-Time Defaults (per ring)
+
+| Constant | Default | Total (×4 rings) |
+| -------- | ------- | ---------------- |
+| `PTO2_TASK_WINDOW_SIZE` | 16384 | 65536 |
+| `PTO2_HEAP_SIZE` | 256 MB | 1 GB |
+| `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 |
+
+### 7.2 Runtime Overrides
+
+Ring sizing can be configured either uniformly for every ring or independently
+per ring. Precedence is resolved independently for each resource and ring:
+
+```text
+per-ring CallConfig value
+  > scalar CallConfig value
+  > per-ring PTO2_RING_* env value
+  > scalar PTO2_RING_* env value
+  > compile-time default
+```
+
+`ring_id` is the scope-depth ring selected by the runtime:
+
+```text
+scope depth 0 -> ring 0
+scope depth 1 -> ring 1
+scope depth 2 -> ring 2
+scope depth >=3 -> ring 3
+```
+
+Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can
+each carry their own sizes. Invalid values raise at submit time (`validate()`).
+The scalar fields preserve the old behavior and broadcast one value to all
+rings:
+
+```python
+cfg = CallConfig()
+cfg.runtime_env.ring_task_window = 128   # power of 2, >= 4
+cfg.runtime_env.ring_heap = 262144       # bytes/ring, >= 1024
+cfg.runtime_env.ring_dep_pool = 256      # 4 .. INT32_MAX
+orchestrator.submit_next_level(handle, args, cfg)
+```
+
+Set the array fields to tune the four scope-depth rings independently. Each
+array must contain exactly four entries; use `0` for an entry that should fall
+through to the next precedence tier. All `CallConfig` values are integer
+byte/count values.
+
+```python
+cfg = CallConfig()
+cfg.runtime_env.ring_task_windows = [8192, 16384, 131072, 524288]
+cfg.runtime_env.ring_heaps = [
+    128 * 1024 * 1024,
+    256 * 1024 * 1024,
+    384 * 1024 * 1024,
+    512 * 1024 * 1024,
+]
+cfg.runtime_env.ring_dep_pools = [4096, 8192, 16384, 32768]
+orchestrator.submit_next_level(handle, args, cfg)
+```
+
+Scene tests set the same keys under a nested `runtime_env` block in the
+per-case `config` dict:
+
+```python
+"config": {
+    "runtime_env": {
+        "ring_task_windows": [8192, 16384, 131072, 524288],
+        "ring_heaps": [134217728, 268435456, 402653184, 536870912],
+        "ring_dep_pools": [4096, 8192, 16384, 32768],
+    }
+}
+```
+
+Process-wide env fallback accepts either one scalar value or exactly four
+comma-separated per-ring values. Invalid env values are logged and ignored, then
+fall through to defaults. `PTO2_RING_HEAP` values are integer bytes:
+
+```bash
+# Uniform, old behavior:
+PTO2_RING_TASK_WINDOW=1024
+PTO2_RING_HEAP=1048576
+PTO2_RING_DEP_POOL=1024
+
+# Per-ring, indexed by ring_id 0..3:
+PTO2_RING_TASK_WINDOW=8192,16384,131072,524288
+PTO2_RING_HEAP=134217728,268435456,402653184,536870912
+PTO2_RING_DEP_POOL=4096,8192,16384,32768
+```
+
+Use `--enable-scope-stats` to confirm the effective values for a real run. The
+first line of `scope_stats/scope_stats.jsonl` includes `task_window_max`,
+`heap_max`, and `dep_pool_max`, indexed by `ring`.
+
+### 7.3 Sizing Guidelines
+
+- `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes
+- `heap` must accommodate peak output buffer allocation across all in-flight tasks on that ring
+- `dep_pool` must be ≥ total dependency entries for all in-flight tasks on that ring
+- On hardware, back-pressure latency is higher than in simulation — size conservatively
+- Adding inner `PTO2_SCOPE` reduces peak per-ring usage, enabling smaller sizes
diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md
new file mode 100644
index 000000000..e6760fb1e
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md
@@ -0,0 +1,39 @@
+# Runtime Logic: fully_distributed_within_core
+
+**Target design.** Orchestration, scheduling, and execution all run on the AI
+cores in SPMD fashion; the AICPU is removed from orchestration/scheduling. The
+authoritative specification is:
+
+- [`docs/fully_distributed_within_core.md`](../../../../docs/fully_distributed_within_core.md)
+
+Core elements (see the spec):
+
+- Task ownership via a claim race over two global cursors (`cube_cursor`,
+  `vector_cursor`); `owner = builder = executor`.
+- Per-core full-duplicate TensorMap for dependency discovery (pull model via a
+  global `task_completed_flag` ring).
+- Per-core private task ring + block-shared `block.won[N]` deposit table for
+  multi-core (MIX / 2V) co-ownership (anchor push + follower async drain).
+- Deterministic, per-core-replicated GM output heap with frontier-based
+  reclamation.
+
+## Current state (re-based on tensormap_and_ringbuffer)
+
+This runtime is re-based on `tensormap_and_ringbuffer` to reuse its
+`PTO2TensorMap`, `MixedKernels`/`ActiveMask`, `L0TaskArgs`, the
+`pto_orchestration_api.h` submit API, and kernel-address resolution. The
+distributed model is layered on incrementally:
+
+- `runtime/` — adds global claim cursors, a global completion-flag ring, a
+  deterministic GM output heap, and per-core replicated TensorMap + private task
+  ring on top of the reused types.
+- `aicore/` — the SPMD run-ahead orchestrate+execute loop (spec section 6).
+- `aicpu/` — reduced to an init/wire/signal/wait stub (no orchestration,
+  scheduling, or dispatch).
+- `host/` — runtime maker / compile info (orchestration entry is invoked on the
+  cores).
+- `orchestration/` — the PTO2 orchestration API (unchanged surface).
+
+The legacy AICPU orchestrator/scheduler sources inherited from
+`tensormap_and_ringbuffer` (`runtime/scheduler/`, the orchestrator pipeline) are
+progressively replaced or bypassed by the distributed path.
diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md b/src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md
new file mode 100644
index 000000000..ef1de83b4
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md
@@ -0,0 +1,137 @@
+# Scalar Data Access — get/set_tensor_data Design
+
+## 1. Overview
+
+During task graph construction, orchestration sometimes needs to read InCore kernel results (for control-flow decisions) or write initial values into tensors. `get_tensor_data` / `set_tensor_data` provide **blocking** cross-layer data access, allowing orchestration to safely read and write tensor data.
+
+**Core design principle**: Reuse the existing TensorMap dependency tracking mechanism — no new synchronization infrastructure.
+
+## 2. API
+
+```cpp
+// Blocking read: returns value at the given indices (default: raw uint64_t bits)
+// Specify T for typed read: float val = get_tensor_data<float>(tensor, 1, idx);
+template<typename T = uint64_t>
+T get_tensor_data(const Tensor& tensor, uint32_t ndims, const uint32_t indices[]);
+
+// Blocking write: stores value at the given indices (type deduced from argument)
+// Typed write: set_tensor_data(tensor, 1, idx, 42.0f);
+template<typename T = uint64_t>
+void set_tensor_data(Tensor& tensor, uint32_t ndims, const uint32_t indices[], T value);
+```
+
+Both call into the runtime through the ops table — orchestration .so needs no runtime symbol linkage.
+
+## 3. Blocking Interface Design
+
+### 3.1 get_tensor_data Flow
+
+```text
+addr null-check → TensorMap lookup → spin-wait producer COMPLETED → compute flat offset → memcpy read
+```
+
+- **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0
+- **TensorMap lookup**: find producer task by `buffer.addr`
+- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED`
+- **No producer** (lookup callback never fires): skip waiting, read immediately
+
+### 3.2 set_tensor_data Flow
+
+```text
+addr null-check → TensorMap lookup → spin-wait producer COMPLETED → spin-wait consumers done → memcpy write
+```
+
+One extra step versus get_tensor_data: wait for all consumers to finish (`fanout_refcount >= fanout_count - 1`, excluding the scope reference).
+
+### 3.3 Timeout
+
+- Uses cycle counter (`get_sys_cnt_aicpu()`), checked every 1024 spins
+- Threshold: `PTO2_TENSOR_DATA_TIMEOUT_CYCLES` (~10 s at 1.5 GHz)
+- On timeout: sets `orch.fatal = true`, preventing further task submission
+
+## 4. add_output with Initial Value
+
+```cpp
+TensorCreateInfo ci(shapes, ndims, dtype);
+ci.set_initial_value(initial_value);
+args.add_output(ci);
+```
+
+**Mechanism**:
+
+1. `ci.set_initial_value(value)` marks the create-info with an initial value before submission
+2. `add_output(ci)` stores a pointer to `ci` in `L0TaskArgs` (the original must remain valid until submit)
+3. During payload init, the output tensor is materialized via `init_from_create_info()` which triggers the fill
+4. Fill strategy:
+   - Small buffer (< 64 B): element-by-element memcpy directly into dst
+   - Large buffer (≥ 64 B): fill the first 64 bytes as a template block, then bulk-memcpy in 64 B chunks; partial tail copy for remainder
+
+**Constraint**: existing tensors are write targets only through `add_inout()`.
+
+## 5. Scalar Dependencies via 1-Element Tensors
+
+Traditional scalars (`L0TaskArgs::add_scalar`) are one-way inputs with no TensorMap tracking. For cross-task scalar values, use a 1-element tensor as the carrier:
+
+```cpp
+uint32_t shapes[1] = {1};
+TensorCreateInfo scalar_ci(shapes, 1, DataType::FLOAT32);
+
+// Submit with initial value and keep the returned tensor
+scalar_ci.set_initial_value(float_to_u64(77.0f));
+L0TaskArgs args;
+args.add_output(scalar_ci);
+TaskOutputTensors outs = rt_submit_aiv_task(FUNC_NOOP, args);
+const Tensor& scalar_tensor = outs.get_ref(0);
+
+// Orchestration-side blocking read (waits for kernel completion)
+uint32_t idx[1] = {0};
+float val = get_tensor_data<float>(scalar_tensor, 1, idx);
+```
+
+**Advantage**: Fully reuses existing TensorMap (producer tracking, fanin/fanout dependencies) — no new infrastructure needed.
+
+## 6. Data Hazard Analysis
+
+Three actors:
+
+- **Kernel**: InCore task submitted via add_input/add_output/add_inout (asynchronous execution)
+- **Orch Read**: orchestration calls `get_tensor_data` (blocking read)
+- **Orch Write**: orchestration calls `set_tensor_data` (blocking write)
+
+### Hazard Matrix (earlier operation → later operation)
+
+| # | Earlier Op | Later Op | Hazard | Guarantee | Safe? |
+| - | ---------- | -------- | ------ | --------- | ----- |
+| 1 | Kernel write (OUTPUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes |
+| 2 | Kernel write (OUTPUT) | Orch Write | WAW | spin-wait producer COMPLETED | Yes |
+| 3 | Kernel read (INPUT) | Orch Write | WAR | spin-wait fanout_refcount | **Needs INOUT** |
+| 4 | Kernel read-write (INOUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes |
+| 5 | Kernel read-write (INOUT) | Orch Write | WAW+WAR | spin-wait producer + consumers | Yes |
+| 6 | Orch Write | Kernel read (INPUT) | RAW | blocking completes before next submit | Yes |
+| 7 | Orch Write | Kernel write (OUTPUT) | WAW | same — serial guarantee | Yes |
+| 8 | Orch Read | Kernel write (OUTPUT) | WAR | same — serial guarantee | Yes |
+| 9–12 | Orch ↔ Orch | — | — | same-thread serial execution | Yes |
+
+### Key Design Points
+
+**Scenario #3 is the only case requiring special attention**:
+
+TensorMap tracks only producers (OUTPUT/INOUT), not pure INPUT consumers. If a tensor is only registered via `add_input()`, TensorMap has no producer entry for it. `set_tensor_data`'s `wait_for_tensor_ready()` finds no matching producer (the lookup callback never fires) and returns immediately — but the kernel may still be reading → **WAR data race**.
+
+**Solution**: For tensors that may later be written via `set_tensor_data`, use `add_inout()` instead of `add_input()`. INOUT registers a producer entry in TensorMap, enabling `set_tensor_data` to track all consumers through `fanout_refcount`.
+
+**Scenarios #6–8 serial guarantee**:
+
+get/set_tensor_data are blocking calls, and orchestration is single-threaded serial submission. After a blocking operation completes, subsequent code (including task submissions) executes strictly afterward.
+
+## 7. External Tensor Behavior
+
+`make_tensor_external()` creates tensors with a pre-set `buffer.addr` (pointing to host-allocated device memory).
+
+| Scenario | Behavior |
+| -------- | -------- |
+| External tensor never submitted as OUTPUT/INOUT | No TensorMap entry — get/set execute immediately |
+| External tensor previously submitted as OUTPUT/INOUT | TensorMap has producer entry — get/set spin-wait |
+| External tensor submitted as INPUT, then set_tensor_data | **WAR risk** — must use INOUT instead (same as scenario #3) |
+
+**Key rule**: If an external tensor will later be written via `set_tensor_data`, all prior kernel accesses must use `add_inout()`, not `add_input()`.
diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md
new file mode 100644
index 000000000..8cba7e90c
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md
@@ -0,0 +1,222 @@
+# Submit by Cluster - Requirements and Main-Branch-Aligned Design
+
+## 1. Goal
+
+Define a single, main-branch-aligned specification for PTO2 cluster submission that combines:
+
+1. Product requirements (what must be true).
+2. Runtime design (how it is implemented on current main baseline).
+
+The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular.
+
+## 2. Background and Motivation
+
+Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`).
+The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels.
+
+Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster.
+
+## 3. Scope
+
+### In Scope
+
+1. New orchestration-facing submit API for cluster-aware mixed submission.
+2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit.
+3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity.
+4. AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets).
+
+### Out of Scope
+
+1. User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs).
+2. New worker types beyond AIC/AIV.
+3. Cross-cluster user placement policies.
+4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster.
+
+## 4. Main-Branch Baseline Constraints
+
+Design must preserve the current main runtime architecture:
+
+1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
+2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).
+
+## 5. Terminology
+
+1. `cluster`: one physical unit with `1 AIC + 2 AIV`.
+2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots.
+3. `MixedTask`: one runtime graph node created by one submit call.
+4. `active_mask`: bitmask of active subtask slots.
+5. `resource shape`: normalized lane demand class of a mixed task.
+
+## 6. API Contract
+
+```cpp
+inline constexpr int32_t INVALID_KERNEL_ID = -1;
+
+struct MixedKernels {
+    int32_t aic_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
+};
+
+static inline void rt_submit_task(PTO2Runtime* rt,
+                                       const MixedKernels& mixed_kernels,
+                                       Arg* args,
+                                       int32_t num_args);
+
+static inline void rt_submit_aic_task(PTO2Runtime* rt,
+                                           int32_t kernel_id,
+                                           Arg* args,
+                                           int32_t num_args);
+
+static inline void rt_submit_aiv_task(PTO2Runtime* rt,
+                                           int32_t kernel_id,
+                                           Arg* args,
+                                           int32_t num_args);
+```
+
+Rules:
+
+1. One submit call creates one `MixedTask`.
+2. All active slots share the same `args` and `num_args`.
+3. At least one slot must be active.
+4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent.
+5. Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries.
+6. Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers.
+7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API.
+
+## 7. Data Model (Requirements + Design)
+
+`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state:
+
+1. `task_id`
+2. `active_mask`
+3. `completed_subtasks` (atomic counter, incremented per subtask completion)
+4. `kernel_id[3]` for `(AIC, AIV0, AIV1)`
+5. dependency heads/counters and packed-buffer metadata
+
+`PTO2TaskPayload` (cold path) carries:
+
+1. shared args/tensors/scalars copied once per mixed submit
+2. fanin mixed-task IDs
+3. other cold-path submit metadata
+
+Producer identity in TensorMap is mixed-task ID end-to-end.
+
+## 8. Scheduling Model
+
+### 8.1 Resource Shapes
+
+Runtime uses shape-based ready queues (not worker-type queues):
+
+1. `AIC_ONLY`
+2. `AIV_X1`
+3. `AIV_X2`
+4. `AIC_AIV_X1`
+5. `AIC_AIV_X2`
+
+Queueing key is normalized resource shape (not raw slot label).
+
+### 8.2 Atomic Cluster Dispatch
+
+1. Dispatch decision unit is one mixed task.
+2. For multi-slot mixed tasks, partial launch is forbidden.
+3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes.
+4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes.
+
+### 8.3 Dependency and Completion
+
+1. Fanin release/readiness remains dependency-correct and graph-level.
+2. Two-stage completion:
+   - `on_subtask_complete(task_id, subslot)`
+   - `on_task_complete(task_id)` only when `completed_subtasks == total_required_subtasks`
+3. Downstream release is triggered once per mixed task completion, not once per subslot.
+
+## 9. Executor Ownership and Numbering
+
+### 9.1 Canonical Flattened Numbering (Unchanged)
+
+Given `block_dim` clusters:
+
+1. AIC IDs: `[0, block_dim)`
+2. AIV IDs: `[block_dim, 3 * block_dim)`
+3. Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}`
+
+This project-defined flattened numbering is kept unchanged.
+
+### 9.2 Cluster Ownership
+
+1. One cluster must be owned by one scheduler domain/thread at a time.
+2. No split-cluster ownership in either:
+   - initial `assign_cores_to_threads()`
+   - post-orchestrator `reassign_cores_for_all_threads()`
+3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
+
+## 10. Functional Requirements
+
+### 10.1 Valid Mixed Shapes
+
+1. AIC only
+2. AIV only (1 or 2 AIV lanes)
+3. AIC + 1 AIV
+4. AIC + 2 AIV
+
+### 10.2 Runtime Behavior per Submit
+
+1. Validate submit arguments.
+2. Allocate mixed-task ID and initialize descriptor/payload/slot_state once.
+3. Lookup producers via TensorMap; collect fanin metadata and increment producers' `fanout_count`.
+4. Push task to scheduler's wiring queue (scheduler thread 0 asynchronously wires fanout edges and determines readiness).
+5. Dispatch all active lanes atomically when resources allow.
+6. Aggregate completion and release downstream once.
+
+## 11. Non-Functional Requirements
+
+1. Correctness: no dependency violation, no partial mixed-task dispatch.
+2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent.
+3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required.
+4. Performance: no obvious regression for non-cluster workflows.
+5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete.
+
+## 12. Acceptance Criteria
+
+Feature is accepted when:
+
+1. Orchestration compiles and submits via `MixedKernels` API/wrappers.
+2. Scheduler dispatches each mixed task as one cluster scheduling decision.
+3. Dependencies gate mixed-task readiness correctly.
+4. AIV execution remains cluster-local and semantically equivalent across lanes.
+5. Existing non-cluster workflows continue to pass without behavior regression.
+6. Cluster ownership is never split across scheduler domains before/after transition.
+
+## 13. Verification Matrix
+
+Recommended validation coverage:
+
+1. Mapping correctness for cluster-to-core ID relation.
+2. Atomic dispatch for multi-slot shapes.
+3. Dependency gating and completion aggregation (`done_mask == active_mask`).
+4. Lane-occupancy co-residency behavior for compatible shapes.
+5. Core-transition ownership stability.
+6. Invalid submit handling (`always_assert` path).
+7. Regression coverage for existing examples/tests.
+
+Milestone command (device):
+
+```bash
+python tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py \
+  -p a2a3 -d 9
+```
+
+Final validation:
+
+```bash
+pytest examples tests/st --platform a2a3
+```
+
+## 14. Resolved Decisions
+
+1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract.
+2. Invalid mixed submits fail with existing submit-time assert behavior.
+3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant.
+4. Submit-contract types live in one shared header-only surface.
+5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee.
diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md b/src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md
new file mode 100644
index 000000000..af661d440
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md
@@ -0,0 +1,166 @@
+# PTO2 Device Log Profiling Guide
+
+## How to Find Device Logs
+
+AICPU logs (via `LOG_INFO_V9`) are written by CANN's **dlog** subsystem and do **not** appear in the `python test_*.py` / pytest terminal output. They are written to CANN's device log directory:
+
+```text
+$HOME/ascend/log/debug/device-<device_id>/device-<pid>_<timestamp>.log
+```
+
+Each run produces a new log file (or appends to an existing one). Find the most recent file by modification time:
+
+```bash
+ls -lt $HOME/ascend/log/debug/device-<device_id>/ | head -5
+```
+
+## Log Structure Overview
+
+A single run produces two profiling blocks in the device log:
+
+| Block | Emitted by | Function | Content |
+| ----- | ---------- | -------- | ------- |
+| **Orchestrator Profiling** | Thread 3 (orchestrator) | `aicpu_orchestration_entry` | Time breakdown of graph construction on device |
+| **PTO2 Scheduler Summary** | Threads 0/1/2 (schedulers) | `SchedulerContext::resolve_and_dispatch` | Per-thread scheduling statistics, phase timing, and lock contention |
+
+All timing values are in microseconds (us), converted from AICPU cycle counters.
+
+---
+
+## Block 1: Orchestrator Profiling
+
+Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_entry`, and prints a profiling summary after it returns.
+
+### Example (from a real run: batch=64, 16704 tasks)
+
+```text
+Thread 3: Calling aicpu_orchestration_entry from SO
+Thread 3: aicpu_orchestration_entry returned, cost 20943.940us
+Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us ===
+Thread 3:   sync_tensormap : 286.300us (2.0%)
+Thread 3:   task_ring_alloc: 380.400us (2.6%)
+Thread 3:   param_copy     : 2147.800us (14.7%)
+Thread 3:   lookup+dep     : 7290.300us (49.9%)
+Thread 3:   heap_alloc     : 701.500us (4.8%)
+Thread 3:   tensormap_ins  : 1890.380us (12.9%)
+Thread 3:   fanin+ready    : 1207.400us (8.3%)
+Thread 3:   finalize+SM    : 697.500us (4.8%)
+Thread 3:   scope_end      : 364.080us
+Thread 3:   avg/task       : 0.874us
+Thread 3: PTO2 total submitted tasks = 16704
+```
+
+### Field Reference
+
+| Field | Source (`pto_orchestrator.cpp`) | Description |
+| ----- | ------------------------------- | ----------- |
+| **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead |
+| **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks |
+| **sync_tensormap** | `g_orch_sync_cycle` | TensorMap validity sync and optional cleanup before each submission |
+| **task_ring_alloc** | `g_orch_alloc_cycle` | Allocating a task slot from the task ring buffer |
+| **param_copy** | `g_orch_args_cycle` | Copying param descriptors + tensor descriptor copies into task-owned storage |
+| **lookup+dep** | `g_orch_lookup_cycle` | TensorMap lookup for inputs/inouts + building fanin/fanout dependency edges |
+| **heap_alloc** | `g_orch_heap_cycle` | Allocating packed output buffers from the heap ring |
+| **tensormap_ins** | `g_orch_insert_cycle` | Inserting output/inout tensors into the TensorMap |
+| **fanin+ready** | `g_orch_fanin_cycle` | Building the fanin list + checking if task is already ready (Step 5/5b) |
+| **scope_end** | `g_orch_scope_end_cycle` | `end_scope` overhead (notifying scheduler of scope completion) |
+| **avg/task** | `total / submit_count` | Average orchestrator time per task submission |
+
+### Interpreting the Numbers
+
+- **cost > total**: The difference is overhead outside `submit_task` (the orchestration user code itself, scope_begin/end, TensorCreateInfo construction, etc.).
+- **lookup+dep** is typically the dominant cost (~50%) because it involves TensorMap hash lookups and building dependency edges with spinlock-protected fanout list insertions.
+- **param_copy** scales with the number of parameters per task.
+- **avg/task < 1us** indicates efficient graph construction.
+
+---
+
+## Block 2: PTO2 Scheduler Summary
+
+Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**.
+
+### Example (Thread 0, from a different run: batch=1, 1044 tasks)
+
+```text
+Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop)
+Thread 0: --- Phase Breakdown ---
+Thread 0:   complete:    1485.020us (42.7%)
+Thread 0:   scan:        14.400us (0.4%)
+Thread 0:   dispatch:    1973.060us (56.7%)
+Thread 0:   idle:        4.940us (0.1%)
+```
+
+### Summary Line
+
+```text
+Thread N: completed=X tasks in Yus (Z loops, W tasks/loop)
+```
+
+| Field | Description |
+| ----- | ----------- |
+| **completed** | Number of tasks this thread processed to completion |
+| **Y us** | Total scheduler loop time (sum of all phase cycles) |
+| **Z loops** | Number of scheduler loop iterations |
+| **W tasks/loop** | Average tasks completed per loop iteration; higher = better throughput |
+
+### Phase Breakdown
+
+The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations.
+
+| Phase | What it does | Inline stats |
+| ----- | ------------ | ------------ |
+| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(task_id, subslot)` to increment the completion counter; when `completed_subtasks == total_required_subtasks`, triggers `on_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release |
+| **scan** | Updates the perf profiling header with latest scheduler state | — |
+| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) |
+| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — |
+
+**Interpreting phase percentages:**
+
+- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`).
+- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement).
+- **scan** is small (<1%) — only updates the perf header.
+- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved.
+
+**Interpreting pop hit_rate:**
+
+- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient.
+- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself.
+
+### Per-Task Averages
+
+Divide each thread's phase times by its `completed` count to get per-task scheduling cost:
+
+| Metric | Formula | Typical value |
+| ------ | ------- | ------------- |
+| Scheduling overhead per task | total_time / completed | ~5-10 us/task |
+| Dispatch per task | dispatch_time / completed | ~3-6 us/task |
+| Complete per task | complete_time / completed | ~2-4 us/task |
+
+---
+
+## Cross-Referencing with Host Profiling
+
+When `--enable-l2-swimlane` is used, the host terminal prints a **Task Statistics by Function** table with `Total_Exec` (total AICore kernel execution time). Combined with device log data:
+
+| Metric | Source | Description |
+| ------ | ------ | ----------- |
+| Avg kernel exec time | `Total_Exec / total_tasks` (host) | Time AICore spends executing each kernel |
+| Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task |
+| Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution |
+
+A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow.
+
+---
+
+## Quick Reference: Extracting Profiling Data
+
+```bash
+# Find the latest device log for device 2
+ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1
+
+# Extract orchestrator profiling (Thread 3)
+grep "Thread 3:" <logfile>
+
+# Extract scheduler profiling (Threads 0/1/2)
+grep -E "Thread [012]:" <logfile>
+```
diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md b/src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md
new file mode 100644
index 000000000..bd669f365
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md
@@ -0,0 +1,480 @@
+# PTO Runtime2 Profiling Levels
+
+This document describes the profiling macro hierarchy and logging control in the PTO Runtime2 system.
+
+## Overview
+
+PTO Runtime2 uses a hierarchical profiling system with compile-time macros to control profiling code compilation and log output. The `enable_l2_swimlane` runtime flag (integer perf_level 0–4) controls data collection granularity (performance buffers, shared memory writes) but does NOT control log output.
+
+## Profiling Macro Hierarchy
+
+Defaults and dependency validation are centralized in
+`src/common/task_interface/profiling_config.h`. Runtime headers include that
+file before using the macros, so both a2a3 and a5 share the same default
+values and compile-time checks.
+
+```text
+PTO2_PROFILING (base level, default=1)
+├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1)
+|   └──PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_ORCH_PROFILING=1)
+├── PTO2_SCHED_PROFILING (scheduler, default=0, requires PTO2_PROFILING=1)
+└── --enable-l2-swimlane [PERF_LEVEL] (L2 swimlane data collection, 0-4, bare=4, requires PTO2_PROFILING=1)
+
+```
+
+### Compile-Time Validation
+
+Each sub-level macro requires `PTO2_PROFILING=1`:
+
+```cpp
+#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
+#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
+#endif
+
+#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
+#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
+#endif
+
+#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING
+#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1"
+#endif
+```
+
+## Profiling Levels
+
+### Level 0: No Profiling (PTO2_PROFILING=0)
+
+**What's compiled:**
+
+- Debug/diagnostic logs (always present)
+- Progress tracking (`PTO2 progress: completed=...`)
+- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget)
+- Deadlock/livelock detection (`diagnose_stuck_state`, called on stall)
+
+**What's NOT compiled:**
+
+- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters)
+- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING`
+- Performance data collection paths (`enable_l2_swimlane` runtime flag becomes ineffective because profiling code is not compiled)
+
+**Log output (normal run, no stall):**
+
+- No `sched_start/sched_end/sched_cost` timestamps
+- No `orch_start/orch_end/orch_cost` timestamps
+- No `Scheduler summary: total_time=...`
+- No `PTO2 total submitted tasks` log
+- `PTO2 progress: completed=... total=...` may appear (thread 0 only, at task completion milestones)
+
+---
+
+### Level 1: Basic Profiling (PTO2_PROFILING=1)
+
+**What's compiled:**
+
+- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
+- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
+- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
+- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
+- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
+- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
+
+**What's NOT compiled:**
+
+- Detailed phase breakdowns
+- TensorMap statistics
+
+**Log output (additional lines vs Level 0, per normal run):**
+
+- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
+- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
+- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
+- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
+- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
+- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
+
+**LOG_INFO_V9 count (normal run):**
+
+- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
+- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
+
+> See the table at the end for concrete counts based on the `paged_attention` example.
+
+**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
+
+```text
+Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
+Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us
+PTO2 total submitted tasks = 13, already executed 13 tasks
+Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us
+Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6
+Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us
+Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
+```
+
+**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):
+
+```text
+Thread 3: orch_stage_end=48236915058307
+Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
+Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
+PTO2 total submitted tasks = 13, already executed 13 tasks
+Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
+Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
+Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
+Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
+```
+
+> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).
+
+**Note:**
+
+- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
+- `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
+- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.
+
+---
+
+### Level 2: Scheduler Detailed Profiling (PTO2_SCHED_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 1 features
+- Detailed scheduler phase counters
+- Phase-specific statistics (complete, scan, dispatch, idle)
+- Hit rate tracking (complete poll, ready queue pop)
+
+**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced)
+
+- Replaces scheduler summary with detailed breakdown
+
+**Scheduler output:**
+
+```text
+Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks ===
+Thread X:   complete       : XXXus (XX.X%)
+Thread X:     poll         : XXXus (XX.X%)  hit=XXX, miss=XXX, hit_rate=XX.X%
+Thread X:     otc_lock     : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     otc_fanout   : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     otc_fanin    : XXXus (XX.X%)  atomics=XXX
+Thread X:     otc_self     : XXXus (XX.X%)  atomics=XXX
+Thread X:     perf         : XXXus (XX.X%)
+Thread X:   dispatch       : XXXus (XX.X%)
+Thread X:     poll         : XXXus (XX.X%)
+Thread X:     pop          : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     setup        : XXXus (XX.X%)
+Thread X:   scan           : XXXus (XX.X%)
+Thread X:   idle           : XXXus (XX.X%)
+Thread X:   avg/complete   : XXXus
+Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
+```
+
+Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
+stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json`
+captured at l2_swimlane_level >= 3) and `deps.json`; consume them via
+`simpler_setup/tools/sched_overhead_analysis.py`.
+
+---
+
+### Level 3: Orchestrator Detailed Profiling (PTO2_ORCH_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 1 features
+- Detailed orchestrator phase counters
+- Per-phase cycle tracking
+- Atomic operation counters
+- Wait time tracking
+
+**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced)
+
+- Replaces basic orchestration completion with detailed breakdown
+
+**Orchestrator output:**
+
+```text
+Thread X: === Orchestrator Profiling: XXX tasks, total=XXXus ===
+Thread X:   sync_tensormap : XXXus (XX.X%)
+Thread X:   task_ring_alloc: XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   param_copy     : XXXus (XX.X%)  atomics=XXX
+Thread X:   lookup+dep     : XXXus (XX.X%)
+Thread X:   heap_alloc     : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   tensormap_ins  : XXXus (XX.X%)
+Thread X:   fanin+ready    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   finalize+SM    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   scope_end      : XXXus  atomics=XXX
+Thread X:   avg/task       : XXXus
+```
+
+**Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag.
+
+---
+
+### Level 4: TensorMap Profiling (PTO2_TENSORMAP_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1` AND `PTO2_ORCH_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 3 features
+- TensorMap lookup statistics
+- Hash chain walk tracking
+- Overlap check counters
+
+**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap)
+
+**TensorMap output:**
+
+```text
+Thread X: === TensorMap Lookup Stats ===
+Thread X:   lookups        : XXX, inserts: XXX
+Thread X:   chain walked   : total=XXX, avg=X.X, max=X
+Thread X:   overlap checks : XXX, hits=XXX (XX.X%)
+```
+
+---
+
+## Runtime Flag: enable_l2_swimlane (perf_level)
+
+`--enable-l2-swimlane` accepts an integer perf_level (0–4). Transport
+mirrors the PMU pattern — two independent channels (one binary, one int):
+
+- **Binary on/off** — `KernelArgs::enable_profiling_flag` bit1
+  (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read
+  by AICore (which only needs on/off to decide whether to write timing) and
+  by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`.
+- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level`
+  (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU
+  promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via
+  `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for
+  `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
+
+On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled`
+entry point; the granular level still goes through the shared-memory
+header just like on onboard.
+
+| Level | Collects |
+| ----- | -------- |
+| 0 | Nothing (disabled) |
+| 1 | AICore timing only (start/end/task_token_raw) — AICPU `complete_task` is bypassed |
+| 2 | + AICPU dispatch_time, finish_time |
+| 3 | + Scheduler phases (`SCHED_*`) |
+| 4 | + Orchestrator phases (full) |
+
+At level 1 the AICore record carries the full PTO2 `task_token_raw`
+(`(ring_id << 32) | local_id`), read straight from
+`LocalContext.async_ctx.task_token.raw` inside the AICore helper —
+already in cache from the dispatch payload, so no extra GM load.
+Identity fields the AICPU side used to write at level 1 (`func_id`,
+`core_type`) are derived host-side:
+
+- `func_id` ← `deps.json`'s per-task `kernel_ids[]`, joined by
+  `task_id` at post-process by `swimlane_converter.py`. Same model
+  `fanout` already uses.
+- `core_type` ← per-core static table published by the host into the
+  collector (`L2SwimlaneCollector::set_core_types`).
+
+AICore buffer rotation no longer piggy-backs on `complete_task`. AICPU
+counts dispatches per core in the dispatch path (scheduler_dispatch in
+tensormap_and_ringbuffer; aicpu_executor in host_build_graph) and rotates
+the AICore buffer when the count is about to cross a
+`PLATFORM_AICORE_BUFFER_SIZE` boundary — strictly before
+`write_reg(DATA_MAIN_BASE)` for the first task of the new batch. The
+hook is `l2_swimlane_aicpu_on_aicore_dispatch`. No AICore-side signal is
+needed: AICPU has full dispatch visibility on its own. Race safety comes
+from the completion-before-dispatch invariant (AICore per core is
+single-threaded and AICPU does not dispatch task K+1 until K FIN'd), which
+guarantees AICore has FIN'd — and `dcci`'d out — every record in the old
+buffer by rotation time. This decoupling is what lets level 1 skip
+`complete_task` without losing rotations.
+
+Fanout edges are no longer carried on the device hot path — `swimlane_converter.py`
+joins them from the sibling `deps.json` (produced by dep_gen) at post-process time.
+
+Bare `--enable-l2-swimlane` = level 4 (backward compatible).
+
+### Level gating in AICPU code
+
+Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the
+content it depends on instead of relying on magic numbers:
+
+```cpp
+// Any level > 0: AICPU task record buffer init / flush.
+// Cheap binary check, available immediately after kernel entry.
+if (is_l2_swimlane_enabled()) { ... }
+
+// AICPU dispatch/finish timestamps.
+// Granular checks below require l2_swimlane_aicpu_init to have already run
+// (so the level has been promoted from the shared-memory header).
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... }
+
+// Scheduler main-loop phase records (SCHED_*)
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... }
+
+// Orchestrator phase records
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... }
+```
+
+`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with
+underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level`
+shared-memory field and mirrors `PmuEventType : uint32_t`):
+
+| Enumerator | Underlying value |
+| ---------- | ---------------- |
+| `DISABLED` | 0 |
+| `AICORE_TIMING` | 1 |
+| `AICPU_TIMING` | 2 |
+| `SCHED_PHASES` | 3 |
+| `ORCH_PHASES` | 4 |
+
+### When enable_l2_swimlane=0
+
+- No performance data collection
+- No shared memory writes
+- Logs still print (controlled by macros only)
+
+---
+
+## Common Profiling Configurations
+
+### Development (minimal overhead)
+
+```bash
+# No profiling overhead
+PTO2_PROFILING=0
+```
+
+### Basic Performance Monitoring
+
+```bash
+# Minimal overhead, summary logs only
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=0
+PTO2_SCHED_PROFILING=0
+```
+
+### Scheduler Performance Analysis
+
+```bash
+# Detailed scheduler breakdown
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=0
+PTO2_SCHED_PROFILING=1
+```
+
+### Orchestrator Performance Analysis
+
+```bash
+# Detailed orchestrator breakdown
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=1
+PTO2_SCHED_PROFILING=0
+```
+
+### Full Profiling (maximum overhead)
+
+```bash
+# All profiling features enabled
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=1
+PTO2_SCHED_PROFILING=1
+PTO2_TENSORMAP_PROFILING=1
+```
+
+---
+
+## Setting Profiling Macros
+
+### At compile time
+
+Pass compile definitions through the build command or CI `CXXFLAGS`.
+This overrides the defaults in `profiling_config.h` without changing source.
+
+```bash
+# Example: disable all profiling code
+CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e .
+
+# Example: enable orchestrator and tensormap profiling
+CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \
+    pip install --no-build-isolation -e .
+```
+
+### In source code (before including headers)
+
+Source-level overrides are only for local experiments. They must appear before
+any header includes `profiling_config.h`; do not add duplicated fallback
+definitions to runtime headers.
+
+```cpp
+#define PTO2_PROFILING 1
+#define PTO2_ORCH_PROFILING 1
+#include "pto_runtime2_types.h"
+```
+
+---
+
+## Log Output Summary
+
+> Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).
+
+| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
+| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
+| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
+| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
+| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
+| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
+
+---
+
+## Implementation Notes
+
+### Key Principles
+
+1. **Macros control compilation and logging**
+   - `#if PTO2_PROFILING` controls whether profiling code is compiled
+   - Logs print when macro is enabled, regardless of runtime flag
+
+2. **Runtime flag controls data collection**
+   - `enable_l2_swimlane` controls performance buffer allocation
+   - Controls shared memory writes for host-side export
+   - Does NOT control log output
+
+3. **Consistent behavior across components**
+   - Scheduler logs: macro-controlled only
+   - Orchestrator logs: macro-controlled only
+   - Data collection: runtime flag controlled
+
+### Code Locations
+
+- Macro defaults and validation: `src/common/task_interface/profiling_config.h`
+- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
+- Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
+- TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`
+
+---
+
+## Performance Impact
+
+### Compilation overhead
+
+- Level 0: No overhead
+- Level 1: Minimal (counter increments, basic arithmetic)
+- Level 2-4: Low to moderate (additional counters, cycle measurements)
+
+### Runtime overhead
+
+- Logging: Negligible (device logs are asynchronous)
+- Data collection (`enable_l2_swimlane>0`): Low to moderate
+  - Performance buffer writes
+  - Shared memory updates
+  - Per-task timing measurements
+
+### Recommendation
+
+- Use Level 0 for production
+- Use Level 1-2 for performance monitoring
+- Use Level 3-4 for detailed performance analysis only
diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp
new file mode 100644
index 000000000..55565e885
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file dep_gen_replay.cpp
+ * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor
+ *        representation, tensor-annotated) via a host-resident PTO2TensorMap,
+ *        with a differential check against the runtime template `compute_task_fanin`.
+ *
+ * Two passes run per record against two parallel PTO2TensorMap instances that
+ * evolve in lockstep:
+ *
+ *   ORACLE pass (read-only contract):
+ *     Drives `compute_task_fanin` (the same template the device orchestrator
+ *     uses in pto_orchestrator.cpp:submit_task) against `tm_oracle`. Emits
+ *     only PTO2TaskId values — the canonical set of producer IDs the runtime
+ *     would have wired. We never widen this template's emit signature: this
+ *     pass IS the contract, and any future change to `compute_task_fanin`
+ *     automatically refreshes the oracle.
+ *
+ *   ANNOT pass (this file's feature):
+ *     Inlines the same STEP A (creator retention) + STEP B (tensormap lookup)
+ *     against `tm_annot`, but the callback fires with the full
+ *     `PTO2TensorMapEntry&` + the consumer Tensor* + the arg index, so the
+ *     replay can record per-edge tensor metadata (producer/consumer
+ *     shape/offset, dtype, version).
+ *
+ * After both passes finish per record, we compare the producer-ID set the
+ * oracle emitted to the producer-ID set the annot pass emitted. They MUST
+ * match. If they diverge, deps.json is not written and the function returns
+ * non-zero — this is the "no shotgun modifications" guarantee: anyone who
+ * changes `compute_task_fanin` will trip this gate immediately and know to
+ * mirror the change in the annot pass.
+ *
+ * STEP 1 (explicit_deps) is emitted at the call site (per pto_dep_compute.h's
+ * "kept at call site" note); both passes run the same explicit-deps loop, so
+ * the comparison covers it too.
+ *
+ * STEP 4 (`register_task_outputs`) runs on BOTH tensor maps after both passes
+ * complete, keeping `tm_oracle` and `tm_annot` bit-equivalent for the next
+ * record's INOUT+COVERED `remove_entry` mutations.
+ *
+ * Pool sizing: replay never advances last_task_alive, so each tensor map's
+ * entry pool must accommodate every output write across the whole trace. We
+ * scan the record buffer once to count INOUT + OUTPUT_EXISTING slots and size
+ * the pool accordingly. Both maps get the same size.
+ */
+
+#include "dep_gen_replay.h"
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/dep_gen.h"
+#include "common/unified_log.h"
+#include "data_type.h"
+#include "pto_dep_compute.h"
+#include "pto_task_id.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+int32_t ceil_pow2(int32_t v) {
+    if (v <= 1) return 1;
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return v + 1;
+}
+
+// Count INOUT + OUTPUT_EXISTING slots across the record buffer —
+// register_task_outputs only inserts those, and skips entries with manual_dep
+// set. Counting both without inspecting manual_dep is a conservative upper
+// bound (manual_dep is rare; the small over-allocation pays for itself in
+// avoided pool exhaustion).
+int32_t count_outputs(const DepGenRecord *records, size_t n) {
+    int32_t total = 0;
+    for (size_t i = 0; i < n; i++) {
+        const DepGenRecord &r = records[i];
+        // Overflow chain slots are reinterpret_cast views with no tensor data;
+        // their `tensor_count` bytes are actually the overflow `dep_count` field,
+        // which would mislead the loop below if read as a tensor count.
+        if (r.flags & DEP_GEN_FLAG_OVERFLOW) continue;
+        for (uint16_t j = 0; j < r.tensor_count; j++) {
+            auto t = static_cast<TensorArgType>(r.arg_types[j]);
+            if (t == TensorArgType::INOUT || t == TensorArgType::OUTPUT_EXISTING) {
+                total++;
+            }
+        }
+    }
+    return total;
+}
+
+// ---------------------------------------------------------------------------
+// JSON output accumulators (in-memory tables that get serialized at the end)
+// ---------------------------------------------------------------------------
+
+// Edge categories — matches the three places a runtime fanin edge is born.
+enum class EdgeSource { EXPLICIT, CREATOR, TENSORMAP };
+
+const char *edge_source_str(EdgeSource s) {
+    switch (s) {
+    case EdgeSource::EXPLICIT:
+        return "explicit";
+    case EdgeSource::CREATOR:
+        return "creator";
+    case EdgeSource::TENSORMAP:
+        return "tensormap";
+    }
+    return "unknown";
+}
+
+const char *overlap_status_str(OverlapStatus s) {
+    switch (s) {
+    case OverlapStatus::COVERED:
+        return "covered";
+    case OverlapStatus::OTHER:
+        return "other";
+    case OverlapStatus::NO_OVERLAP:
+        return "no_overlap";
+    }
+    return "unknown";
+}
+
+// One annotated edge. consumer_* always populated. producer_* populated for
+// TENSORMAP source only — the explicit/creator emit paths don't have a
+// matched tensormap entry to copy from.
+//
+// Slice description follows the strided Tensor model: (start_offset, strides[])
+// in element units. Byte offset of element coords[] is
+//   (start_offset + Σ coords[i] · strides[i]) · dtype_bytes
+struct EdgeAnnot {
+    uint64_t pred;
+    uint64_t succ;
+    int32_t consumer_arg_idx;  // -1 for EXPLICIT (not tied to a tensor arg)
+    EdgeSource source;
+    OverlapStatus overlap;  // only meaningful for TENSORMAP
+    uint64_t tensor_id;     // 0 for EXPLICIT
+    // Consumer side (the Tensor the submitting task is reading).
+    uint8_t consumer_dtype;
+    uint32_t consumer_ndims;
+    uint32_t consumer_shape[MAX_TENSOR_DIMS];
+    uint64_t consumer_start_offset;  // 1D element offset
+    uint32_t consumer_strides[MAX_TENSOR_DIMS];
+    // Producer side (the slice the producer wrote, from the tensormap entry).
+    // Only populated when source == TENSORMAP.
+    uint32_t producer_ndims;
+    uint32_t producer_shape[MAX_TENSOR_DIMS];
+    uint64_t producer_start_offset;
+    uint32_t producer_strides[MAX_TENSOR_DIMS];
+};
+
+// One entry in the tensors[] table: the underlying storage, keyed by
+// (buffer_addr, version). buffer_numel is the storage element count;
+// per-edge fields describe the slice (start_offset + stride).
+struct TensorTableEntry {
+    uint64_t tensor_id;
+    uint64_t buffer_addr;
+    uint64_t buffer_numel;  // storage size in elements (= buffer.size / dtype_bytes)
+    int32_t version;
+    uint8_t dtype;
+};
+
+// One arg slot of a task, captured for the `tasks[].args[]` block so
+// downstream viewers can render per-task input / output compartments without
+// having to scan every edge. `has_tensor_info` is false only for OUTPUT slots:
+// the runtime hasn't materialized a Tensor yet at submit_task time, so the
+// captured blob is zeroed.
+struct TaskArgEntry {
+    int32_t idx;
+    TensorArgType arg_type;
+    bool has_tensor_info;
+    uint64_t tensor_id;
+    uint8_t dtype;
+    uint32_t ndims;
+    uint32_t shape[MAX_TENSOR_DIMS];
+    uint64_t start_offset;  // 1D element offset
+    uint32_t strides[MAX_TENSOR_DIMS];
+};
+
+struct TaskTableEntry {
+    uint64_t task_id;
+    bool in_manual_scope;
+    int32_t kernel_id[3];  // per-subslot {AIC, AIV0, AIV1}, -1 = inactive
+    std::vector<TaskArgEntry> args;
+};
+
+const char *arg_type_str(TensorArgType t) {
+    switch (t) {
+    case TensorArgType::INPUT:
+        return "INPUT";
+    case TensorArgType::OUTPUT:
+        return "OUTPUT";
+    case TensorArgType::INOUT:
+        return "INOUT";
+    case TensorArgType::OUTPUT_EXISTING:
+        return "OUTPUT_EXISTING";
+    }
+    return "UNKNOWN";
+}
+
+// FNV-1a 64-bit hash of (buffer_addr, version) — stable tensor identity
+// across runs (no time-dependent inputs).
+uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) {
+    constexpr uint64_t FNV_OFFSET = 0xcbf29ce484222325ULL;
+    constexpr uint64_t FNV_PRIME = 0x100000001b3ULL;
+    uint64_t h = FNV_OFFSET;
+    const uint8_t *p;
+    p = reinterpret_cast<const uint8_t *>(&buffer_addr);
+    for (size_t i = 0; i < sizeof(buffer_addr); i++) {
+        h ^= p[i];
+        h *= FNV_PRIME;
+    }
+    uint32_t v = static_cast<uint32_t>(version);
+    p = reinterpret_cast<const uint8_t *>(&v);
+    for (size_t i = 0; i < sizeof(v); i++) {
+        h ^= p[i];
+        h *= FNV_PRIME;
+    }
+    return h;
+}
+
+// Register a tensor in the tensors[] table on first sight of (addr,
+// version). buffer_numel describes the underlying storage size in elements;
+// per-edge fields describe the slice via (start_offset, strides[]). Subsequent
+// sightings of the same (addr, version) are no-ops.
+uint64_t register_tensor(
+    std::unordered_map<uint64_t, size_t> &index_by_id, std::vector<TensorTableEntry> &table, const Tensor &t
+) {
+    uint64_t id = make_tensor_id(t.buffer.addr, t.version);
+    auto it = index_by_id.find(id);
+    if (it != index_by_id.end()) {
+        return id;
+    }
+    TensorTableEntry e;
+    e.tensor_id = id;
+    e.buffer_addr = t.buffer.addr;
+    e.version = t.version;
+    e.dtype = static_cast<uint8_t>(t.dtype);
+    const uint64_t elem_size = get_element_size(t.dtype);
+    e.buffer_numel = (elem_size == 0) ? 0 : (t.buffer.size / elem_size);
+    index_by_id[id] = table.size();
+    table.push_back(e);
+    return id;
+}
+
+// Copy a Tensor's slice description (shape + start_offset + stride) into an
+// EdgeAnnot's consumer_* fields.
+void fill_consumer(EdgeAnnot &e, const Tensor &t) {
+    e.consumer_dtype = static_cast<uint8_t>(t.dtype);
+    e.consumer_ndims = t.ndims;
+    e.consumer_start_offset = t.start_offset;
+    for (uint32_t i = 0; i < t.ndims && i < MAX_TENSOR_DIMS; i++) {
+        e.consumer_shape[i] = t.shapes[i];
+        e.consumer_strides[i] = t.strides[i];
+    }
+}
+
+// Copy a PTO2TensorMapEntry's slice description into an EdgeAnnot's producer_*
+// fields. Only called from the TENSORMAP emit path.
+void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) {
+    e.producer_ndims = entry.ndims;
+    e.producer_start_offset = entry.start_offset;
+    for (uint32_t i = 0; i < entry.ndims && i < MAX_TENSOR_DIMS; i++) {
+        e.producer_shape[i] = entry.shapes[i];
+        e.producer_strides[i] = entry.strides[i];
+    }
+}
+
+// ---------------------------------------------------------------------------
+// JSON writer
+// ---------------------------------------------------------------------------
+
+void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) {
+    out << '[';
+    for (uint32_t i = 0; i < n; i++) {
+        if (i > 0) out << ',';
+        out << data[i];
+    }
+    out << ']';
+}
+
+bool write_deps_json(
+    const char *path, const std::vector<TaskTableEntry> &tasks, const std::vector<TensorTableEntry> &tensors,
+    const std::vector<EdgeAnnot> &edges
+) {
+    std::ofstream out(path, std::ios::out | std::ios::trunc);
+    if (!out) {
+        LOG_ERROR("dep_gen replay: failed to open '%s' for write", path);
+        return false;
+    }
+    // Strided tensor representation. tensors[].buffer_numel is the underlying
+    // storage element count; tasks[].args[] and edges[] carry per-slice
+    // geometry as (start_offset uint64, strides[] uint32 — runtime invariant
+    // forbids zero / negative strides, see runtime/tensor.h).
+    out << "{\"tasks\":[";
+    for (size_t i = 0; i < tasks.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &t = tasks[i];
+        // uint64 fields are quoted as strings — task_id/tensor_id/buffer_addr/
+        // pred/succ can exceed Number.MAX_SAFE_INTEGER (2^53-1), silently
+        // losing precision in JS-based JSON parsers. Python consumers already
+        // pass these through int(...) and don't care which form they receive.
+        out << "{\"task_id\":\"" << t.task_id << '"';
+        out << ",\"scope\":\"" << (t.in_manual_scope ? "manual" : "auto") << '"';
+        // Per-subslot kernel ids {AIC, AIV0, AIV1}; INVALID_KERNEL_ID = -1 for
+        // inactive subslots. Emitted as a plain int triple — downstream viewers
+        // (and the swimlane host post-processor) use it to resolve task_id →
+        // kernel without the AICore record carrying the field itself.
+        out << ",\"kernel_ids\":[" << t.kernel_id[0] << ',' << t.kernel_id[1] << ',' << t.kernel_id[2] << ']';
+        out << ",\"args\":[";
+        for (size_t a = 0; a < t.args.size(); a++) {
+            if (a > 0) out << ',';
+            const auto &arg = t.args[a];
+            out << "{\"idx\":" << arg.idx;
+            out << ",\"type\":\"" << arg_type_str(arg.arg_type) << '"';
+            if (arg.has_tensor_info) {
+                out << ",\"tensor_id\":\"" << arg.tensor_id << '"';
+                out << ",\"dtype\":\"" << get_dtype_name(static_cast<DataType>(arg.dtype)) << '"';
+                out << ",\"shape\":";
+                write_uint_array(out, arg.shape, arg.ndims);
+                out << ",\"start_offset\":\"" << arg.start_offset << '"';
+                out << ",\"strides\":";
+                write_uint_array(out, arg.strides, arg.ndims);
+            }
+            out << '}';
+        }
+        out << "]}";
+    }
+    out << ']';
+
+    out << ",\"tensors\":[";
+    for (size_t i = 0; i < tensors.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &t = tensors[i];
+        out << "{\"tensor_id\":\"" << t.tensor_id << '"';
+        out << ",\"buffer_addr\":\"" << t.buffer_addr << '"';
+        out << ",\"version\":" << t.version;
+        out << ",\"dtype\":\"" << get_dtype_name(static_cast<DataType>(t.dtype)) << '"';
+        out << ",\"buffer_numel\":\"" << t.buffer_numel << '"';
+        out << '}';
+    }
+    out << ']';
+
+    out << ",\"edges\":[";
+    for (size_t i = 0; i < edges.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &e = edges[i];
+        out << "{\"pred\":\"" << e.pred << "\",\"succ\":\"" << e.succ << '"';
+        out << ",\"arg\":" << e.consumer_arg_idx;
+        out << ",\"source\":\"" << edge_source_str(e.source) << '"';
+        if (e.source == EdgeSource::TENSORMAP) {
+            out << ",\"overlap\":\"" << overlap_status_str(e.overlap) << '"';
+        }
+        if (e.source != EdgeSource::EXPLICIT) {
+            out << ",\"tensor_id\":\"" << e.tensor_id << '"';
+            out << ",\"consumer_dtype\":\"" << get_dtype_name(static_cast<DataType>(e.consumer_dtype)) << '"';
+            out << ",\"consumer_shape\":";
+            write_uint_array(out, e.consumer_shape, e.consumer_ndims);
+            out << ",\"consumer_start_offset\":\"" << e.consumer_start_offset << '"';
+            out << ",\"consumer_strides\":";
+            write_uint_array(out, e.consumer_strides, e.consumer_ndims);
+        }
+        if (e.source == EdgeSource::TENSORMAP) {
+            out << ",\"producer_shape\":";
+            write_uint_array(out, e.producer_shape, e.producer_ndims);
+            out << ",\"producer_start_offset\":\"" << e.producer_start_offset << '"';
+            out << ",\"producer_strides\":";
+            write_uint_array(out, e.producer_strides, e.producer_ndims);
+        }
+        out << '}';
+    }
+    out << "]}\n";
+    return static_cast<bool>(out);
+}
+
+// ---------------------------------------------------------------------------
+// Annot pass — mirrors compute_task_fanin step-by-step against tm_annot.
+// Must stay bit-equivalent to pto_dep_compute.h::compute_task_fanin in terms
+// of which producer IDs are emitted (the differential check enforces this).
+// ---------------------------------------------------------------------------
+
+template <typename EmitTM, typename EmitCreator>
+void annot_pass(
+    const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, EmitCreator emit_creator,
+    EmitTM emit_tensormap
+) {
+    if (in_manual_scope) {
+        return;
+    }
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::OUTPUT) {
+            continue;
+        }
+        const Tensor *tensor = &inputs.tensors[i].ref();
+
+        // STEP A: creator retention.
+        PTO2TaskId owner = tensor->owner_task_id;
+        if (owner.is_valid()) {
+            emit_creator(owner, i, *tensor);
+        }
+
+        // STEP B: tensormap lookup (only INPUT/INOUT, skip manual_dep).
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
+            continue;
+        }
+        if (tensor->manual_dep) {
+            continue;
+        }
+
+        tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
+            emit_tensormap(entry.producer_task_id, i, *tensor, entry, overlap_status);
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
+                tensor_map.remove_entry(entry);
+            }
+            return true;
+        });
+    }
+}
+
+}  // namespace
+
+extern "C" int
+dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, const char *deps_json_path) {
+    if (deps_json_path == nullptr) {
+        LOG_ERROR("dep_gen replay: null deps_json_path");
+        return -1;
+    }
+    if (num_records > 0 && records == nullptr) {
+        LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records);
+        return -1;
+    }
+    LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records);
+
+    // Per-ring task window sizes — tensormap masks slot indices and requires
+    // each to be a power of two. Auto-size from the records themselves so each
+    // ring's window comfortably covers its observed max local_id (no slot
+    // aliasing during INOUT+COVERED remove_from_task). Same sizes feed both
+    // maps so they stay in lockstep.
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint32_t max_local[PTO2_MAX_RING_DEPTH] = {0};
+    for (size_t i = 0; i < num_records; i++) {
+        PTO2TaskId tid{records[i].task_id};
+        uint8_t ring = tid.ring();
+        uint32_t local = tid.local();
+        if (ring < PTO2_MAX_RING_DEPTH && local > max_local[ring]) {
+            max_local[ring] = local;
+        }
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t need = static_cast<int32_t>(max_local[r] + 1);
+        task_window_sizes[r] = ceil_pow2(need < 16 ? 16 : need);
+    }
+
+    int32_t output_count = count_outputs(records, num_records);
+    int32_t pool_size = output_count + (output_count / 10) + 64;
+    if (pool_size < PTO2_TENSORMAP_POOL_SIZE) {
+        pool_size = PTO2_TENSORMAP_POOL_SIZE;
+    }
+
+    PTO2TensorMap tm_oracle;
+    PTO2TensorMap tm_annot;
+    std::memset(&tm_oracle, 0, sizeof(tm_oracle));
+    std::memset(&tm_annot, 0, sizeof(tm_annot));
+
+    // Libc-backed arena (default ctor) that owns both replay tensormaps'
+    // storage. Released by the arena destructor when this function returns.
+    DeviceArena replay_arena;
+
+    auto oracle_layout =
+        PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
+    auto annot_layout =
+        PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
+    if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) ||
+        !tm_annot.init_data_from_layout(annot_layout, replay_arena)) {
+        LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size);
+        return -3;
+    }
+    // Replay tensormaps live entirely on host; only arena-internal pointer
+    // fields need wiring (no parent-orch back-reference exists anymore).
+    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena);
+    tm_annot.wire_arena_pointers(annot_layout, replay_arena);
+
+    // JSON output accumulators.
+    std::vector<TaskTableEntry> task_table;
+    std::vector<TensorTableEntry> tensor_table;
+    std::unordered_map<uint64_t, size_t> tensor_index;  // tensor_id → table idx
+    std::vector<EdgeAnnot> annot_edges;
+    annot_edges.reserve(num_records * 2);
+
+    TensorRef tref_buf[CORE_MAX_TENSOR_ARGS];
+    TensorArgType atype_buf[CORE_MAX_TENSOR_ARGS];
+
+    // Per-record dedup of producer IDs — must match runtime's
+    // PTO2FaninBuilder::append_fanin_or_fail semantics, which collapses STEP 1
+    // (explicit_deps) + STEP A (creator retention) + STEP B (tensormap lookup)
+    // into a single per-task fanin list. Both oracle and annot use this same
+    // semantics so the divergence check is meaningful.
+    std::unordered_set<uint64_t> oracle_preds;
+    std::unordered_set<uint64_t> annot_preds;
+
+    // Scratch buffer for assembling full dep lists across overflow chains.
+    // Declared outside the loop so it can be reused (clear() keeps capacity).
+    std::vector<uint64_t> full_deps_buf;
+
+    for (size_t rec_i = 0; rec_i < num_records; rec_i++) {
+        const DepGenRecord &rec = records[rec_i];
+
+        // Overflow chain records are consumed by the preceding base; skip
+        // them in the main scan so we don't double-process or read the
+        // overflow's reinterpreted bytes as tensor/dep info.
+        if (rec.flags & DEP_GEN_FLAG_OVERFLOW) continue;
+
+        PTO2TaskId task_id{rec.task_id};
+        bool in_manual_scope = (rec.flags & DEP_GEN_FLAG_IN_MANUAL_SCOPE) != 0;
+
+        oracle_preds.clear();
+        annot_preds.clear();
+
+        int32_t tc = static_cast<int32_t>(rec.tensor_count);
+        if (tc > CORE_MAX_TENSOR_ARGS) {
+            tc = CORE_MAX_TENSOR_ARGS;
+        }
+        for (int32_t i = 0; i < tc; i++) {
+            tref_buf[i] = reinterpret_cast<const Tensor *>(&rec.tensors[i][0]);
+            atype_buf[i] = static_cast<TensorArgType>(rec.arg_types[i]);
+        }
+
+        // Assemble the full dep list. Fast path: ≤ DEP_GEN_MAX_EXPLICIT_DEPS,
+        // no chain, point straight at rec.explicit_deps. Slow path: gather
+        // base + chain into full_deps_buf and point at the buffer.
+        //
+        // `explicit_dep_count` / `over->dep_count` originate from device
+        // shared memory and are bounded by the writer to the array sizes, but
+        // we clamp on read too so a corrupted record never drives an OOB read
+        // off the end of rec.explicit_deps[64] / over->deps[582].
+        const uint64_t *deps_data;
+        int32_t dc;
+        if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) {
+            full_deps_buf.clear();
+            uint16_t base_dc = rec.explicit_dep_count;
+            if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) {
+                LOG_ERROR(
+                    "dep_gen replay: clamping base explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                    base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id
+                );
+                base_dc = DEP_GEN_MAX_EXPLICIT_DEPS;
+            }
+            full_deps_buf.reserve(static_cast<size_t>(base_dc) + DEP_GEN_OVERFLOW_DEPS_PER_RECORD);
+            full_deps_buf.insert(full_deps_buf.end(), rec.explicit_deps, rec.explicit_deps + base_dc);
+            bool chain_complete = false;
+            for (size_t j = rec_i + 1; j < num_records; j++) {
+                const DepGenRecord &maybe = records[j];
+                if (!(maybe.flags & DEP_GEN_FLAG_OVERFLOW)) {
+                    LOG_ERROR(
+                        "dep_gen replay: unterminated overflow chain at rec_idx=%zu (task_id=%" PRIu64 ")", rec_i,
+                        rec.task_id
+                    );
+                    break;
+                }
+                if (maybe.task_id != rec.task_id) {
+                    LOG_ERROR(
+                        "dep_gen replay: orphan overflow at rec_idx=%zu (expected task_id=%" PRIu64 ", found %" PRIu64
+                        ")",
+                        j, rec.task_id, maybe.task_id
+                    );
+                    break;
+                }
+                const auto *over = reinterpret_cast<const DepGenOverflowRecord *>(&maybe);
+                uint16_t over_dc = over->dep_count;
+                if (over_dc > DEP_GEN_OVERFLOW_DEPS_PER_RECORD) {
+                    LOG_ERROR(
+                        "dep_gen replay: clamping overflow dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                        over_dc, DEP_GEN_OVERFLOW_DEPS_PER_RECORD, j, rec.task_id
+                    );
+                    over_dc = DEP_GEN_OVERFLOW_DEPS_PER_RECORD;
+                }
+                full_deps_buf.insert(full_deps_buf.end(), over->deps, over->deps + over_dc);
+                if (over->flags & DEP_GEN_FLAG_LAST_OVERFLOW) {
+                    chain_complete = true;
+                    break;
+                }
+            }
+            if (!chain_complete) {
+                LOG_ERROR(
+                    "dep_gen replay: chain for task_id=%" PRIu64 " missing LAST_OVERFLOW marker — "
+                    "using partial dep list (%zu deps)",
+                    rec.task_id, full_deps_buf.size()
+                );
+            }
+            deps_data = full_deps_buf.data();
+            dc = static_cast<int32_t>(full_deps_buf.size());
+        } else {
+            deps_data = rec.explicit_deps;
+            uint16_t base_dc = rec.explicit_dep_count;
+            if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) {
+                LOG_ERROR(
+                    "dep_gen replay: clamping no-chain explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                    base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id
+                );
+                base_dc = DEP_GEN_MAX_EXPLICIT_DEPS;
+            }
+            dc = static_cast<int32_t>(base_dc);
+        }
+
+        DepInputs inputs;
+        inputs.tensor_count = tc;
+        inputs.tensors = tref_buf;
+        inputs.arg_types = atype_buf;
+        inputs.explicit_dep_count = dc;
+        inputs.explicit_deps = reinterpret_cast<const PTO2TaskId *>(deps_data);
+
+        // Register tasks[] entry (with per-arg slot info) and any unseen
+        // tensors[] entries up-front. Tensors are registered from the
+        // consumer-side blob so raw_shapes / dtype are populated (the
+        // producer-side PTO2TensorMapEntry drops raw_shapes to fit in two
+        // cache lines).
+        TaskTableEntry task_entry;
+        task_entry.task_id = rec.task_id;
+        task_entry.in_manual_scope = in_manual_scope;
+        task_entry.kernel_id[0] = rec.kernel_id[0];
+        task_entry.kernel_id[1] = rec.kernel_id[1];
+        task_entry.kernel_id[2] = rec.kernel_id[2];
+        task_entry.args.reserve(tc);
+        for (int32_t i = 0; i < tc; i++) {
+            TaskArgEntry slot{};
+            slot.idx = i;
+            slot.arg_type = atype_buf[i];
+            if (atype_buf[i] == TensorArgType::OUTPUT) {
+                // OUTPUT blob is zero at submit time (writer has no Tensor
+                // yet); leave has_tensor_info=false. Viewers render this as
+                // a placeholder "alloc" output slot.
+                slot.has_tensor_info = false;
+            } else {
+                const Tensor &t = tref_buf[i].ref();
+                register_tensor(tensor_index, tensor_table, t);
+                slot.has_tensor_info = true;
+                slot.tensor_id = make_tensor_id(t.buffer.addr, t.version);
+                slot.dtype = static_cast<uint8_t>(t.dtype);
+                slot.ndims = t.ndims;
+                slot.start_offset = t.start_offset;
+                for (uint32_t d = 0; d < t.ndims && d < MAX_TENSOR_DIMS; d++) {
+                    slot.shape[d] = t.shapes[d];
+                    slot.strides[d] = t.strides[d];
+                }
+            }
+            task_entry.args.push_back(slot);
+        }
+        task_table.push_back(std::move(task_entry));
+
+        // ============ STEP 1 — explicit_deps (call-site emit) ============
+        // Same loop on both passes; they MUST produce identical sets here
+        // because they read the same record. Annot records explicit edges
+        // with consumer_arg_idx = -1 (not tied to any tensor arg). Reads
+        // from deps_data (base record's explicit_deps[] on fast path, the
+        // gathered base+chain buffer on overflow path).
+        for (int32_t i = 0; i < dc; i++) {
+            uint64_t pred_raw = deps_data[i];
+            if (oracle_preds.insert(pred_raw).second) {
+                // First time this pred is seen at runtime call site.
+            }
+            if (annot_preds.insert(pred_raw).second) {
+                EdgeAnnot e{};
+                e.pred = pred_raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = -1;
+                e.source = EdgeSource::EXPLICIT;
+                annot_edges.push_back(e);
+            }
+        }
+
+        // ============ ORACLE pass — drive compute_task_fanin ============
+        bool ok = compute_task_fanin(inputs, tm_oracle, in_manual_scope, [&](PTO2TaskId producer) -> bool {
+            oracle_preds.insert(producer.raw);
+            return true;
+        });
+        if (!ok) {
+            LOG_ERROR("dep_gen replay: compute_task_fanin returned fatal at task_id=%" PRIu64, rec.task_id);
+            tm_oracle.destroy();
+            tm_annot.destroy();
+            return -4;
+        }
+
+        // ============ ANNOT pass — inline mirror, full entry capture ============
+        annot_pass(
+            inputs, tm_annot, in_manual_scope,
+            // emit_creator(producer, arg_idx, consumer_tensor)
+            [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer) {
+                if (!annot_preds.insert(producer.raw).second) {
+                    return;  // already covered by an earlier emit on this record
+                }
+                EdgeAnnot e{};
+                e.pred = producer.raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = arg_idx;
+                e.source = EdgeSource::CREATOR;
+                e.tensor_id = make_tensor_id(consumer.buffer.addr, consumer.version);
+                fill_consumer(e, consumer);
+                annot_edges.push_back(e);
+            },
+            // emit_tensormap(producer, arg_idx, consumer_tensor, entry, status)
+            [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer, const PTO2TensorMapEntry &entry,
+                OverlapStatus status) {
+                // Per-(succ, arg_idx, producer_buffer_addr, producer_version)
+                // dedup gives us "the same producer slice fired twice for the
+                // same consumer arg" collapse — but two distinct slices from
+                // the same producer (different version), or two different
+                // producers, both yield their own edges. The producer-id-set
+                // comparison below uses annot_preds, which dedups by pred
+                // only, matching runtime PTO2FaninBuilder semantics.
+                annot_preds.insert(producer.raw);
+                EdgeAnnot e{};
+                e.pred = producer.raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = arg_idx;
+                e.source = EdgeSource::TENSORMAP;
+                e.overlap = status;
+                e.tensor_id = make_tensor_id(entry.buffer_addr, entry.version);
+                fill_consumer(e, consumer);
+                fill_producer(e, entry);
+                annot_edges.push_back(e);
+            }
+        );
+
+        // ============ Differential check ============
+        if (oracle_preds != annot_preds) {
+            LOG_ERROR(
+                "dep_gen replay: DIVERGENCE at task_id=%" PRIu64 " (rec_idx=%zu): oracle has %zu preds, annot has %zu",
+                rec.task_id, rec_i, oracle_preds.size(), annot_preds.size()
+            );
+            // Log the symmetric difference for debugging.
+            for (uint64_t p : oracle_preds) {
+                if (annot_preds.find(p) == annot_preds.end()) {
+                    LOG_ERROR("  only-in-oracle pred: %" PRIu64, p);
+                }
+            }
+            for (uint64_t p : annot_preds) {
+                if (oracle_preds.find(p) == oracle_preds.end()) {
+                    LOG_ERROR("  only-in-annot  pred: %" PRIu64, p);
+                }
+            }
+            tm_oracle.destroy();
+            tm_annot.destroy();
+            return -6;
+        }
+
+        // ============ STEP 4 — publish outputs on BOTH maps ============
+        register_task_outputs(inputs, task_id, tm_oracle, in_manual_scope);
+        register_task_outputs(inputs, task_id, tm_annot, in_manual_scope);
+    }
+
+    tm_oracle.destroy();
+    tm_annot.destroy();
+
+    if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) {
+        return -5;
+    }
+    LOG_INFO_V0(
+        "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(),
+        tensor_table.size(), annot_edges.size()
+    );
+    return 0;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h
new file mode 100644
index 000000000..2ea3d5768
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file dep_gen_replay.h
+ * @brief Host-side replay of in-memory DepGenRecord stream → deps.json.
+ *
+ * Takes the records the host collector drained from the device ring buffer
+ * (``DepGenCollector::records()``) and runs them back through a host-resident
+ * PTO2TensorMap using the same ``compute_task_fanin`` / ``register_task_outputs``
+ * primitives the device orchestrator uses, emitting the full
+ * predecessor → successor edge list to deps.json.
+ *
+ * The records buffer is passed in directly — there is no intermediate
+ * ``submit_trace.bin`` on disk. The host already has the records once the
+ * device run completes, so going through the filesystem would just be
+ * extra I/O and an extra file in the output directory.
+ *
+ * deps.json is the sole source of truth for fanout: the L2 swimlane hot
+ * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task
+ * 1 KB GM store off the scheduler critical path). Replay sees every
+ * submit and reconstructs the complete dependency graph.
+ *
+ * Output format (deps.json, strided tensor representation):
+ *
+ *   {"tasks":   [{"task_id":<u64>, "scope":"auto|manual",
+ *                 "args":[{"idx":<i32>, "type":"<arg_type>",
+ *                          "tensor_id":<u64>, "dtype":"...", "shape":[...],
+ *                          "start_offset":<u64>, "strides":[...]}, ...]}, ...],
+ *    "tensors": [{"tensor_id":<u64>, "buffer_addr":<u64>, "version":<i32>,
+ *                 "dtype":"FLOAT32", "buffer_numel":<u64>}, ...],
+ *    "edges":   [{"pred":<u64>, "succ":<u64>, "arg":<i32>,
+ *                 "source":"explicit|creator|tensormap",
+ *                 "overlap":"covered|other" (tensormap only),
+ *                 "tensor_id":<u64> (non-explicit),
+ *                 "consumer_dtype":"...", "consumer_shape":[...],
+ *                 "consumer_start_offset":<u64>, "consumer_strides":[...],
+ *                 "producer_shape":[...] (tensormap),
+ *                 "producer_start_offset":<u64> (tensormap),
+ *                 "producer_strides":[...] (tensormap)},
+ *                ...]}
+ *
+ *   - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``).
+ *   - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``.
+ *   - ``buffer_numel`` is the underlying storage element count; tensor shapes
+ *     are carried per-arg / per-edge alongside ``start_offset`` + ``strides``.
+ *   - Distinct producers / arg indices / sources keep their own edges; per-record
+ *     deduplication of producer ids mirrors the runtime
+ *     ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of
+ *     ``(pred, succ)`` pairs is identical to what the runtime would have
+ *     recorded.
+ *
+ * Self-checking: the replay runs two parallel tensormap instances per record —
+ * an "oracle" map driven by the canonical ``compute_task_fanin`` template, and
+ * an "annotated" map driven by an inlined mirror that captures the per-edge
+ * tensor metadata. If the producer-id set on the two passes ever diverges,
+ * deps.json is NOT written and the function returns a non-zero error code.
+ * This is the guarantee against silent shotgun modifications: anyone who
+ * changes ``compute_task_fanin`` semantics has to mirror the change here too
+ * or the gate fires immediately.
+ *
+ * The replay is single-threaded and pure CPU: no device handle is required.
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Opaque forward decl — the canonical layout lives in common/dep_gen.h, but
+// replay's API only needs to take a pointer + count. Callers who construct
+// the buffer must include common/dep_gen.h themselves.
+struct DepGenRecord;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Replay an in-memory DepGenRecord stream and write deps.json.
+ *
+ * Per-ring task window sizes are auto-derived from the trace itself so each
+ * ring's window covers its observed max local_id without slot aliasing.
+ *
+ * @param records            Pointer to a contiguous DepGenRecord array
+ *                           (typically ``DepGenCollector::records().data()``).
+ * @param num_records        Number of records in the array.
+ * @param deps_json_path     Output path; truncated if it exists.
+ * @return 0 on success; negative on error (see source for codes).
+ */
+int dep_gen_replay_emit_deps_json(const struct DepGenRecord *records, size_t num_records, const char *deps_json_path);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp
new file mode 100644
index 000000000..dfc5590c1
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "host/platform_compile_info.h"
+#include "host/runtime_compile_info.h"
+#include <string.h>
+
+extern "C" {
+
+ToolchainType get_incore_compiler(void) {
+    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_CCEC;
+    return TOOLCHAIN_HOST_GXX_15;
+}
+
+ToolchainType get_orchestration_compiler(void) {
+    // tensormap_and_ringbuffer: a2a3 needs aarch64 cross-compile (AICPU is aarch64)
+    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_AARCH64_GXX;
+    return TOOLCHAIN_HOST_GXX;
+}
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp
new file mode 100644
index 000000000..35e8872fe
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp
@@ -0,0 +1,711 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Builder - rt2 Implementation (Device Orchestration)
+ *
+ * Provides init_runtime_impl and validate_runtime_impl functions for rt2 runtime.
+ * Supports device orchestration where AICPU thread 3 runs the orchestrator.
+ *
+ * init_runtime_impl:
+ *   - Converts host tensor pointers to device pointers (all inputs copied H2D;
+ *     only OUTPUT/INOUT tensors are copied back D2H)
+ *   - Copies orchestration SO to device memory
+ *   - Sets up runtime state for device orchestration
+ *
+ * validate_runtime_impl:
+ *   - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs
+ *     are skipped)
+ *   - Frees device memory
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include <cerrno>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <string>
+
+#include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
+#include "../runtime/pto_shared_memory.h"
+#include "../runtime/runtime.h"
+#include "../../../../common/task_interface/call_config.h"
+#include "callable.h"
+#include "common/platform_config.h"
+#include "common/unified_log.h"
+#include "utils/device_arena.h"
+#include "prepare_callable_common.h"
+
+static_assert(
+    RUNTIME_ENV_RING_COUNT == PTO2_MAX_RING_DEPTH, "RuntimeEnv ring count must match PTO2 runtime ring depth"
+);
+
+// Helper: return current time in milliseconds
+static int64_t _now_ms() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<int64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
+}
+
+static bool is_power_of_2_u64(uint64_t value) { return value != 0 && (value & (value - 1)) == 0; }
+
+template <typename T>
+static std::string format_ring_array(const T (&values)[PTO2_MAX_RING_DEPTH]) {
+    std::string out = "[";
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        if (r != 0) {
+            out += ", ";
+        }
+        out += std::to_string(values[r]);
+    }
+    out += "]";
+    return out;
+}
+
+static std::string trim_copy(const std::string &input) {
+    size_t begin = 0;
+    while (begin < input.size() && std::isspace(static_cast<unsigned char>(input[begin]))) {
+        ++begin;
+    }
+    size_t end = input.size();
+    while (end > begin && std::isspace(static_cast<unsigned char>(input[end - 1]))) {
+        --end;
+    }
+    return input.substr(begin, end - begin);
+}
+
+static bool parse_uint_token(
+    const char *name, const std::string &raw, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t *out
+) {
+    std::string token = trim_copy(raw);
+    if (token.empty()) {
+        LOG_WARN("%s has an empty value in '%s', ignored", name, raw.c_str());
+        return false;
+    }
+
+    if (token[0] == '-') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    char *endptr = nullptr;
+    errno = 0;
+    unsigned long long parsed = std::strtoull(token.c_str(), &endptr, 10);
+    if (errno == ERANGE || endptr == token.c_str() || *endptr != '\0') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    uint64_t val = static_cast<uint64_t>(parsed);
+
+    if (val < min_val || val > max_val) {
+        LOG_WARN(
+            "%s=%s invalid (must be in [%" PRIu64 ", %" PRIu64 "]), ignored", name, token.c_str(), min_val, max_val
+        );
+        return false;
+    }
+    if (require_power_of_2 && !is_power_of_2_u64(val)) {
+        LOG_WARN("%s=%s invalid (must be a power of 2), ignored", name, token.c_str());
+        return false;
+    }
+    *out = val;
+    return true;
+}
+
+static void apply_env_ring_values(
+    const char *name, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t out[PTO2_MAX_RING_DEPTH]
+) {
+    const char *env = std::getenv(name);
+    if (!env) return;
+
+    std::string text(env);
+    if (text.find(',') == std::string::npos) {
+        uint64_t value = 0;
+        if (!parse_uint_token(name, text, min_val, max_val, require_power_of_2, &value)) {
+            return;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            out[r] = value;
+        }
+        return;
+    }
+
+    uint64_t parsed[PTO2_MAX_RING_DEPTH]{};
+    size_t pos = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size_t comma = text.find(',', pos);
+        std::string token = text.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+        if (!parse_uint_token(name, token, min_val, max_val, require_power_of_2, &parsed[r])) {
+            return;
+        }
+        if (comma == std::string::npos) {
+            if (r != PTO2_MAX_RING_DEPTH - 1) {
+                LOG_WARN(
+                    "%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env,
+                    PTO2_MAX_RING_DEPTH
+                );
+                return;
+            }
+            pos = text.size();
+        } else {
+            pos = comma + 1;
+        }
+    }
+    if (pos < text.size() || (!text.empty() && text.back() == ',')) {
+        LOG_WARN("%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env, PTO2_MAX_RING_DEPTH);
+        return;
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        out[r] = parsed[r];
+    }
+}
+
+static bool resolve_ring_config(
+    uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool, const uint64_t *ring_task_windows,
+    const uint64_t *ring_heaps, const uint64_t *ring_dep_pools, uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH],
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH], int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    uint64_t dep_pool_values[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        eff_task_window_sizes[r] = PTO2_TASK_WINDOW_SIZE;
+        eff_heap_sizes[r] = PTO2_HEAP_SIZE;
+        dep_pool_values[r] = PTO2_DEP_LIST_POOL_SIZE;
+    }
+
+    apply_env_ring_values("PTO2_RING_TASK_WINDOW", 4, static_cast<uint64_t>(INT32_MAX), true, eff_task_window_sizes);
+    apply_env_ring_values("PTO2_RING_HEAP", 1024, std::numeric_limits<uint64_t>::max(), false, eff_heap_sizes);
+    apply_env_ring_values("PTO2_RING_DEP_POOL", 4, static_cast<uint64_t>(INT32_MAX), false, dep_pool_values);
+
+    if (ring_task_window != 0) {
+        if (ring_task_window < 4 || ring_task_window > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(ring_task_window)) {
+            LOG_ERROR(
+                "runtime_env.ring_task_window=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", ring_task_window
+            );
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            eff_task_window_sizes[r] = ring_task_window;
+        }
+    }
+    if (ring_heap != 0) {
+        if (ring_heap < 1024) {
+            LOG_ERROR("runtime_env.ring_heap=%" PRIu64 " must be >= 1024", ring_heap);
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            eff_heap_sizes[r] = ring_heap;
+        }
+    }
+    if (ring_dep_pool != 0) {
+        if (ring_dep_pool < 4 || ring_dep_pool > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("runtime_env.ring_dep_pool=%" PRIu64 " must be in [4, INT32_MAX]", ring_dep_pool);
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            dep_pool_values[r] = ring_dep_pool;
+        }
+    }
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (ring_task_windows != nullptr && ring_task_windows[r] != 0) {
+            eff_task_window_sizes[r] = ring_task_windows[r];
+        }
+        if (ring_heaps != nullptr && ring_heaps[r] != 0) {
+            eff_heap_sizes[r] = ring_heaps[r];
+        }
+        if (ring_dep_pools != nullptr && ring_dep_pools[r] != 0) {
+            dep_pool_values[r] = ring_dep_pools[r];
+        }
+
+        if (eff_task_window_sizes[r] < 4 || eff_task_window_sizes[r] > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(eff_task_window_sizes[r])) {
+            LOG_ERROR(
+                "ring_task_windows[%d]=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", r, eff_task_window_sizes[r]
+            );
+            return false;
+        }
+        if (eff_heap_sizes[r] < 1024) {
+            LOG_ERROR("ring_heaps[%d]=%" PRIu64 " must be >= 1024", r, eff_heap_sizes[r]);
+            return false;
+        }
+        if (dep_pool_values[r] < 4 || dep_pool_values[r] > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("ring_dep_pools[%d]=%" PRIu64 " must be in [4, INT32_MAX]", r, dep_pool_values[r]);
+            return false;
+        }
+        eff_dep_pool_capacities[r] = static_cast<int32_t>(dep_pool_values[r]);
+    }
+
+    return true;
+}
+
+static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
+    if (runtime == nullptr || host_header == nullptr) {
+        return 0;
+    }
+
+    void *pto2_sm = runtime->get_gm_sm_ptr();
+    if (pto2_sm == nullptr) {
+        return 0;
+    }
+
+    int hdr_rc = runtime->host_api.copy_from_device(host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader));
+    if (hdr_rc != 0) {
+        LOG_WARN("Failed to copy PTO2 header from device");
+        return 0;
+    }
+
+    int32_t orch_error_code = host_header->orch_error_code.load(std::memory_order_relaxed);
+    int32_t sched_error_code = host_header->sched_error_code.load(std::memory_order_relaxed);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);
+}
+
+/**
+ * Stage the per-callable resources (kernel binaries + orchestration SO) into
+ * the supplied runtime so a subsequent bind_callable_to_runtime_impl can use
+ * them. This is the cacheable half of init_runtime_impl: nothing here depends
+ * on per-run argument values, so the prepare_callable / run_prepared split
+ * lets us run this once per callable_id and amortize across runs.
+ *
+ * @param runtime   Pointer to pre-constructed Runtime (host_api populated)
+ * @param callable  ChipCallable carrying the orch SO + child kernel binaries
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int
+prepare_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const void *), CallableArtifacts *out) {
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
+    if (upload_fn == nullptr || out == nullptr) {
+        LOG_ERROR("upload_fn or out is null");
+        return -1;
+    }
+    *out = CallableArtifacts{};
+    out->signature.assign(callable->signature_, callable->signature_ + callable->sig_count());
+
+    LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
+    if (upload_and_collect_child_addrs(callable, upload_fn, &out->kernel_addrs) != 0) {
+        LOG_ERROR("Failed to upload ChipCallable buffer");
+        return -1;
+    }
+    for (const ChildKernelAddr &c : out->kernel_addrs) {
+        if (c.func_id < 0 || c.func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d)", c.func_id, RUNTIME_MAX_FUNC_ID);
+            return -1;
+        }
+    }
+
+    const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
+    size_t orch_so_size = callable->binary_size();
+
+    if (orch_so_binary == nullptr || orch_so_size == 0) {
+        LOG_ERROR("Orchestration SO binary is required for device orchestration");
+        return -1;
+    }
+
+    out->orch_so_data = orch_so_binary;
+    out->orch_so_size = orch_so_size;
+    out->func_name = callable->func_name();
+    out->config_name = callable->config_name();
+    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
+    return 0;
+}
+
+/**
+ * Per-run binding: build device-side argument storage (tensor copy-out, GM
+ * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
+ * callable-side state (kernel binaries, orch SO bytes, func/config names)
+ * is already populated by prepare_callable_impl.
+ *
+ * Splitting this from prepare_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
+ *
+ * @param runtime    Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args  Separated tensor/scalar arguments for this run
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int bind_callable_to_runtime_impl(
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
+    int sig_count, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool,
+    const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools
+) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+    if (orch_args == nullptr) {
+        LOG_ERROR("orch_args pointer is null");
+        return -1;
+    }
+    // trb runs orchestration on the device — there is no host-side orch
+    // function pointer to invoke. The c_api signature accepts one for
+    // symmetry with hbg; assert the trb-side invariant here.
+    if (host_orch_func_ptr != nullptr) {
+        LOG_ERROR("bind_callable_to_runtime_impl: trb does not accept a host_orch_func_ptr");
+        return -1;
+    }
+
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
+
+    int64_t t_total_start = _now_ms();
+
+    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    if (!resolve_ring_config(
+            ring_task_window, ring_heap, ring_dep_pool, ring_task_windows, ring_heaps, ring_dep_pools,
+            eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities
+        )) {
+        return -1;
+    }
+    const std::string task_window_log = format_ring_array(eff_task_window_sizes);
+    const std::string heap_log = format_ring_array(eff_heap_sizes);
+    const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities);
+    LOG_INFO_V0(
+        "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
+        dep_pool_log.c_str()
+    );
+
+    // Build device args: copy from input, replace host tensor pointers with device pointers
+    ChipStorageTaskArgs device_args;
+
+    int64_t t_args_start = _now_ms();
+    for (int i = 0; i < tensor_count; i++) {
+        Tensor t = orch_args->tensor(i);
+
+        if (t.is_child_memory()) {
+            LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
+            device_args.add_tensor(t);
+            continue;
+        }
+
+        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.buffer.addr));
+        size_t size = static_cast<size_t>(t.nbytes());
+
+        void *dev_ptr = runtime->host_api.device_malloc(size);
+        if (dev_ptr == nullptr) {
+            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
+            return -1;
+        }
+
+        // Pure write-only OUTPUT buffers carry no meaningful host content, so
+        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
+        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
+        // rather than pooled-allocator garbage. INOUT (read-before-write)
+        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
+        // did not wire device_memset.
+        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
+        int rc;
+        if (is_pure_output && runtime->host_api.device_memset != nullptr) {
+            rc = runtime->host_api.device_memset(dev_ptr, 0, size);
+        } else {
+            rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
+        }
+        if (rc != 0) {
+            LOG_ERROR("Failed to stage tensor %d to device", i);
+            runtime->host_api.device_free(dev_ptr);
+            return -1;
+        }
+        // Read-only INPUT tensors are never written by the kernel, so there is
+        // no point copying them back D2H at the end. Index the signature
+        // by the orch tensor index `i` (child_memory tensors are skipped above
+        // but do not consume a separate signature slot — scalars follow the
+        // tensor entries). Anything not provably IN keeps the safe default of
+        // copying back.
+        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
+
+        t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
+        device_args.add_tensor(t);
+    }
+    for (int i = 0; i < scalar_count; i++) {
+        device_args.add_scalar(orch_args->scalar(i));
+    }
+    int64_t t_args_end = _now_ms();
+
+    // Read orchestrator-to-scheduler transition flag from environment
+    {
+        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
+        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
+            runtime->orch_to_sched = true;
+        }
+        LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
+    }
+
+    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena all live in a single backing allocation;
+    // setup_static_arena reserves the three regions and commits in one shot.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
+    uint64_t total_heap_size = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (eff_heap_sizes[r] > std::numeric_limits<uint64_t>::max() - total_heap_size) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return -1;
+        }
+        total_heap_size += eff_heap_sizes[r];
+    }
+    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes);
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
+    int64_t t_setup_start = _now_ms();
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
+        LOG_ERROR("Failed to setup pooled static arena");
+        return -1;
+    }
+    int64_t t_setup_end = _now_ms();
+
+    int64_t t_heap_start = _now_ms();
+    void *gm_heap = runtime->host_api.acquire_pooled_gm_heap();
+    int64_t t_heap_end = _now_ms();
+    if (gm_heap == nullptr) {
+        LOG_ERROR("Failed to acquire pooled GM heap");
+        return -1;
+    }
+    runtime->set_gm_heap(gm_heap);
+
+    int64_t t_sm_start = _now_ms();
+    void *sm_ptr = runtime->host_api.acquire_pooled_gm_sm();
+    int64_t t_sm_end = _now_ms();
+    if (sm_ptr == nullptr) {
+        LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
+        return -1;
+    }
+    runtime->set_gm_sm_ptr(sm_ptr);
+
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
+    // Set up device orchestration state
+    runtime->set_orch_args(device_args);
+
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
+    LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
+
+    int64_t t_total_end = _now_ms();
+    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
+    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
+    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
+    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
+    LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
+
+    return 0;
+}
+
+/**
+ * Validate runtime results and cleanup.
+ *
+ * This function:
+ * 1. Copies recorded tensors from device back to host
+ * 2. Frees device memory for recorded tensors
+ * 3. Clears tensor pair state
+ *
+ * @param runtime  Pointer to Runtime
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int validate_runtime_impl(Runtime *runtime) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+
+    int rc = 0;
+
+    LOG_INFO_V0("=== Copying Results Back to Host ===");
+
+    // Copy all recorded tensors from device back to host
+    TensorPair *tensor_pairs = runtime->tensor_pairs_.data();
+    int tensor_pair_count = static_cast<int>(runtime->tensor_pairs_.size());
+
+    LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count);
+
+    // PTO2 (device orchestration): graph output may be in packed buffer
+    uint64_t graph_out_ptr = 0;
+    uint64_t graph_out_size = 0;
+    bool skip_tensor_copy_back = false;
+    int32_t runtime_status = 0;
+    PTO2SharedMemoryHeader host_header;
+    memset(&host_header, 0, sizeof(host_header));
+
+    runtime_status = pto2_read_runtime_status(runtime, &host_header);
+    if (runtime_status != 0) {
+        int32_t orch_error_code = host_header.orch_error_code.load(std::memory_order_relaxed);
+        int32_t sched_error_code = host_header.sched_error_code.load(std::memory_order_relaxed);
+        LOG_ERROR(
+            "PTO2 runtime failed: orch_error_code=%d sched_error_code=%d runtime_status=%d", orch_error_code,
+            sched_error_code, runtime_status
+        );
+        skip_tensor_copy_back = true;
+    } else {
+        graph_out_ptr = host_header.graph_output_ptr;
+        graph_out_size = host_header.graph_output_size;
+        if (graph_out_ptr != 0) {
+            LOG_INFO_V0("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size);
+        }
+    }
+
+    if (skip_tensor_copy_back) {
+        LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status");
+    } else {
+        bool first_output_tensor = true;
+        for (int i = 0; i < tensor_pair_count; i++) {
+            const TensorPair &pair = tensor_pairs[i];
+
+            // Skip if device pointer is null
+            if (pair.dev_ptr == nullptr) {
+                LOG_WARN("Tensor %d has null device pointer, skipping", i);
+                continue;
+            }
+
+            // If host pointer is null, this is a device-only allocation (no copy-back)
+            if (pair.host_ptr == nullptr) {
+                LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i);
+                continue;
+            }
+
+            // Read-only INPUT tensors were uploaded H2D but the kernel never
+            // wrote them — copying them back (potentially ~GB) is pure waste.
+            // They are still device_free'd in the cleanup loop below.
+            if (!pair.needs_copy_back) {
+                LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i);
+                continue;
+            }
+
+            void *src_ptr = pair.dev_ptr;
+            size_t copy_size = pair.size;
+
+            // Use graph_output_ptr for the first output tensor if available
+            if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) {
+                src_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(graph_out_ptr));
+                copy_size = static_cast<size_t>(graph_out_size);
+                LOG_INFO_V0("Using packed output buffer for tensor %d", i);
+                first_output_tensor = false;
+            }
+
+            int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size);
+            if (copy_rc != 0) {
+                LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
+                rc = copy_rc;
+            } else {
+                LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, pair.size);
+            }
+        }
+    }
+
+    // Cleanup device tensors
+    LOG_INFO_V0("=== Cleaning Up ===");
+    for (int i = 0; i < tensor_pair_count; i++) {
+        if (tensor_pairs[i].dev_ptr != nullptr) {
+            runtime->host_api.device_free(tensor_pairs[i].dev_ptr);
+        }
+    }
+    LOG_INFO_V0("Freed %d device allocations", tensor_pair_count);
+
+    // Clear the per-run dispatch-table entries staged by prepare_callable_impl.
+    // The underlying chip-callable device buffer is pool-managed by
+    // DeviceRunner (keyed by content hash) and bulk-freed in
+    // DeviceRunner::finalize(); re-running the same callable repeatedly
+    // should not re-upload.
+    int kernel_count = runtime->get_registered_kernel_count();
+    for (int i = 0; i < kernel_count; i++) {
+        int func_id = runtime->get_registered_kernel_func_id(i);
+        runtime->set_function_bin_addr(func_id, 0);
+    }
+    if (kernel_count > 0) {
+        LOG_INFO_V0("Cleared %d kernel dispatch-table entries", kernel_count);
+    }
+    runtime->clear_registered_kernels();
+
+    // Clear tensor pairs
+    runtime->tensor_pairs_.clear();
+
+    LOG_INFO_V0("=== Finalize Complete ===");
+
+    if (rc == 0 && runtime_status != 0) {
+        rc = runtime_status;
+    }
+
+    return rc;
+}
+
+// Strong override of the weak runtime_apply_example_exec_time hook declared in
+// pto_runtime_c_api.h. fully_distributed_within_core is the only runtime that
+// implements the sim-only trace-driven replay feature: stash the per-func
+// reference durations on the Runtime so execute_slot busy-waits
+// example_exec_time_ns_[func_id] in place of the real incore kernel. A func
+// left at 0 (or func_id beyond the table) still runs for real. See
+// call_config.h::use_example_exec_time.
+extern "C" void
+runtime_apply_example_exec_time(void *runtime, int use_example_exec_time, const int32_t *example_exec_time_ns) {
+    Runtime *rt = static_cast<Runtime *>(runtime);
+    rt->use_example_exec_time_ = (use_example_exec_time != 0);
+    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; ++i) {
+        rt->example_exec_time_ns_[i] =
+            (use_example_exec_time != 0 && example_exec_time_ns != nullptr && i < CALLCONFIG_MAX_EXAMPLE_FUNCS) ?
+                example_exec_time_ns[i] :
+                0;
+    }
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp b/src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp
new file mode 100644
index 000000000..c4878a1c2
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "common.h"
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
+
+#include <array>
+#include <cstring>
+#include <vector>
+#endif
+
+struct PTO2Runtime;
+
+// Unified-log error sink. Forward-declared here rather than pulled via
+// common/unified_log.h: that header lives under common/log/include, which is
+// not on the orchestration .so build's include path. The symbol resolves at
+// link time for the runtime targets, and at dlopen time for the orchestration
+// .so (against the executor's unified_log_device), so onboard diagnostics still
+// reach the CANN device log.
+extern "C" void unified_log_error(const char *func, const char *fmt, ...);
+
+namespace {
+// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
+// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
+// between execution rounds.  All orchestrator threads bind the same rt
+// value, so per-thread storage is unnecessary.
+PTO2Runtime *g_current_runtime = nullptr;
+}  // namespace
+
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
+    g_current_runtime = rt;
+}
+
+// Keep current_runtime local to this .so so orchestration helpers do not
+// accidentally bind to the AICPU binary's same-named symbol.
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
+
+/**
+ * Use addr2line to convert an address to file:line information.
+ * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
+ * If inlining is present, also returns the outer call chain via inline_chain.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+
+    FILE *pipe = popen(cmd, "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
+        return "";
+    }
+
+    // Split by lines
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size()) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r')
+            line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    // First line is the innermost actual code location; subsequent lines are outer inline callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+
+    return lines.front();
+}
+#endif
+
+/**
+ * Get current stack trace information (including file paths and line numbers).
+ * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
+ */
+std::string get_stacktrace(int skip_frames) {
+    (void)skip_frames;  // May be unused on non-Linux platforms
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;
+    void *buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char **symbols = backtrace_symbols(buffer, nframes);
+
+    if (symbols) {
+        result = "Stack trace:\n";
+        for (int i = skip_frames; i < nframes; i++) {
+            std::string frame_info;
+
+            void *addr = (void *)((char *)buffer[i] - 1);
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
+                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+
+                if (addr2line_result.empty()) {
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+
+            if (frame_info.empty()) {
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);
+    }
+#else
+    result = "(Stack trace is only available on Linux)\n";
+#endif
+    return result;
+}
+
+// AssertionError constructor
+static std::string build_assert_message(const char *condition, const char *file, int line) {
+    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
+    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
+    msg += get_stacktrace(3);
+    return msg;
+}
+
+AssertionError::AssertionError(const char *condition, const char *file, int line) :
+    std::runtime_error(build_assert_message(condition, file, line)),
+    condition_(condition),
+    file_(file),
+    line_(line) {}
+
+[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
+    // Use unified_log_error directly rather than the LOG_ERROR macro: that macro
+    // lives in pto_orchestration_api.h and expands to
+    // current_runtime()->ops->log_error, but the ops table's definition pulls in
+    // pto_types.h (Arg → __aicore__-only to_u64), which the AICore build of this
+    // TU cannot compile. unified_log_error reaches the same sink without that
+    // dependency.
+    unified_log_error(__FUNCTION__, "\n========================================");
+    unified_log_error(__FUNCTION__, "Assertion failed: %s", condition);
+    unified_log_error(__FUNCTION__, "Location: %s:%d", file, line);
+    unified_log_error(__FUNCTION__, "%s", get_stacktrace(2).c_str());
+    unified_log_error(__FUNCTION__, "========================================\n");
+
+    throw AssertionError(condition, file, line);
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h
new file mode 100644
index 000000000..863bed92d
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Convenience layer over Arg: bundles a fixed-capacity dependency buffer with
+ * an Arg and exposes an incremental add_dep(...) API on top of the runtime
+ * primitive L0TaskArgs::set_dependencies(ptr, count).
+ *
+ * Layering:
+ *   - Primitive:   Arg + set_dependencies(ptr, count) in pto_types.h.
+ *                  No cap, caller owns the deps buffer.
+ *   - Convenience: L0TaskArgsWithDeps<N> in this header. Owns a stack-sized dep
+ *                  buffer of capacity N (default 16); provides add_dep().
+ *                  Submitted via the rt_submit_*_task overloads below, which
+ *                  forward the bundled deps into the underlying Arg.
+ *
+ * This file is auto-included at the bottom of pto_orchestration_api.h so
+ * orchestration sources see L0TaskArgsWithDeps after a single `#include
+ * "pto_orchestration_api.h"`. The split is purely organizational —
+ * orchestration code should not include this header directly. Code generated
+ * from pypto can ignore the convenience layer entirely and target Arg +
+ * set_dependencies(ptr, count) directly.
+ *
+ * L0TaskArgsWithDeps uses private inheritance from Arg so that set_dependencies and
+ * the explicit_dep* accessors are NOT reachable on a wrapper instance — users
+ * who pick the convenience layer cannot accidentally mix it with the
+ * primitive layer's dep API on the same object.
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "pto_orchestration_api.h"  // Arg, MixedKernels, rt_submit_* primitives
+
+template <size_t MAX_DEP_COUNT = 16>
+class L0TaskArgsWithDeps : private L0TaskArgs {
+public:
+    // Tensor / scalar setters — forward to Arg
+    using L0TaskArgs::add_inout;
+    using L0TaskArgs::add_input;
+    using L0TaskArgs::add_no_dep;
+    using L0TaskArgs::add_output;
+    using L0TaskArgs::add_scalar;
+    using L0TaskArgs::add_scalars;
+    using L0TaskArgs::add_scalars_i32;
+    using L0TaskArgs::allow_early_resolve;  // speculative early-dispatch hint (getter)
+    using L0TaskArgs::copy_scalars_from;
+    using L0TaskArgs::set_allow_early_resolve;  // speculative early-dispatch hint (setter)
+
+    // Error / status — forward to Arg
+    using L0TaskArgs::error_msg;
+    using L0TaskArgs::has_error;
+    using L0TaskArgs::launch_spec;
+    using L0TaskArgs::set_error;
+
+    // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep,
+    // explicit_deps_data — these are the primitive-layer dep API. Users of
+    // the convenience layer reach dependencies only through add_dep() below.
+
+    /**
+     * Append one or more dependencies to the bundled buffer. May be called
+     * multiple times; deps accumulate. Variadic accepts any non-zero number
+     * of PTO2TaskId arguments.
+     *
+     * Overflow (more than MAX_DEP_COUNT total) records an error on the
+     * underlying Arg; the error surfaces at submit time.
+     */
+    template <typename... Ids>
+    void add_dep(Ids... ids) {
+        static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required");
+        static_assert(
+            (std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId"
+        );
+        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) {
+            L0TaskArgs::set_error(
+                "L0TaskArgsWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)"
+            );
+            return;
+        }
+        ((deps_[count_++] = ids), ...);
+    }
+
+    /**
+     * Clear the bundled dep buffer and reset the underlying Arg.
+     * Use this to recycle an L0TaskArgsWithDeps across loop iterations.
+     */
+    void reset() {
+        L0TaskArgs::reset();
+        count_ = 0;
+    }
+
+    /**
+     * Submit-only hook: bind the bundled deps onto the underlying Arg and
+     * return it as Arg&. Called by the rt_submit_*_task overloads below;
+     * orchestration code does not invoke this directly.
+     *
+     * Idempotent: explicitly clears any prior dep binding before re-setting,
+     * so a wrapper can be re-finalized (e.g. resubmitted) without tripping
+     * the primitive layer's single-shot check.
+     */
+    L0TaskArgs &finalize_for_submit() {
+        L0TaskArgs::set_dependencies(nullptr, 0);
+        L0TaskArgs::set_dependencies(deps_, count_);
+        return *this;
+    }
+
+private:
+    PTO2TaskId deps_[MAX_DEP_COUNT];
+    uint32_t count_ = 0;
+};
+
+// =============================================================================
+// Submit overloads — accept L0TaskArgsWithDeps<N> transparently
+// =============================================================================
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_task(mixed_kernels, awd.finalize_for_submit());
+}
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_aic_task(kernel_id, awd.finalize_for_submit());
+}
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit());
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h
new file mode 100644
index 000000000..b07c94926
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Orchestration API - Slim header for orchestration .so files
+ *
+ * This header provides everything an orchestration source needs without
+ * pulling in runtime implementation headers.  The orchestration .so has
+ * zero link dependencies on runtime .cpp files; all runtime calls go
+ * through the PTO2RuntimeOps function-pointer table embedded in
+ * PTO2Runtime.
+ *
+ * Orchestration sources include ONLY this header:
+ *   #include "pto_orchestration_api.h"
+ *
+ * Runtime sources continue to use pto_runtime2.h (which defines the
+ * full PTO2Runtime struct with all internal fields).
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+// Type headers needed by orchestration
+#include "common.h"              // framework_bind_runtime / framework_current_runtime
+#include "pto_runtime2_types.h"  // PTO2_ERROR_*
+#include "pto_submit_types.h"    // MixedKernels, INVALID_KERNEL_ID, subtask slots
+#include "pto_types.h"           // Arg, TaskOutputTensors, TensorArgType
+#include "task_args.h"           // ChipStorageTaskArgs, Tensor
+#include "tensor.h"              // Tensor, TensorCreateInfo
+
+// =============================================================================
+// Tensor Factory Helpers
+// =============================================================================
+
+// make_tensor_external(...) — canonical factory for pre-allocated external
+// memory — is defined in the unified tensor.h (common), so host and runtime
+// build Tensors through the same controlled path.
+
+// =============================================================================
+// Ops Table and Opaque Runtime
+// =============================================================================
+
+/**
+ * Forward declaration — the orchestration sees PTO2Runtime as a partial
+ * struct whose first field is the ops pointer.  The full definition
+ * lives in pto_runtime2.h (used only by runtime .cpp files).
+ */
+typedef struct PTO2Runtime PTO2Runtime;
+
+/**
+ * Function-pointer table for runtime operations.
+ * Populated by the runtime; called by orchestration through inline wrappers.
+ */
+typedef struct PTO2RuntimeOps {
+    TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    void (*scope_begin)(PTO2Runtime *rt);
+    void (*scope_end)(PTO2Runtime *rt);
+    void (*orchestration_done)(PTO2Runtime *rt);
+    bool (*is_fatal)(PTO2Runtime *rt);
+    void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+    // Logging (populated by runtime, called by orchestration)
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+
+    // Cross-layer data access (orchestration reads/writes tensor values via runtime)
+    // Placed after logging to avoid shifting hot-path field offsets.
+    uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+    void (*set_tensor_data)(
+        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
+    );
+    TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
+    TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
+
+    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
+    // collector can log it. Always present to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
+} PTO2RuntimeOps;
+
+/**
+ * Partial PTO2Runtime definition for orchestration.
+ *
+ * Exposes the ops pointer (for runtime calls) and pending_scope_mode
+ * (read directly by inline scope wrappers).  The real struct (in
+ * pto_runtime2.h) has the same first fields, so accessing them through
+ * this definition is well-defined (C struct layout guarantee).
+ */
+struct PTO2Runtime {
+    const PTO2RuntimeOps *ops;
+    PTO2ScopeMode pending_scope_mode;
+};
+
+// =============================================================================
+// Inline Convenience Wrappers (call through ops table)
+// =============================================================================
+
+static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); }
+
+static inline TaskOutputTensors alloc_tensors(const L0TaskArgs &args) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    return rt->ops->alloc_tensors(rt, args);
+}
+
+static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    L0TaskArgs args;
+    for (uint32_t i = 0; i < count; i++) {
+        args.add_output(create_infos[i]);
+    }
+    if (args.has_error) {
+        rt->ops->report_fatal(
+            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+    return alloc_tensors(args);
+}
+
+template <typename... CIs>
+static inline TaskOutputTensors alloc_tensors(const CIs &...cis) {
+    static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo");
+    static_assert(
+        (std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...),
+        "alloc_tensors only accepts TensorCreateInfo arguments"
+    );
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    L0TaskArgs args;
+    (args.add_output(cis), ...);
+    if (args.has_error) {
+        rt->ops->report_fatal(
+            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+    return alloc_tensors(args);
+}
+
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    return rt->ops->submit_task(rt, mixed_kernels, args);
+}
+
+/**
+ * Convenience wrapper: submit an AIC-only task.
+ */
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const L0TaskArgs &args) {
+    MixedKernels mk;
+    mk.aic_kernel_id = kernel_id;
+    return rt_submit_task(mk, args);
+}
+
+/**
+ * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
+ */
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const L0TaskArgs &args) {
+    MixedKernels mk;
+    mk.aiv0_kernel_id = kernel_id;
+    return rt_submit_task(mk, args);
+}
+
+/**
+ * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task
+ * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any
+ * AICore kernel. The task still participates in the dependency graph: it
+ * waits on its fanin and notifies its fanout. Useful as a synchronization
+ * barrier or as a placeholder producer for tests / dep-graph wiring.
+ */
+static inline TaskOutputTensors rt_submit_dummy_task(const L0TaskArgs &args) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    return rt->ops->submit_dummy_task(rt, args);
+}
+
+static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return;
+    }
+    rt->pending_scope_mode = mode;
+    rt->ops->scope_begin(rt);
+}
+
+static inline void rt_scope_end() {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return;
+    }
+    rt->ops->scope_end(rt);
+}
+
+static inline void rt_orchestration_done() {
+    PTO2Runtime *rt = current_runtime();
+    rt->ops->orchestration_done(rt);
+}
+
+static inline bool rt_is_fatal() {
+    PTO2Runtime *rt = current_runtime();
+    return rt->ops->is_fatal(rt);
+}
+
+#define rt_report_fatal(code, fmt, ...)                                          \
+    do {                                                                         \
+        PTO2Runtime *_rt = current_runtime();                                    \
+        _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \
+    } while (0)
+
+// =============================================================================
+// Logging Macros for Orchestration (call through ops table)
+// =============================================================================
+
+#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
+#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
+#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
+
+// INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default.
+#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
+
+// =============================================================================
+// Cross-Layer Data Access
+// =============================================================================
+
+/**
+ * Read a value from a tensor at the given multi-dimensional indices.
+ *
+ * Default T = uint64_t preserves old behavior (raw bits).
+ * Specify T to get automatic type conversion:
+ *
+ *   uint64_t raw = get_tensor_data(tensor, 1, idx);       // old usage unchanged
+ *   float val = get_tensor_data<float>(tensor, 1, idx);   // typed read
+ *
+ * If the tensor has a producer in TensorMap, spin-waits until the producer
+ * task completes before reading. External tensors (make_tensor_external)
+ * are read immediately without waiting.
+ */
+template <typename T = uint64_t>
+static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return from_u64<T>(0);
+    }
+    return from_u64<T>(rt->ops->get_tensor_data(rt, tensor, ndims, indices));
+}
+
+/**
+ * Write a value to a tensor at the given multi-dimensional indices.
+ *
+ * Type is deduced from value argument; uint64_t by default:
+ *
+ *   set_tensor_data(tensor, 1, idx, raw_u64);     // old usage unchanged
+ *   set_tensor_data(tensor, 1, idx, 42.0f);       // typed write (T = float)
+ *
+ * If the tensor has a producer in TensorMap, spin-waits until the producer
+ * and all its consumers complete before writing (WAW + WAR safety).
+ * External tensors (make_tensor_external) with no TensorMap entry are
+ * written immediately without waiting.
+ *
+ * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers
+ * that used the tensor as INPUT. If a kernel reads this tensor as INPUT
+ * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data
+ * cannot detect the reader and may cause a data race.
+ *
+ * To ensure WAR safety for all access patterns, use add_inout() instead of
+ * add_input() for kernel parameters that may later be written via
+ * set_tensor_data. INOUT creates a TensorMap entry that enables automatic
+ * consumer tracking via fanout_refcount.
+ *
+ * The tensor must already have an allocated buffer (addr != 0).
+ * For runtime-created outputs, call this only on the Tensor returned by
+ * add_output(TensorCreateInfo) after submit returns.
+ */
+template <typename T = uint64_t>
+static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return;
+    }
+    rt->ops->set_tensor_data(rt, tensor, ndims, indices, to_u64(value));
+}
+
+// =============================================================================
+// C++ Scope Guards and Macros
+// =============================================================================
+
+/**
+ * RAII Scope Guard (calls through ops table)
+ */
+class PTO2ScopeGuard {
+public:
+    explicit PTO2ScopeGuard(
+        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
+    ) :
+        rt_(current_runtime()) {
+        if (!rt_->ops->is_fatal(rt_)) {
+            rt_->pending_scope_mode = mode;
+            if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);
+            rt_->ops->scope_begin(rt_);
+        }
+    }
+    ~PTO2ScopeGuard() {
+        if (!rt_->ops->is_fatal(rt_)) {
+            rt_->ops->scope_end(rt_);
+        }
+    }
+
+private:
+    PTO2Runtime *rt_;
+};
+
+#define _PTO2_CONCATENATE_IMPL(x, y) x##y
+#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y)
+
+#define PTO2_SCOPE_GUARD() [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)
+
+/**
+ * Scoped block macro:
+ *   PTO2_SCOPE() {
+ *       rt_submit_task(...);
+ *   }
+ */
+#define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true)
+
+// =============================================================================
+// Orchestration Config
+// =============================================================================
+
+/**
+ * Configuration exported by orchestration .so via aicpu_orchestration_config().
+ * The executor reads these values to set up shared memory and runtime.
+ *
+ * This struct is defined identically in pto_runtime2.h (with an include
+ * guard) so the executor can use the same type without including this header.
+ */
+#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
+#define PTO2_ORCHESTRATION_CONFIG_DEFINED
+struct PTO2OrchestrationConfig {
+    int expected_arg_count;
+};
+#endif
+
+// Convenience layer (L0TaskArgsWithDeps<N> + matching rt_submit_*_task overloads).
+// Pulled in at the bottom so the wrapper sees L0TaskArgs, MixedKernels, and the
+// rt_submit_*_task primitives defined above. Orchestration sources include
+// only this single header to access both the primitive and convenience APIs.
+#include "pto_arg_with_deps.h"  // NOLINT(build/include_subdir)
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h
new file mode 100644
index 000000000..0f73a043a
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
+
+#include <atomic>
+#include <cstdint>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_constants.h"
+#include "pto_task_id.h"
+
+// AICPU-only MPSC ring used to convey deferred-completion observations from
+// FIN-handling scheduler threads to the dispatch thread. Producers push under
+// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList::
+// busy) drains in seq order. Kernel-side code never touches this struct —
+// AICore writes go into DeferredCompletionSlab (see
+// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens
+// into messages here, and forwards.
+
+#define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u
+#define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)
+
+static_assert(
+    (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0,
+    "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two"
+);
+
+// Mailbox message discriminator. CONDITION carries one deferred-completion
+// observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE
+// carries the slot_state pointer in `addr` so the consumer can finalize the
+// AsyncWaitEntry.slot_state binding for tasks whose conditions arrived
+// before the FIN thread saw task_complete. New kinds may be added in future
+// without growing the message — the `_pad[5]` slack is reserved for
+// kind-specific payload extension.
+#define MSG_KIND_CONDITION 0u
+#define MSG_KIND_TASK_NORMAL_DONE 1u
+
+struct AICoreCompletionMailboxMessage {
+    // Per-slot ready flag. Producer publishes `tail+1` after filling the rest
+    // of the slot with a release store; consumer waits for the matching seq
+    // value with an acquire load. The release-acquire pair publishes all
+    // other fields below as a side effect, so they stay plain.
+    std::atomic<uint64_t> seq;
+    PTO2TaskId task_token;
+    // CONDITION: completion observation addr (counter / SDMA event record).
+    // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer
+    //   so it can finalize the AsyncWaitEntry.slot_state binding.
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint32_t kind;
+    uint32_t _pad[5];
+};
+
+static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift");
+static_assert(
+    sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+    "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold"
+);
+static_assert(
+    std::atomic<uint64_t>::is_always_lock_free,
+    "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target"
+);
+
+// POD view of a drained message. `seq` is the ring's publication flag, not
+// payload, so try_pop copies out only the fields below (and seq is not even
+// copyable — it is a std::atomic).
+struct AICoreCompletionMsgView {
+    PTO2TaskId task_token{PTO2TaskId::invalid()};
+    uint64_t addr{0};
+    uint32_t expected_value{0};
+    uint32_t engine{0};
+    int32_t completion_type{0};
+    uint32_t kind{0};
+};
+
+struct AICoreCompletionMailbox {
+    // head and tail live on their own cache lines so producer CAS contention
+    // on head can't false-share with the consumer's tail updates.
+    alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> head;
+    uint8_t _head_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)];
+    alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> tail;
+    uint8_t _tail_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)];
+    alignas(PTO2_ALIGN_SIZE) AICoreCompletionMailboxMessage entries[AICORE_COMPLETION_MAILBOX_CAPACITY];
+
+    // Cheap, lock-free pending hint. Callers may invoke this outside the
+    // consumer lock; a stale answer only over/under-triggers a drain attempt.
+    bool has_pending() { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); }
+
+    // MPSC push for a CONDITION message. Returns false when the ring is full
+    // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry.
+    // Lock-free: CAS the shared head to claim a slot, write the fields, then
+    // release-store seq so the single consumer observes the publication.
+    //
+    // The head CAS is relaxed: head is a pure ticket counter and carries no
+    // data to the consumer — publication is solely the seq release-store, and
+    // slot-reuse safety rests on the acquire load of tail. The relaxed failure
+    // order is likewise sufficient since a lost CAS just re-reads head and
+    // retries. compare_exchange_weak is used because this loop already re-reads
+    // head and re-checks fullness, so masking LL/SC spurious failures (what
+    // _strong adds on aarch64) would only be a redundant inner retry.
+    //
+    // Safe to call concurrently from any number of producers; structurally
+    // independent of the AsyncWaitList::busy lock.
+    bool try_push_condition(
+        PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type
+    ) {
+        while (true) {
+            uint64_t h = head.load(std::memory_order_relaxed);
+            uint64_t t = tail.load(std::memory_order_acquire);
+            if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
+            uint64_t new_head = h + 1;
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
+                slot->task_token.raw = task_token.raw;
+                slot->addr = addr;
+                slot->expected_value = expected_value;
+                slot->engine = engine;
+                slot->completion_type = completion_type;
+                slot->kind = MSG_KIND_CONDITION;
+                slot->seq.store(new_head, std::memory_order_release);
+                return true;
+            }
+            // CAS lost: another producer claimed the slot, retry with refreshed head.
+        }
+    }
+
+    // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState
+    // pointer in the `addr` field so the consumer can finish binding the
+    // AsyncWaitEntry.slot_state without going back to the FIN-handling thread.
+    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) {
+        while (true) {
+            uint64_t h = head.load(std::memory_order_relaxed);
+            uint64_t t = tail.load(std::memory_order_acquire);
+            if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
+            uint64_t new_head = h + 1;
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
+                slot->task_token.raw = task_token.raw;
+                slot->addr = slot_state_addr;
+                slot->expected_value = 0;
+                slot->engine = 0;
+                slot->completion_type = 0;
+                slot->kind = MSG_KIND_TASK_NORMAL_DONE;
+                slot->seq.store(new_head, std::memory_order_release);
+                return true;
+            }
+        }
+    }
+
+    // Single-consumer transport-level dequeue (caller holds the consumer lock).
+    // Returns false at the first not-yet-published slot (gap) or when empty;
+    // otherwise copies the next message in tail order into `out`, advances
+    // tail, and returns true. tail is consumer-only-written (relaxed read);
+    // head bounds the scan (relaxed); the seq acquire is the real publication
+    // gate; the tail release publishes "slot free" to reusing producers.
+    bool try_pop(AICoreCompletionMsgView &out) {
+        uint64_t t = tail.load(std::memory_order_relaxed);
+        uint64_t h = head.load(std::memory_order_relaxed);
+        if (t >= h) return false;
+        AICoreCompletionMailboxMessage *slot = &entries[t & AICORE_COMPLETION_MAILBOX_MASK];
+        if (slot->seq.load(std::memory_order_acquire) != t + 1) return false;
+        out.task_token.raw = slot->task_token.raw;
+        out.addr = slot->addr;
+        out.expected_value = slot->expected_value;
+        out.engine = slot->engine;
+        out.completion_type = slot->completion_type;
+        out.kind = slot->kind;
+        tail.store(t + 1, std::memory_order_release);
+        return true;
+    }
+};
+
+static_assert(
+    sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned"
+);
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h
new file mode 100644
index 000000000..da0d89ad7
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
+
+#include <stdint.h>
+
+#include "pto_constants.h"
+
+// Types shared across the AICore↔AICPU boundary.
+//
+// This header is reachable from AICore-side translation units (via
+// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h)
+// and must stay parseable by every AICore toolchain configuration: no
+// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct.
+//
+// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in
+// aicore_completion_mailbox.h, which is AICPU-only.
+
+inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
+
+#define COMPLETION_ENGINE_SDMA 0u
+#define COMPLETION_ENGINE_ROCE 1u
+#define COMPLETION_ENGINE_URMA 2u
+#define COMPLETION_ENGINE_CCU 3u
+
+#define COMPLETION_TYPE_COUNTER 0
+#define COMPLETION_TYPE_SDMA_EVENT_RECORD 1
+
+// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch
+// area that AICore writes into to record "this completion has to be observed
+// before the task can retire." The FIN-handling scheduler thread reads the
+// slab, flattens entries into AICoreCompletionMailbox messages, and forwards
+// them to the dispatch thread. `volatile` here is load-bearing: writers live
+// on AICore and readers on AICPU, so the qualifier is the correct way to
+// pin the compiler against caching / reordering on either side.
+struct DeferredCompletionEntry {
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint32_t _pad;
+};
+
+static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift");
+
+struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab {
+    volatile uint32_t count;
+    volatile int32_t error_code;
+    DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK];
+};
+
+static_assert(
+    sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0,
+    "DeferredCompletionSlab size must preserve array element cache-line boundaries"
+);
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h
new file mode 100644
index 000000000..49ee7cc11
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include <pto/comm/async_common/async_event_impl.hpp>
+#include <pto/npu/comm/async/sdma/sdma_async_intrin.hpp>
+
+#include "pto_async_kernel_api.h"
+#include "aicore_completion_mailbox_types.h"
+#include "pto_runtime_status.h"
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+#ifndef __gm__
+#define __gm__
+#endif
+
+// Re-exposed PTO-ISA constant so examples / callers don't need to include
+// <pto/npu/comm/async/sdma/sdma_types.hpp> just to spell their scratch tile.
+inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE;
+
+enum class SdmaOp : uint8_t {
+    TGET = 0,
+    TPUT = 1,
+};
+
+// SdmaRequestDescriptor bundles everything send_request_entry needs to drive
+// one SDMA transfer + completion registration. It is a template because the
+// destination / source / scratch types carry tensor shape & stride at compile
+// time; the SdmaTget() / SdmaTput() helpers below let callers skip the
+// template arguments.
+//
+// sync_id selects which event-record slot inside the workspace the engine
+// writes into. Concurrent dispatches must use distinct sync_ids; today every
+// caller submits one request per kernel invocation so passing 0 is safe.
+// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2)
+// will fold sync_id allocation into the adapter.
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+struct SdmaRequestDescriptor {
+    SdmaOp op;
+    DstTensor dst;
+    SrcTensor src;
+    ScratchTileT scratch;
+    __gm__ uint8_t *workspace;
+    uint32_t sync_id;
+};
+
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(
+    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
+    uint32_t sync_id = 0
+) {
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst,       src,
+                                                                     scratch,      workspace, sync_id};
+}
+
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(
+    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
+    uint32_t sync_id = 0
+) {
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst,       src,
+                                                                     scratch,      workspace, sync_id};
+}
+
+namespace pto2::detail {
+
+inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) {
+    CompletionToken token{
+        reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0
+    };
+    (void)register_completion_condition(ctx, token);
+}
+
+template <typename PtoAsyncEvent, typename PtoAsyncSession>
+inline __aicore__ void
+register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+        (void)event.Wait(session);
+        return;
+    }
+    if (event.handle == 0) {
+        return;
+    }
+
+    const uint32_t engine = static_cast<uint32_t>(event.engine);
+    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA)) {
+        defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return;
+    }
+
+    ::pto::comm::sdma::detail::UbTmpBuf tmp_buf;
+    uint32_t sync_id = 0;
+    __gm__ uint8_t *recv_workspace = nullptr;
+    uint32_t queue_num = 0;
+    if (!::pto::comm::sdma::detail::PrepareEventCheck(
+            session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num
+        )) {
+        defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return;
+    }
+    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) {
+        register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
+    }
+}
+
+}  // namespace pto2::detail
+
+// SDMA overload of the runtime's send_request_entry. Submits the descriptor
+// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the
+// AsyncCtx deferred-wait slab and flushes. Returns false on submit/session
+// failure (also records the error in ctx.completion_error_code).
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ bool
+send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc) {
+    pto::comm::AsyncSession session;
+    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) {
+        pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return false;
+    }
+
+    pto::comm::AsyncEvent event;
+    if (desc.op == SdmaOp::TGET) {
+        event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
+    } else {
+        event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
+    }
+    pto2::detail::register_pto_async_event(ctx, event, session);
+    pto2::detail::defer_flush(ctx);
+    return true;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h
new file mode 100644
index 000000000..689219c35
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "aicpu/platform_regs.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_completion_token.h"
+#include "pto_runtime_status.h"
+
+// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only
+// allowed holder of this ABI knowledge; the generic scheduler dispatches into
+// the helpers below through the completion ops table.
+struct SdmaEventRecord {
+    uint32_t flag;
+    uint32_t sq_tail;
+    uint64_t channel_info;
+};
+
+static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift");
+static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift");
+
+inline uintptr_t sdma_completion_cache_line(const volatile void *addr) {
+    return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+}
+
+inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) {
+    if (record_addr == 0) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    volatile SdmaEventRecord *record =
+        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
+    uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE);
+    return {flag != 0 ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
+}
+
+inline void retire_sdma_event_record(uint64_t record_addr) {
+    if (record_addr == 0) return;
+    volatile SdmaEventRecord *record =
+        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
+    uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE);
+    uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE);
+
+    volatile uint64_t *record_head = reinterpret_cast<volatile uint64_t *>(record);
+    __atomic_store_n(record_head, 0ULL, __ATOMIC_RELEASE);
+    cache_flush_range(const_cast<const void *>(reinterpret_cast<volatile void *>(record_head)), sizeof(uint64_t));
+
+    if (channel_info_addr == 0) return;
+    uint64_t packed = (static_cast<uint64_t>(completed_tail) << 32) | static_cast<uint64_t>(completed_tail);
+    volatile uint64_t *channel_info = reinterpret_cast<volatile uint64_t *>(static_cast<uintptr_t>(channel_info_addr));
+    __atomic_store_n(channel_info, packed, __ATOMIC_RELEASE);
+    cache_flush_range(const_cast<const void *>(reinterpret_cast<volatile void *>(channel_info)), sizeof(uint64_t));
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/common.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/common.h
new file mode 100644
index 000000000..9dcf438ed
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/common.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Assertion macros (always_assert / debug_assert), AssertionError, and the
+// MAYBE_UNINITIALIZED diagnostics live in the shared header so the unified
+// Tensor (src/common/task_interface/tensor.h) can use them without depending
+// on this runtime-specific header. assert_impl / get_stacktrace are defined in
+// orchestration/common.cpp for runtime targets.
+#include "assert_compat.h"
+
+// Framework-internal TLS bridge. The executor binds the current thread's
+// runtime before invoking the orchestration entry, so orchestration helpers can
+// fetch the current PTO2Runtime without explicit parameter threading. Declared
+// here (rather than in pto_orchestration_api.h) so framework TUs the AICore
+// build also compiles — notably orchestration/common.cpp — see these symbols
+// without pulling in pto_types.h, whose Arg::add_scalar → to_u64 path is
+// __aicore__-only and would break the ccec build.
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct PTO2Runtime;
+PTO2Runtime *framework_current_runtime(void);
+void framework_bind_runtime(PTO2Runtime *rt);
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp
new file mode 100644
index 000000000..6a466b830
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp
@@ -0,0 +1,2097 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * fully_distributed_within_core engine.
+ *
+ * SPMD orchestration + scheduling + execution on the AI cores. See
+ * docs/fully_distributed_within_core.md for the authoritative design and
+ * src/.../docs/RUNTIME_LOGIC.md for the local overview.
+ *
+ * Each AICore worker thread runs dist_core_main(), which:
+ *   1. replays the full orchestration submit stream (every core builds an
+ *      identical per-core TensorMap and computes identical deterministic GM
+ *      output-heap addresses; only ownership differs);
+ *   2. on each rt_submit_*, races to claim the task on one of two global
+ *      cursors (cube for AIC-anchored, vector for AIV-only). The winner is
+ *      owner = builder = executor and builds the task into its private ring;
+ *   3. runs an EXECUTE-FIRST run-ahead loop: on every submit point it first
+ *      drains ready owned tasks (and pulls follower deposits), THEN claims at
+ *      most this one new task. Because claim+build is fast but execute is slow,
+ *      interleaving execution with claiming stops a fast core from greedily
+ *      claiming a full ring of consecutive tasks: while it executes a long task
+ *      other cores advance the cursor and claim subsequent tasks (load balance,
+ *      see docs §6/§6.1). The ring (small, kPrivateSlots) only back-pressures
+ *      when genuinely full of not-yet-ready tasks. After orchestration returns,
+ *      a final loop drains the ring to completion. A task is ready once all its
+ *      fan-in producers have set their entry in the global completion-flag
+ *      ring; on completion the owner sets its own flag (release).
+ *
+ * This file is compiled into the AICPU .so (build_config aicore source_dirs do
+ * not include runtime/), but dist_core_main runs ON the AICore worker threads
+ * (invoked through a function pointer), so kernels execute on AICore threads
+ * with their sim TLS in place.
+ *
+ * M2 scope: single-core tasks (1C / 1V) only — sufficient for benchmark_bgemm.
+ * Multi-core co-ownership (MIX / 2V, block.won) is M3; GM heap reclamation is
+ * M4. A MIX task encountered in M2 raises a fatal error.
+ */
+
+#include "dist_engine.h"
+
+#include <atomic>
+#include <chrono>
+#include <csignal>
+#include <ctime>
+#include <cstdarg>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "callable.h"
+#include "common/core_type.h"
+#include "intrinsic.h"
+#include "pto2_dispatch_payload.h"
+#include "pto_constants.h"
+#include "pto_runtime2.h"
+#include "pto_submit_types.h"
+#include "pto_types.h"
+#include "runtime.h"
+#include "spin_hint.h"
+#include "tensor.h"
+#include "tensor_create_info.h"
+
+// -----------------------------------------------------------------------------
+// Compile-time gates.
+//
+// PTO2_PROFILING comes from profiling_config.h (default 1; a CCEC build passes
+// -DPTO2_PROFILING=0). It is pulled in transitively via pto_types.h above, which
+// is included before this point — so the gate below sees the real value.
+//
+// DIST_TRACE_ENABLED — swimlane tracing (per-task span capture + JSON dump).
+// Reuses the project's PTO2_PROFILING macro: sim builds pass PTO2_PROFILING=1, so
+// tracing is on there; an AICore/CCEC build that does not pass the macro gets
+// `#if (PTO2_PROFILING + 0)` == `#if 0`, so all tracing code (and its host-only
+// std::vector / std::chrono / clock_gettime / fprintf usage) is compiled out.
+// No #ifndef fallback on purpose: undefined ⇒ off.
+#define DIST_TRACE_ENABLED (PTO2_PROFILING + 0)
+
+// DIST_SIM_HOST_CLOCK — sim-only host facilities (steady_clock now_ns() and the
+// use_example_exec_time busy-wait kernel emulation). Unavailable under CCEC.
+#if defined(__CCE_AICORE__) || defined(__DAV_C220__) || defined(__CCE_KT_TEST__)
+#define DIST_SIM_HOST_CLOCK 0
+#else
+#define DIST_SIM_HOST_CLOCK 1
+#endif
+
+// Tracing needs the host wall clock (now_ns lives under DIST_SIM_HOST_CLOCK), so
+// the two gates cannot diverge into "trace on, host clock off". In practice both
+// are off together on a CCEC build; assert it so a stray -D combination fails loud.
+#if DIST_TRACE_ENABLED && !DIST_SIM_HOST_CLOCK
+#error "DIST_TRACE_ENABLED requires DIST_SIM_HOST_CLOCK (swimlane uses the host clock)"
+#endif
+
+namespace {
+
+// -----------------------------------------------------------------------------
+// Tunables. The completion-flag ring is sized to hold an entire run without
+// wrap (>= total tasks); the GM output heap is a BOUNDED RING reclaimed by the
+// completion frontier (M4, §9.5/§11.4) rather than a run-sized bump.
+// -----------------------------------------------------------------------------
+// Kept deliberately SMALL: the out-of-order window is num_cores * kPrivateSlots,
+// and this also caps how far a single core can run ahead of "ready-to-execute".
+// A large ring lets one fast core greedily claim a long run of consecutive tasks
+// and serialize them while other cores starve (load imbalance, docs §6.1). OoO
+// capacity should come from the core-count dimension, not a deep per-core ring.
+constexpr int32_t kPrivateSlots = 4;  // PRIVATE_TASK_SLOT_NUM (back-pressure cap)
+// Ring slots a core reserves for draining block.won deposits addressed to its
+// lane. Self-claimed tasks (consumers / single-core / own anchor subtask) may
+// only occupy kPrivateSlots - kWonReserve slots, so a follower can ALWAYS pull
+// and run an (immediately-ready) deposit even when its ring is otherwise full of
+// not-yet-ready consumers — breaking the consumer<->deposit priority inversion.
+constexpr int32_t kWonReserve = 2;
+constexpr int32_t kMaxFanin = 16;        // max distinct producers a task waits on
+constexpr int32_t kOutPoolSlots = 1024;  // per-core ring of materialized output Tensors
+constexpr int32_t kMapCap = 16384;       // per-core producer-map capacity (distinct regions)
+constexpr int32_t kFlagCap = 1 << 16;    // global completion-flag ring (>= total tasks)
+
+// M4 GM-heap reclamation (§9.5/§11.4).
+//   kHeapRingDefault — bounded physical heap ring (env PTO_DIST_HEAP_MB overrides,
+//     in MiB). The deterministic virtual bump is unbounded; physical address is
+//     (virtual_offset mod ring). A region is reused only after its previous
+//     occupant's task id <= R (the reclaim frontier), enforced by back-pressure.
+//   kHDefault — dependency-span bound H (env PTO_DIST_H overrides): every consumer
+//     of task N has id <= N + H. R = F - H. Must be >= the graph's true heap span
+//     or a producer region could be recycled while a late consumer still reads it
+//     (run-time-checked → fatal "heap span exceeded").
+constexpr size_t kHeapRingDefault = 64ull << 20;
+constexpr int32_t kHDefault = 64;
+
+// -----------------------------------------------------------------------------
+// Per-core producer map (the "full per-core duplicate TensorMap").
+//
+// A faithful, compact stand-in for PTO2TensorMap: keyed by GM byte range, it
+// records the most recent producer task id of each written region. INPUT/INOUT
+// fan-in resolves to the producer(s) whose region overlaps. Exact-region writes
+// (e.g. an INOUT accumulation chain) replace in place; new regions append.
+// Every core builds an identical map by replaying the same submit stream.
+// -----------------------------------------------------------------------------
+// Intrusive entry, modeled on PTO2TensorMapEntry (tensormap_and_ringbuffer) but
+// compact: it keys overlap on a byte range [lo, hi) instead of mirroring a full
+// Tensor cache line, since the distributed map only needs producer lookup.
+//   - bucket chain (doubly linked) — O(1) unlink during cleanup
+//   - task chain (singly linked)   — cleanup frees a retired task's entries by
+//                                     walking ITS chain, never scanning the pool
+struct MapEntry {
+    uint64_t buf_addr;       // Tensor.buffer.addr (GM buffer base, bytes) — hash key
+    uint64_t lo;             // byte offset of view origin within buffer
+    uint64_t hi;             // byte offset one-past the view extent
+    int32_t producer;        // task id that wrote this region
+    int32_t bucket;          // owning bucket index, or -1 when free
+    int32_t next_in_bucket;  // bucket-chain links (entry indices, -1 = none)
+    int32_t prev_in_bucket;
+    int32_t next_in_task;  // task-chain link (entry index, -1 = none)
+};
+
+// Hash buckets (power of 2). Hashing by buffer BASE address groups every
+// sub-region of one buffer into one chain; overlap is then tested per entry.
+constexpr int32_t kMapBuckets = 1 << 13;  // 8192
+constexpr int32_t kMapBucketShift = 13;   // log2(kMapBuckets)
+// Per-task entry-head window (power of 2). Task `id` parks its entries under
+// slot id & (kTaskWindow-1); the slot is recycled by id + kTaskWindow. cleanup
+// retires a task once it leaves the H span, so kTaskWindow MUST exceed H (with
+// margin) or a slot could be reused before its prior task is cleaned. Validated
+// against g_dist.H at register time.
+constexpr int32_t kTaskWindow = 1 << 10;  // 1024  (>> kHDefault = 64)
+constexpr int32_t kTaskWindowMask = kTaskWindow - 1;
+
+// Per-core producer map ("full per-core duplicate TensorMap"), a direct compact
+// port of tensormap_and_ringbuffer's PTO2TensorMap (hash table + bucket chains +
+// per-task entry tracking + free list + lazy invalidation + cleanup_retired).
+//
+// WHY (vs. the original O(count) linear array, which made submit O(N^2)):
+// bgemm writes hundreds of disjoint tiles of ONE flattened output buffer, so the
+// old `entries[count]` grew with the whole run and every lookup/insert rescanned
+// it. Following the proven runtime, we instead:
+//   * hash by buffer base + chain — distinct buffers cost O(1);
+//   * RETIRE by H window — an entry whose producer is older than `alive_floor`
+//     (= N - H) can never be a fan-in of any future task (a consumer of producer
+//     p has id <= p + H, §9.5/§11.4, the same bound under which p's GM heap region
+//     is recycled), so cleanup frees it. This bounds each chain to ~the live
+//     H-window instead of the entire run → O(N*H) ~ O(N).
+// Like the reference, insert ALWAYS links a fresh entry under its producer's task
+// chain (no in-place replace), so cleanup_retired can free a task's entries via
+// that chain without scanning; lookup returns the MAX (newest) overlapping
+// producer, which subsumes the old replace-in-place semantics.
+//
+// `alive_floor` is N-derived (deterministic, identical on every core), never
+// frontier-based (timing-dependent), so every per-core map — including the free
+// list and cleanup progress — evolves identically. Determinism is preserved.
+struct DistTensorMap {
+    MapEntry entries[kMapCap];
+    int32_t buckets[kMapBuckets];     // bucket head entry idx, or -1
+    int32_t task_heads[kTaskWindow];  // per-task entry-chain head idx, or -1
+    int32_t free_head;                // recycled-slot free list head, or -1
+    int32_t high_water;               // next never-used slot in `entries`
+    int32_t alive_floor;              // producer < alive_floor == retired
+    int32_t cleaned_upto;             // tasks < cleaned_upto already freed
+
+    void reset() {
+        free_head = -1;
+        high_water = 0;
+        alive_floor = 0;
+        cleaned_upto = 0;
+        for (int32_t i = 0; i < kMapBuckets; i++)
+            buckets[i] = -1;
+        for (int32_t i = 0; i < kTaskWindow; i++)
+            task_heads[i] = -1;
+    }
+
+    static uint32_t hash(uint64_t addr) {
+        addr *= 0x9E3779B97F4A7C15ULL;  // golden-ratio multiplicative mix
+        return static_cast<uint32_t>(addr >> (64 - kMapBucketShift));
+    }
+
+    static void byte_range(const Tensor &t, uint64_t &addr, uint64_t &lo, uint64_t &hi) {
+        const uint64_t esz = get_element_size(t.dtype);
+        addr = t.buffer.addr;
+        lo = t.start_offset * esz;
+        hi = (t.start_offset + t.extent_elem()) * esz;
+    }
+
+    int32_t alloc_slot() {
+        if (free_head >= 0) {
+            const int32_t s = free_head;
+            free_head = entries[s].next_in_bucket;
+            return s;
+        }
+        if (high_water < kMapCap) return high_water++;
+        return -1;  // pool exhausted (live H-window exceeds kMapCap)
+    }
+
+    // Unlink `idx` from its bucket chain (O(1) via prev) and push to the free list.
+    void free_entry(int32_t idx) {
+        MapEntry &e = entries[idx];
+        if (e.prev_in_bucket < 0) buckets[e.bucket] = e.next_in_bucket;
+        else entries[e.prev_in_bucket].next_in_bucket = e.next_in_bucket;
+        if (e.next_in_bucket >= 0) entries[e.next_in_bucket].prev_in_bucket = e.prev_in_bucket;
+        e.bucket = -1;
+        e.next_in_bucket = free_head;
+        free_head = idx;
+    }
+
+    // Free every entry produced by retired tasks [cleaned_upto, new_floor) by
+    // walking each task's own chain (never the whole pool). Mirrors PTO2TensorMap
+    // ::cleanup_retired. Advances alive_floor so lookups skip the freed window.
+    void advance_retire(int32_t N, int32_t H) {
+        const int32_t new_floor = N - H;
+        if (new_floor <= cleaned_upto) {  // nothing newly retired
+            if (new_floor > alive_floor) alive_floor = new_floor;
+            return;
+        }
+        for (int32_t id = cleaned_upto; id < new_floor; id++) {
+            int32_t cur = task_heads[id & kTaskWindowMask];
+            while (cur >= 0) {
+                const int32_t nxt = entries[cur].next_in_task;
+                debug_assert(entries[cur].producer == id);
+                free_entry(cur);
+                cur = nxt;
+            }
+            task_heads[id & kTaskWindowMask] = -1;
+        }
+        cleaned_upto = new_floor;
+        alive_floor = new_floor;
+    }
+
+    // Link a fresh entry for `producer`'s write of `t`'s region. Always a new
+    // entry (no in-place replace) so it parks under producer's task chain.
+    void insert(const Tensor &t, int32_t producer) {
+        uint64_t addr, lo, hi;
+        byte_range(t, addr, lo, hi);
+        const int32_t s = alloc_slot();
+        if (s < 0) return;  // pool full within the live window (should not happen)
+        const uint32_t b = hash(addr);
+        MapEntry &e = entries[s];
+        e.buf_addr = addr;
+        e.lo = lo;
+        e.hi = hi;
+        e.producer = producer;
+        e.bucket = static_cast<int32_t>(b);
+        // Insert at bucket head.
+        e.prev_in_bucket = -1;
+        e.next_in_bucket = buckets[b];
+        if (buckets[b] >= 0) entries[buckets[b]].prev_in_bucket = s;
+        buckets[b] = s;
+        // Insert at task-chain head.
+        const int32_t slot = producer & kTaskWindowMask;
+        e.next_in_task = task_heads[slot];
+        task_heads[slot] = s;
+    }
+
+    // Most-recent producer whose region overlaps `t`, or -1 if none. Entries
+    // below alive_floor are treated as already retired (skipped — defensive,
+    // since cleanup has usually freed them already).
+    int32_t lookup(const Tensor &t) const {
+        uint64_t addr, lo, hi;
+        byte_range(t, addr, lo, hi);
+        int32_t best = -1;
+        for (int32_t cur = buckets[hash(addr)]; cur >= 0; cur = entries[cur].next_in_bucket) {
+            const MapEntry &e = entries[cur];
+            if (e.producer < alive_floor) continue;
+            if (e.buf_addr == addr && lo < e.hi && e.lo < hi) {
+                if (e.producer > best) best = e.producer;
+            }
+        }
+        return best;
+    }
+};
+
+// -----------------------------------------------------------------------------
+// A private-ring slot: a fully materialized, self-contained task this core owns
+// and will execute itself. Holds its own copy of the argument Tensors so it can
+// be executed at any later point (deferred past further orchestration).
+// -----------------------------------------------------------------------------
+// One traced span on a core's timeline, recorded only when swimlane tracing is
+// on. `phase` distinguishes the orchestration stage so the exported lane shows
+// not just kernel execution but also the work between kernels (alloc, claim/
+// build, deposit drains). Laid out in the Chrome trace by physical block (pid)
+// and lane (tid).
+#if DIST_TRACE_ENABLED
+enum class TracePhase : int32_t {
+    Kernel = 0,    // incore kernel execution (or busy-wait replay)
+    Alloc = 1,     // dist_alloc_tensors body (materialize + reclaim back-pressure)
+    Build = 2,     // winner-only: fan-in resolution + built[] assembly (up to back-pressure)
+    DrainWon = 3,  // drain_block_won pulled+built a follower deposit
+    Replay = 4,    // submit replayed but claim LOST (per-core map/heap bookkeeping only)
+    RingBp = 5,    // winner spun on ring/heap back-pressure (waiting for a free slot / reclaim)
+    EfDrain = 6,   // execute-first drain at submit entry (deposits + ready owned tasks)
+    Commit = 7,    // winner-only: alloc ring/won slot + build_ring_slot (publish the task)
+};
+
+struct TraceEvent {
+    int32_t task_id;
+    int32_t func_id;  // kernel id (e.g. 0=GEMM, 1=ADD); -1 if unknown
+    int32_t lane;     // AIC=0 / AIV0=1 / AIV1=2
+    uint8_t multicore;
+    TracePhase phase;
+    // Raw nanosecond timestamps — NO unit conversion on the hot path. The dump
+    // stage divides by 1000 to emit microseconds (the swimlane unit).
+    uint64_t ts_ns;   // start, ns from g_trace_epoch (wall clock)
+    uint64_t dur_ns;  // span duration, ns (wall clock)
+    // CPU time this thread actually accrued during the span (CLOCK_THREAD_CPUTIME_ID).
+    // On an oversubscribed host dur_ns inflates while the thread is descheduled;
+    // cpu_ns does not, so a large dur_ns with small cpu_ns == "swapped out, not work".
+    // Only meaningful for non-kernel overhead spans (kernel spans set it to dur_ns).
+    uint64_t cpu_ns;
+};
+#endif  // DIST_TRACE_ENABLED
+
+struct RingSlot {
+    bool occupied;
+    // A slot can be reserved (occupied=true) before it is fully populated: the
+    // submit winner grabs a slot up front so concurrent drains do not reuse it,
+    // then may spin in block.won back-pressure (which itself drains Phase B)
+    // before calling build_ring_slot. `built` gates execution so Phase B never
+    // (re)runs a reserved-but-unbuilt slot still holding a prior occupant's
+    // task_id/fanin/won linkage. build_ring_slot sets it; execute_slot clears it.
+    bool built;
+    int32_t task_id;
+    int32_t func_id;  // kernel id of this slot's lane (swimlane label); -1 if none
+    uint64_t function_bin_addr;
+
+    int32_t tensor_count;
+    int32_t scalar_count;
+    Tensor tensors[MAX_TENSOR_ARGS];
+    uint64_t scalars[MAX_SCALAR_ARGS];
+
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS];
+    LocalContext local_ctx;
+    GlobalContext global_ctx;
+
+    int32_t fanin[kMaxFanin];
+    int32_t fanin_count;
+
+    // Multi-core (MIX / 2V) linkage. When is_multicore, the completion flag for
+    // task_id is owned jointly: each co-owner decrements block.won[won_slot].remaining
+    // after executing its own subtask, and the one driving it to zero publishes
+    // the single global task_completed_flag. Single-core tasks set the flag directly.
+    bool is_multicore;
+    int32_t won_block;
+    int32_t won_slot;
+};
+
+// -----------------------------------------------------------------------------
+// block.won — the id-keyed anchor→follower deposit table (block-shared, §3.1).
+// One BlockWon per physical block (1 AIC + 2 AIV). The anchor that wins a
+// multi-core task builds its OWN physical-lane subtask into its private ring and
+// deposits the remaining active-lane subtasks here; followers asynchronously
+// drain the entry addressed to their physical lane (no blocking, no per-walk
+// wait). Keyed by task id via per-slot task_id so concurrent multi-core tasks of
+// one block never alias. `remaining` = popcount(active_mask) drives the single
+// completion flag (§3.1). Lane index uses PTO2SubtaskSlot (AIC=0/AIV0=1/AIV1=2).
+// -----------------------------------------------------------------------------
+struct BuiltSubtask {
+    bool present;
+    int32_t func_id;  // kernel id of this lane's subtask (swimlane label); -1 if none
+    uint64_t function_bin_addr;
+    int32_t tensor_count;
+    int32_t scalar_count;
+    Tensor tensors[MAX_TENSOR_ARGS];
+    uint64_t scalars[MAX_SCALAR_ARGS];
+    int32_t fanin[kMaxFanin];
+    int32_t fanin_count;
+    int32_t sub_block_id;
+};
+
+struct WonSlot {
+    std::atomic<int32_t> state;  // 0=free, 1=published, 2=reserving
+    int32_t task_id;
+    std::atomic<int32_t> remaining;                         // co-owners (incl. anchor) left to finish
+    std::atomic<int32_t> drained[PTO2_SUBTASK_SLOT_COUNT];  // 0/1 per follower lane
+    BuiltSubtask lane[PTO2_SUBTASK_SLOT_COUNT];             // deposited follower subtasks
+};
+
+struct BlockWon {
+    WonSlot slots[kPrivateSlots];
+    // Monotone "has any anchor ever published a deposit into this block?" flag.
+    // Lets follower drains short-circuit the per-slot scan for workloads with no
+    // multi-core (e.g. 2V) tasks — the common case (bgemm is all single-core), so
+    // every AIV core skips a 4-slot won-scan on every submit. Never reset within a
+    // session; once true the scan path is taken (those workloads have real work).
+    std::atomic<int32_t> any_pub;
+};
+
+enum LaneId : int32_t { LANE_AIC = 0, LANE_AIV0 = 1, LANE_AIV1 = 2, LANE_NONE = -1 };
+
+#if DIST_TRACE_ENABLED
+// Swimlane tracing globals. Defined here (before DistCore) so DistCore::reset can
+// see g_trace_reserve; g_trace_on / g_trace_epoch_ns sit alongside for one place.
+//   g_trace_on      — set from PTO_DIST_SWIMLANE at register time; gates capture.
+//   g_trace_epoch_ns — run-start epoch so every core's span ts is relative to it.
+//   g_trace_reserve — per-core span reserve: 0 when off (reset never reserves, so
+//     a normal run pays nothing), else a generous upper bound on spans/core so
+//     push_back never reallocs mid-run (stable heap layout).
+bool g_trace_on = false;
+uint64_t g_trace_epoch_ns = 0;
+int32_t g_trace_reserve = 0;
+#endif
+
+struct CoreLayout {
+    int32_t block_id;  // physical block index
+    int32_t lane;      // LaneId of this core within its block
+};
+
+// -----------------------------------------------------------------------------
+// Per-core engine state (the SPMD worker context).
+// -----------------------------------------------------------------------------
+struct DistCore {
+    CoreType role;
+    int32_t core_idx;  // index into g_dist.cores[] (for trace ownership)
+    int32_t block_id;  // physical block this core belongs to
+    int32_t lane;      // LaneId within the block (AIC / AIV0 / AIV1)
+    int32_t sub_block_id;
+    int32_t local_index;  // next task id this core will see (== tasks replayed)
+    uint64_t heap_next;   // deterministic GM output-heap bump cursor (bytes)
+
+    DistTensorMap map;
+
+    RingSlot slots[kPrivateSlots];
+    int32_t occupied_count;
+    int32_t owned_total;  // tasks this core claimed+executed (debug)
+
+    Tensor outpool[kOutPoolSlots];
+    int32_t outpool_head;
+
+#if DIST_TRACE_ENABLED
+    // Per-core swimlane events (only populated when tracing is on). Owned solely
+    // by this core's worker thread, so push_back is lock-free.
+    std::vector<TraceEvent> trace;
+
+    // Running-cursor timestamps for lap-style tracing (see trace_lap). Each span is
+    // [trace_last_ns, now); after recording, the cursor advances to now, so the next
+    // span abuts this one with zero gap — the whole submit flow (incl. the orch
+    // round-trip between two submits) is covered by exactly one span each, no code
+    // path left un-timed. Reset at replay entry; wall + this-thread CPU clocks.
+    uint64_t trace_last_ns;
+    uint64_t trace_last_cpu;
+
+    // Per-core static dependency edges (tracing only): one per fan-in resolved at
+    // build time — {consumer_task, producer_task}. Dumped as Chrome-trace flow
+    // events (producer's span -> consumer's span) so the swimlane shows the full
+    // dependency graph; following the arrows hop-by-hop walks the chain "what is
+    // this task waiting on, and what is THAT waiting on". Recorded by whichever
+    // core builds the task, so every executed task contributes its in-edges.
+    struct DepEdge {
+        int32_t consumer_task;
+        int32_t producer_task;
+    };
+    std::vector<DepEdge> dep_edges;
+
+    // Per-core SLOT-RELEASE edges (tracing only): why a ringbp actually stalls.
+    // When task N's owner enters the ring back-pressure, it is waiting not on N's
+    // data producers but on the tasks ALREADY occupying its private ring to
+    // execute (free a slot). Snapshot those occupants ({waiter=N, occupant}).
+    // Dumped as flow events occupant-kernel -> ringbp: the occupant's execution is
+    // the release event that ends the wait. Chains with dep_edges: ringbp -> its
+    // ring occupant (slot edge) -> that occupant's data producers (dep edges).
+    std::vector<DepEdge> slot_edges;
+#endif  // DIST_TRACE_ENABLED
+
+    void reset(CoreType r, int32_t block, int32_t lane_id) {
+        role = r;
+        block_id = block;
+        lane = lane_id;
+        sub_block_id = (lane_id == LANE_AIV1) ? 1 : 0;
+        local_index = 0;
+        heap_next = 0;
+        map.reset();
+        occupied_count = 0;
+        owned_total = 0;
+        outpool_head = 0;
+        for (int32_t i = 0; i < kPrivateSlots; i++) {
+            slots[i].occupied = false;
+            slots[i].built = false;
+        }
+#if DIST_TRACE_ENABLED
+        trace_last_ns = 0;
+        trace_last_cpu = 0;
+        trace.clear();
+        // Pre-size the trace vector only when tracing is on (see g_trace_on),
+        // so push_back never reallocs mid-run (a realloc would perturb the heap
+        // layout — exactly the kind of disturbance that historically interacted
+        // badly with the sim; keep it stable). Costs nothing on a normal run.
+        if (g_trace_reserve > 0) trace.reserve(g_trace_reserve);
+        dep_edges.clear();
+        if (g_trace_reserve > 0) dep_edges.reserve(g_trace_reserve);
+        slot_edges.clear();
+        if (g_trace_reserve > 0) slot_edges.reserve(g_trace_reserve);
+#endif  // DIST_TRACE_ENABLED
+    }
+};
+
+// -----------------------------------------------------------------------------
+// Cursor sharding (docs §6.6). Each per-anchor-type claim cursor is split into
+// kCursorShards independent sub-cursors; task id N claims on shard (N %
+// kCursorShards). The shard is a pure function of N (identical on every core, no
+// worker partitioning), so the claim semantics are byte-for-byte equivalent to a
+// single cursor (exactly one owner per task, every core eligible) — sharding
+// ONLY spreads the CAS traffic across kCursorShards cache lines, cutting the
+// false-sharing / coherence contention that dominated us/task at high core
+// counts (§6.5). Each sub-cursor is padded to its own cache line so adjacent
+// shards never share a line; all entries init to -1 (no id claimed yet).
+constexpr int32_t kCursorShards = 4;
+constexpr size_t kCacheLine = 64;
+
+struct alignas(kCacheLine) PaddedCursor {
+    std::atomic<int32_t> v;
+    uint8_t pad[kCacheLine - sizeof(std::atomic<int32_t>)];
+};
+
+// -----------------------------------------------------------------------------
+// Global engine state (shared by all worker threads in this process). Cursors +
+// flags live here rather than in GM because in sim every core is a host thread
+// in one address space; the GM output heap below is a real shared buffer.
+// -----------------------------------------------------------------------------
+struct DistGlobal {
+    PaddedCursor cube_cursor[kCursorShards];    // highest claimed AIC-anchored id, per shard
+    PaddedCursor vector_cursor[kCursorShards];  // highest claimed AIV-only id, per shard
+    PaddedCursor alloc_cursor[kCursorShards];   // highest claimed kernel-less alloc id, per shard
+    std::atomic<uint8_t> flags[kFlagCap];       // completion-flag ring (1 == task done)
+
+    // M4 reclamation (§9.5/§11.4). `frontier` (F) is the global continuous
+    // completion frontier — the largest prefix s.t. every task id <= F is done;
+    // advanced cooperatively (CAS) by whichever core sets the flag that extends
+    // the prefix. `R = frontier - H` is the reclaim frontier. `vend[N]` is the
+    // cumulative virtual heap bytes through task N (deterministic & identical on
+    // every core), so any core can compute the live byte window [vend[R], top).
+    std::atomic<int32_t> frontier;
+    int32_t H;
+    std::atomic<uint64_t> vend[kFlagCap];
+
+    uint8_t *heap_base;
+    size_t heap_size;  // == bounded ring size
+
+    DistOrchFunc orch_func;
+    const L2TaskArgs *orch_args;
+    PTO2Runtime *rt;
+    Runtime *runtime;  // outer Runtime (for kernel-address resolution + done_count)
+
+    std::atomic<int32_t> fatal;
+
+    // Physical-block topology (1 AIC + 2 AIV per block), derived once at register
+    // time from Runtime::workers[].core_type, identical to the centralized
+    // scheduler's cluster discovery (AIC core b pairs with the 2b-th / (2b+1)-th
+    // AIV cores in worker-index order).
+    int32_t num_workers;
+    int32_t num_blocks;
+    CoreLayout layout[RUNTIME_MAX_WORKER];
+    BlockWon blocks[RUNTIME_MAX_WORKER];  // indexed by block_id (<= num AIC)
+
+    // Global "all cores finished orchestration replay" counter. A follower must
+    // not conclude "no more pushes are coming for my lane" until every core has
+    // finished replaying the submit stream (§7 tail-idle).
+    std::atomic<int32_t> replay_done;
+
+    // Startup barrier: every worker thread bumps this on entry and spins until it
+    // reaches num_workers before beginning replay. In sim each "core" is a host
+    // pthread that the OS schedules in one at a time (hundreds of µs apart on a
+    // busy box), so without this the first-claimed tasks start executing while
+    // later cores have not even been scheduled — the swimlane shows a long
+    // cold-start stagger that is host-scheduling noise, not engine behavior.
+    // Aligning the start makes the trace reflect steady-state contention.
+    std::atomic<int32_t> started_count;
+
+    DistCore cores[RUNTIME_MAX_WORKER];
+};
+
+DistGlobal g_dist;
+thread_local DistCore *g_self = nullptr;
+
+#if DIST_SIM_HOST_CLOCK
+// Orchestration/scheduling overhead isolation (set PTO_DIST_SKIP_EXEC=1). When
+// on, execute_slot skips the actual incore kernel call — every (sub)task is
+// treated as 0-cost and "completes" instantly — while ALL ownership/completion
+// bookkeeping runs unchanged, so the loop terminates identically. This lets a
+// benchmark measure the pure cost of on-core orchestration + claim race +
+// scheduling, independent of kernel work. Outputs are NOT computed (run with
+// golden checks disabled). See examples/.../runtime_overhead_test.
+bool g_skip_exec = false;
+
+inline uint64_t now_ns() {
+    return static_cast<uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now().time_since_epoch())
+            .count()
+    );
+}
+#endif  // DIST_SIM_HOST_CLOCK
+
+#if DIST_TRACE_ENABLED
+// Per-thread CPU time (excludes time the thread spends descheduled). Used only by
+// the swimlane to tell genuine work from host-oversubscription stalls, so it lives
+// under DIST_TRACE_ENABLED (not the sim-clock gate — busy-wait never needs it).
+inline uint64_t thread_cpu_ns() {
+    struct timespec ts;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000ull + static_cast<uint64_t>(ts.tv_nsec);
+}
+
+// Snapshot the clock only when tracing is on (callers pass the result as a span
+// start). Returns 0 otherwise so the matching trace_overhead() is a no-op.
+inline uint64_t trace_now() { return g_trace_on ? now_ns() : 0; }
+inline uint64_t trace_now_cpu() { return g_trace_on ? thread_cpu_ns() : 0; }
+
+// Record a non-kernel overhead span [t0_ns, now) on this core's lane. Stores RAW
+// nanoseconds (no unit conversion on the hot path — the dump stage divides by
+// 1000). cpu_ns is this thread's CPU time over the span (small cpu with large dur
+// == descheduled, not work). No-op unless tracing is on.
+inline void trace_overhead_impl(
+    DistCore *self, int32_t task_id, int32_t func_id, TracePhase phase, uint64_t t0_ns, uint64_t t0_cpu
+) {
+    if (!g_trace_on) return;
+    const uint64_t t1 = now_ns();
+    const uint64_t c1 = thread_cpu_ns();
+    self->trace.push_back(
+        TraceEvent{
+            task_id, func_id, self->lane, /*multicore=*/0, phase, t0_ns - g_trace_epoch_ns, t1 - t0_ns, c1 - t0_cpu
+        }
+    );
+}
+
+// Reset the lap cursor to "now" — call once at replay entry so the first lap span
+// measures from a well-defined origin (not from an uninitialized cursor).
+inline void trace_lap_reset_impl(DistCore *self) {
+    if (!g_trace_on) return;
+    self->trace_last_ns = now_ns();
+    self->trace_last_cpu = thread_cpu_ns();
+}
+
+// Lap-style span: record [trace_last_ns, now) then advance the cursor to now, so
+// the next lap continues seamlessly from here (same idiom as pto_orchestrator's
+// CYCLE_COUNT_LAP: acc += t1 - t0; t0 = t1). Every code path between two laps is
+// attributed to exactly one span — no gaps, no double-counting. Stores raw ns.
+inline void trace_lap_impl(DistCore *self, int32_t task_id, int32_t func_id, TracePhase phase) {
+    if (!g_trace_on) return;
+    const uint64_t t1 = now_ns();
+    const uint64_t c1 = thread_cpu_ns();
+    self->trace.push_back(
+        TraceEvent{
+            task_id, func_id, self->lane, /*multicore=*/0, phase, self->trace_last_ns - g_trace_epoch_ns,
+            t1 - self->trace_last_ns, c1 - self->trace_last_cpu
+        }
+    );
+    self->trace_last_ns = t1;
+    self->trace_last_cpu = c1;
+}
+
+// Trace call-site macros forward to the _impl inlines above; the #else branch below
+// expands them to nothing — so call sites need no #if, and the phase enum /
+// TraceEvent need not even exist when off (the preprocessor eats the whole argument
+// list, TracePhase::X included). Same idiom as pto_orchestrator's CYCLE_COUNT_LAP.
+#define TRACE_LAP(self, task_id, func_id, phase) trace_lap_impl((self), (task_id), (func_id), (phase))
+#define TRACE_LAP_RESET(self) trace_lap_reset_impl((self))
+#define TRACE_OVERHEAD(self, task_id, func_id, phase, t0_ns, t0_cpu) \
+    trace_overhead_impl((self), (task_id), (func_id), (phase), (t0_ns), (t0_cpu))
+#else  // !DIST_TRACE_ENABLED — tracing compiled out; call sites become no-ops.
+#define TRACE_LAP(self, task_id, func_id, phase) ((void)0)
+#define TRACE_LAP_RESET(self) ((void)0)
+#define TRACE_OVERHEAD(self, task_id, func_id, phase, t0_ns, t0_cpu) ((void)0)
+#endif  // DIST_TRACE_ENABLED
+
+// Opt-in per-core tracing (set PTO_DIST_TRACE=1). Off by default so a passing
+// run is quiet; fatal/error/heap-exhaustion diagnostics are always emitted.
+inline bool dist_trace() {
+    static const bool on = (getenv("PTO_DIST_TRACE") != nullptr);
+    return on;
+}
+
+// -----------------------------------------------------------------------------
+// Fatal / claim / execution helpers
+// -----------------------------------------------------------------------------
+inline bool fatal_set() { return g_dist.fatal.load(std::memory_order_acquire) != 0; }
+inline void set_fatal() { g_dist.fatal.store(1, std::memory_order_release); }
+
+void dist_dump_state(int);  // defined below; dumps full engine state for hangs
+
+// Env-gated stall watchdog (set PTO_DIST_WATCHDOG=<seconds>, default off). Called
+// from inside the engine's spin loops on a worker thread (so fprintf is safe,
+// unlike a signal handler). On the first call it records a start time; if a loop
+// keeps spinning past the budget the engine is presumed deadlocked, so it dumps
+// the full state once and sets fatal to unwind every core for a fast, diagnosed
+// failure instead of an indefinite hang.
+inline void watchdog(uint64_t &start_ns) {
+    static const long budget_s = []() -> long {
+        const char *e = getenv("PTO_DIST_WATCHDOG");
+        return e ? atol(e) : 0;
+    }();
+    if (budget_s <= 0) return;
+    const uint64_t now = static_cast<uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now().time_since_epoch())
+            .count()
+    );
+    if (start_ns == 0) {
+        start_ns = now;
+        return;
+    }
+    if (now - start_ns > static_cast<uint64_t>(budget_s) * 1000000000ull) {
+        static std::atomic<int32_t> dumped{0};
+        int32_t exp = 0;
+        if (dumped.compare_exchange_strong(exp, 1, std::memory_order_acq_rel)) {
+            fprintf(stderr, "[dist_engine] WATCHDOG fired after %lds — presumed deadlock, dumping state\n", budget_s);
+            dist_dump_state(0);
+        }
+        set_fatal();
+    }
+}
+
+// CAS-loop fetch_max (§11.1): returns true (WON) iff this core advanced the
+// cursor to N. No hardware fetch_max on the target, so this is the equivalent
+// acq-rel CAS retry. Monotonic: each task id is claimed by exactly one core and
+// no id is skipped within a cursor's subsequence.
+bool claim(std::atomic<int32_t> &cursor, int32_t N) {
+    int32_t c = cursor.load(std::memory_order_acquire);
+    while (true) {
+        if (N <= c) return false;
+        if (cursor.compare_exchange_weak(c, N, std::memory_order_acq_rel, std::memory_order_acquire)) return true;
+    }
+}
+
+// Cooperatively advance the global completion frontier F (§11.4): after any core
+// publishes flag(N), the contiguous-done prefix may have grown, so any core walks
+// F forward while flag(F+1) is set. Lock-free; the CAS makes exactly one core win
+// each step and the cost is amortized across all cores.
+void advance_frontier() {
+    int32_t f = g_dist.frontier.load(std::memory_order_acquire);
+    while (true) {
+        const int32_t next = f + 1;
+        if (next >= kFlagCap) break;
+        if (g_dist.flags[next & (kFlagCap - 1)].load(std::memory_order_acquire) == 0) break;
+        if (g_dist.frontier.compare_exchange_weak(f, next, std::memory_order_acq_rel, std::memory_order_acquire)) {
+            f = next;
+        }
+        // On CAS failure f was reloaded with the current value; retry.
+    }
+}
+
+// Resolve a kernel id to its executable address (CoreCallable::resolved_addr()).
+uint64_t resolve_kernel_addr(Runtime *runtime, int32_t kernel_id) {
+    if (kernel_id == INVALID_KERNEL_ID) return 0;
+    uint64_t callable_addr = runtime->get_function_bin_addr(kernel_id);
+    if (callable_addr == 0) return 0;
+    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
+    return callable->resolved_addr();
+}
+
+// Execute one owned task, then publish its completion flag (release). In sim all
+// cores share the address space, so the release/acquire pair is the visibility
+// barrier between the kernel's output writes and a consumer's input reads.
+void execute_slot([[maybe_unused]] DistCore *self, RingSlot &s) {
+    typedef void (*KernelFn)(int64_t *);
+#if DIST_SIM_HOST_CLOCK
+    // Sim-only trace-driven replay (CallConfig::use_example_exec_time): when the
+    // host filled example_exec_time_ns_[func_id] > 0 for this func, "execute" it
+    // by busy-waiting that many nanoseconds instead of calling the real kernel,
+    // so a fast sim run reflects measured on-hardware kernel durations. 320 host
+    // cores >> 72 workers, so the spin does not contend; funcs left at 0 fall
+    // through to the real call below. See Runtime::example_exec_time_ns_.
+    const Runtime *rt = g_dist.runtime;
+    const int32_t sim_ns =
+        (rt != nullptr && rt->use_example_exec_time_ && s.func_id >= 0 && s.func_id < RUNTIME_MAX_FUNC_ID) ?
+            rt->example_exec_time_ns_[s.func_id] :
+            0;
+    if (sim_ns > 0) {
+        const uint64_t t0 = now_ns();
+        const uint64_t target = t0 + static_cast<uint64_t>(sim_ns);
+        while (now_ns() < target) { /* spin: emulate kernel busy time */
+        }
+#if DIST_TRACE_ENABLED
+        if (g_trace_on) {
+            self->trace.push_back(
+                TraceEvent{
+                    s.task_id, s.func_id, self->lane, static_cast<uint8_t>(s.is_multicore ? 1 : 0), TracePhase::Kernel,
+                    t0 - g_trace_epoch_ns, static_cast<uint64_t>(sim_ns), static_cast<uint64_t>(sim_ns)
+                }
+            );
+        }
+#endif
+    } else if (s.function_bin_addr != 0 && !g_skip_exec) {
+        // PTO_DIST_SKIP_EXEC: treat the incore task as 0-cost — skip the kernel call
+        // but keep every flag/frontier/slot update below so termination is identical.
+        KernelFn fn = reinterpret_cast<KernelFn>(s.function_bin_addr);
+#if DIST_TRACE_ENABLED
+        if (g_trace_on) {
+            const uint64_t t0 = now_ns();
+            fn(reinterpret_cast<int64_t *>(s.args));
+            const uint64_t t1 = now_ns();
+            self->trace.push_back(
+                TraceEvent{
+                    s.task_id, s.func_id, self->lane, static_cast<uint8_t>(s.is_multicore ? 1 : 0), TracePhase::Kernel,
+                    t0 - g_trace_epoch_ns, t1 - t0, t1 - t0
+                }
+            );
+        } else {
+            fn(reinterpret_cast<int64_t *>(s.args));
+        }
+#else
+        fn(reinterpret_cast<int64_t *>(s.args));
+#endif
+    }
+#else   // !DIST_SIM_HOST_CLOCK — AICore/CCEC: no host clock, no busy-wait emulation.
+    if (s.function_bin_addr != 0) {
+        KernelFn fn = reinterpret_cast<KernelFn>(s.function_bin_addr);
+        fn(reinterpret_cast<int64_t *>(s.args));
+    }
+#endif  // DIST_SIM_HOST_CLOCK
+    if (s.is_multicore) {
+        // Joint ownership: the co-owner that drives remaining to zero (the last
+        // subtask to finish) publishes the single global completion flag (§3.1),
+        // then frees the block.won entry for reuse.
+        WonSlot &w = g_dist.blocks[s.won_block].slots[s.won_slot];
+        if (w.remaining.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+            g_dist.flags[s.task_id & (kFlagCap - 1)].store(1, std::memory_order_release);
+            w.state.store(0, std::memory_order_release);  // recycle the id-keyed slot
+            advance_frontier();
+        }
+    } else {
+        g_dist.flags[s.task_id & (kFlagCap - 1)].store(1, std::memory_order_release);
+        advance_frontier();
+    }
+    s.built = false;
+    s.occupied = false;
+}
+
+// Phase B: execute every ready owned task in the private ring. A task is ready
+// once all its fan-in producers have set their completion flag (acquire).
+// Returns the number of slots freed this pass.
+int32_t drain_phase_b(DistCore *self) {
+    // Fast path: an empty private ring has nothing to drain. Skips the per-slot
+    // scan on every submit point (called twice per task, on every core) when the
+    // ring is empty — the common case for fine-grained / skip-exec workloads.
+    // Behavior-identical: the loop below is a no-op when occupied_count == 0.
+    if (self->occupied_count == 0) return 0;
+    int32_t freed = 0;
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        RingSlot &s = self->slots[i];
+        if (!s.occupied || !s.built) continue;  // skip reserved-but-unbuilt slots
+        bool ready = true;
+        for (int32_t f = 0; f < s.fanin_count; f++) {
+            if (g_dist.flags[s.fanin[f] & (kFlagCap - 1)].load(std::memory_order_acquire) == 0) {
+                ready = false;
+                break;
+            }
+        }
+        if (!ready) continue;
+        execute_slot(self, s);
+        self->occupied_count--;
+        freed++;
+    }
+    return freed;
+}
+
+int32_t alloc_ring_slot(DistCore *self) {
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        if (!self->slots[i].occupied) return i;
+    }
+    return -1;
+}
+
+// Kernel id for a physical lane (AIC/AIV0/AIV1) of a MixedKernels.
+inline int32_t kernel_id_for_lane(const MixedKernels &mixed, int32_t lane) {
+    switch (lane) {
+    case LANE_AIC:
+        return mixed.aic_kernel_id;
+    case LANE_AIV0:
+        return mixed.aiv0_kernel_id;
+    case LANE_AIV1:
+        return mixed.aiv1_kernel_id;
+    default:
+        return INVALID_KERNEL_ID;
+    }
+}
+
+inline bool lane_active(const ActiveMask &M, int32_t lane) {
+    return M.subtask_active(static_cast<PTO2SubtaskSlot>(lane));
+}
+
+// Materialize a private-ring slot from already-resolved components (shared by the
+// owner build path and the follower drain path). `tensors`/`scalars` are copied
+// in; args[] is (re)built to point at this slot's own copies so the slot is
+// self-contained and executable at any later time.
+void build_ring_slot(
+    RingSlot &s, int32_t task_id, int32_t func_id, uint64_t fn_addr, const Tensor *tensors, int32_t tc,
+    const uint64_t *scalars, int32_t sc, const int32_t *fanin, int32_t fc, int32_t sub_block_id, bool is_multicore,
+    int32_t won_block, int32_t won_slot
+) {
+    s.occupied = true;
+    s.task_id = task_id;
+    s.func_id = func_id;
+    s.function_bin_addr = fn_addr;
+    s.built = true;  // fully populated below — now safe for Phase B to execute
+    s.tensor_count = tc;
+    s.scalar_count = sc;
+    for (int32_t i = 0; i < tc; i++)
+        s.tensors[i].copy(tensors[i]);
+    for (int32_t j = 0; j < sc; j++)
+        s.scalars[j] = scalars[j];
+    int32_t n = 0;
+    for (int32_t i = 0; i < tc; i++)
+        s.args[n++] = reinterpret_cast<uint64_t>(&s.tensors[i]);
+    for (int32_t j = 0; j < sc; j++)
+        s.args[n++] = s.scalars[j];
+    s.local_ctx.block_idx = 0;
+    s.local_ctx.block_num = 1;
+    s.local_ctx.async_ctx = AsyncCtx{};
+    s.global_ctx.sub_block_id = sub_block_id;
+    s.args[SPMD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&s.local_ctx);
+    s.args[SPMD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&s.global_ctx);
+    s.fanin_count = fc;
+    for (int32_t k = 0; k < fc; k++)
+        s.fanin[k] = fanin[k];
+    s.is_multicore = is_multicore;
+    s.won_block = won_block;
+    s.won_slot = won_slot;
+}
+
+// Reserve a free block.won slot in `block`. Returns slot index or -1 if full.
+// 2V allows either AIV of the block to be an anchor, so allocation must be atomic.
+int32_t alloc_won_slot(int32_t block) {
+    BlockWon &bw = g_dist.blocks[block];
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        int32_t exp = 0;
+        if (bw.slots[i].state.compare_exchange_strong(exp, 2, std::memory_order_acq_rel, std::memory_order_relaxed)) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+// True if a published block.won deposit for this core's lane has not yet been
+// taken — used by the termination check to avoid finishing before draining.
+bool has_pending_won(DistCore *self) {
+    if (self->lane == LANE_AIC || self->lane == LANE_NONE) return false;
+    BlockWon &bw = g_dist.blocks[self->block_id];
+    if (bw.any_pub.load(std::memory_order_acquire) == 0) return false;  // no deposit ever published
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        WonSlot &w = bw.slots[i];
+        if (w.state.load(std::memory_order_acquire) != 1) continue;
+        if (w.lane[self->lane].present && w.drained[self->lane].load(std::memory_order_acquire) == 0) return true;
+    }
+    return false;
+}
+
+// Follower drain (§3.1, §6): pull every published block.won subtask addressed to
+// this core's physical lane that we have not yet taken, building each into a free
+// private-ring slot (back-pressure: stop when the ring is full). Non-blocking —
+// if nothing is addressed to us we simply return.
+void drain_block_won(DistCore *self) {
+    if (self->lane == LANE_AIC || self->lane == LANE_NONE) return;  // AIC is never a follower
+    BlockWon &bw = g_dist.blocks[self->block_id];
+    // Fast path: if no anchor has ever published a deposit into this block, there
+    // is nothing to drain — skip the per-slot scan on every submit (hot path).
+    if (bw.any_pub.load(std::memory_order_acquire) == 0) return;
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        WonSlot &w = bw.slots[i];
+        if (w.state.load(std::memory_order_acquire) != 1) continue;
+        if (!w.lane[self->lane].present) continue;
+        int32_t exp = 0;
+        if (!w.drained[self->lane].compare_exchange_strong(
+                exp, 1, std::memory_order_acq_rel, std::memory_order_relaxed
+            ))
+            continue;  // already taken by us on a prior pass
+        int32_t si = alloc_ring_slot(self);
+        if (si < 0) {
+            // Ring full: hand the deposit back and let Phase B free a slot first.
+            w.drained[self->lane].store(0, std::memory_order_release);
+            return;
+        }
+        const BuiltSubtask &b = w.lane[self->lane];
+#if DIST_TRACE_ENABLED
+        const uint64_t t_won0 = trace_now();
+        const uint64_t t_won0_cpu = trace_now_cpu();
+#endif
+        build_ring_slot(
+            self->slots[si], w.task_id, b.func_id, b.function_bin_addr, b.tensors, b.tensor_count, b.scalars,
+            b.scalar_count, b.fanin, b.fanin_count, b.sub_block_id, /*is_multicore=*/true, self->block_id, i
+        );
+        self->occupied_count++;
+        self->owned_total++;
+#if DIST_TRACE_ENABLED
+        if (g_trace_on) {
+            for (int32_t k = 0; k < b.fanin_count; k++)
+                self->dep_edges.push_back({w.task_id, b.fanin[k]});
+        }
+        trace_overhead_impl(self, w.task_id, b.func_id, TracePhase::DrainWon, t_won0, t_won0_cpu);
+#endif
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Distributed submit op (replaces the centralized orchestrator submit).
+//
+// Every core runs this for every task (identical replay): materialize outputs
+// at deterministic heap addresses, maintain the per-core producer map, then
+// race to claim ownership. Only the winner builds the task into its private
+// ring; losers return with map + outputs updated so downstream get_ref() and
+// fan-in resolution stay consistent across cores.
+// -----------------------------------------------------------------------------
+TaskOutputTensors dist_submit_impl(PTO2Runtime *, const MixedKernels &mixed, const L0TaskArgs &args) {
+    DistCore *self = g_self;
+    if (self == nullptr) return TaskOutputTensors{};
+    Runtime *runtime = g_dist.runtime;
+
+    // EXECUTE-FIRST (docs §6 step 0+1, §6.1): before claiming this task, pull any
+    // follower deposits and execute every ready owned task. This interleaves
+    // execution with claiming so a fast core does not burst-claim a full ring of
+    // consecutive tasks; while it executes a (long) task other cores advance the
+    // cursor and claim subsequent ones. The deterministic replay below (id bump,
+    // heap bump, map maintenance) is unaffected — draining only runs/flags tasks
+    // this core already owns. Every core does this on every submit point.
+    //
+    // Reset the lap cursor at entry so the runtime's spans never absorb the orch
+    // round-trip between two submits — that time is USER orchestration code, not
+    // runtime work, and would bias EfDrain if counted here. It is left un-timed on
+    // purpose (a deliberate gap between submits, not a runtime span).
+    TRACE_LAP_RESET(self);
+    if (!fatal_set()) {
+        drain_block_won(self);
+        drain_phase_b(self);
+    }
+    // Lap: the execute-first drain itself (deposits + ready owned kernels it ran).
+    // Kernels show separately on the kernel sub-lane; this is the drain's own scan.
+    TRACE_LAP(self, self->local_index, -1, TracePhase::EfDrain);
+
+    const int32_t N = self->local_index++;
+    const ActiveMask M = mixed.to_active_mask();
+    const int32_t tc = args.tensor_count();
+    if (N >= kFlagCap) {  // flag ring + vend[] are non-windowed; cap total tasks
+        set_fatal();
+        fprintf(
+            stderr, "[dist_engine] task id %d exceeds kFlagCap %d (enlarge or window the flag/vend rings)\n", N,
+            kFlagCap
+        );
+        return TaskOutputTensors{};
+    }
+
+    // (a) Deterministic GM output-heap allocation + materialization (§9.3, §11.4).
+    // The virtual bump `heap_next` is unbounded and identical on every core; the
+    // PHYSICAL address is (virtual mod ring). First sum this task's aligned output
+    // bytes so we can keep the whole task within one ring lap: if it would straddle
+    // the ring end, pad the virtual base up to the next ring boundary (deterministic
+    // → every core agrees). A single task larger than the ring is unsatisfiable.
+    const size_t ring = g_dist.heap_size;
+    uint64_t total = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        total += PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+    }
+    uint64_t task_base = PTO2_ALIGN_UP(self->heap_next, PTO2_PACKED_OUTPUT_ALIGN);
+    if (total > 0 && g_dist.heap_base != nullptr) {
+        if (total > ring) {
+            set_fatal();
+            fprintf(
+                stderr, "[dist_engine] task %d outputs %llu B exceed heap ring %zu B (enlarge PTO_DIST_HEAP_MB)\n", N,
+                (unsigned long long)total, ring
+            );
+            return TaskOutputTensors{};
+        }
+        if ((task_base % ring) + total > ring) {
+            task_base = ((task_base / ring) + 1) * ring;  // skip the ring tail; start next lap
+        }
+    }
+    uint64_t off = 0;
+    TaskOutputTensors result;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        const TensorCreateInfo &ci = args.tensor(i).create_info();
+        const uint64_t logical = ci.buffer_size_bytes();
+        const uint64_t sz = PTO2_ALIGN_UP(logical, PTO2_PACKED_OUTPUT_ALIGN);
+        if (g_dist.heap_base == nullptr) {
+            set_fatal();
+            fprintf(stderr, "[dist_engine] GM output heap not allocated at task %d\n", N);
+            return result;
+        }
+        const uint64_t phys = (task_base + off) % ring;  // straddle-pad guarantees phys+logical <= ring
+        Tensor &slot_t = self->outpool[self->outpool_head];
+        self->outpool_head = (self->outpool_head + 1) % kOutPoolSlots;
+        init_tensor_from_create_info(slot_t, ci, g_dist.heap_base + phys, logical);
+        result.materialize_output(slot_t);
+        off += sz;
+    }
+    self->heap_next = task_base + off;
+    // Publish cumulative virtual bytes through task N so any core can derive the
+    // live window [vend[R], heap_next) for reclaim back-pressure. Deterministic, so
+    // all cores store the same value (this core also reads its own writes for R<N).
+    if (N >= 0 && N < kFlagCap) g_dist.vend[N].store(self->heap_next, std::memory_order_relaxed);
+
+    // Once fatal, stop claiming/executing but keep replaying the deterministic
+    // allocation above so this task's `result` carries valid (materialized) output
+    // refs — the orchestration may still call get_ref() on them. This degrades a
+    // fatal (e.g. heap-too-small) into a clean wrong-answer failure + diagnostic
+    // rather than an assertion crash mid-replay.
+    if (fatal_set()) return result;
+
+    // Retire producer-map entries that have left the H span (deterministic,
+    // N-derived) before this task's lookups/inserts. Bounds chain length so
+    // submit stays ~O(N) instead of O(N^2). See DistTensorMap.
+    self->map.advance_retire(N, g_dist.H);
+
+    // (b) Anchor type + claim race FIRST — resolved from the mask alone (no map
+    // ops, no Tensor copies). Deciding the winner up front lets the ~2/3 of cores
+    // that fail type_match / lose the race SKIP the fan-in lookup below; they only
+    // still perform the unconditional output insert (so every core's duplicate
+    // TensorMap stays identical — §4). Competition is by anchor TYPE (§2/§3.1):
+    // cube tasks (any AIC subtask) contested by AIC cores; vector tasks (AIV-only,
+    // incl. 2V) by AIV cores. The cursor CAS touches no map state, so doing it
+    // before the insert below does not affect the deterministic map replay.
+    const uint8_t cmask = M.core_mask();
+    const int32_t pc = __builtin_popcount(cmask);
+    const bool has_aic = (cmask & PTO2_SUBTASK_MASK_AIC) != 0;
+    const bool anchor_is_cube = has_aic;
+    const bool type_match = anchor_is_cube ? (self->role == CoreType::AIC) : (self->role == CoreType::AIV);
+    bool is_winner = false;
+    if (type_match) {
+        // Pick the shard for this task (§6.6): shard = N % kCursorShards, a pure
+        // function of the task id so every core targets the same sub-cursor for N.
+        PaddedCursor *cursors = anchor_is_cube ? g_dist.cube_cursor : g_dist.vector_cursor;
+        std::atomic<int32_t> &cursor = cursors[N % kCursorShards].v;
+        is_winner = claim(cursor, N);
+    }
+
+    // (c) Fan-in resolution — WINNER ONLY. Look up producers of INPUT/INOUT regions
+    // BEFORE this task registers its own writes (so an INOUT does not self-match).
+    // Losers never consume fanin, so they skip these lookups entirely; correctness
+    // is unaffected because the map state read here is identical on every core and
+    // only the owner needs the result.
+    int32_t fanin[kMaxFanin];
+    int32_t fc = 0;
+    if (is_winner) {
+        for (int32_t i = 0; i < tc; i++) {
+            const TensorArgType tag = args.tag(i);
+            if (tag != TensorArgType::INPUT && tag != TensorArgType::INOUT) continue;
+            const Tensor &t = args.tensor(i).ref();
+            if (t.manual_dep) continue;
+            const int32_t p = self->map.lookup(t);
+            if (p < 0) continue;
+            bool dup = false;
+            for (int32_t k = 0; k < fc; k++)
+                if (fanin[k] == p) {
+                    dup = true;
+                    break;
+                }
+            if (!dup && fc < kMaxFanin) fanin[fc++] = p;
+        }
+    }
+
+    // (d) Register this task as the producer of its OUTPUT / INOUT / existing
+    // outputs — UNCONDITIONAL (every core, so all duplicate maps stay identical).
+    uint32_t out_idx = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        const TensorArgType tag = args.tag(i);
+        if (tag == TensorArgType::OUTPUT) {
+            self->map.insert(result.get_ref(out_idx), N);
+            out_idx++;
+        } else if (tag == TensorArgType::INOUT || tag == TensorArgType::OUTPUT_EXISTING) {
+            self->map.insert(args.tensor(i).ref(), N);
+        }
+    }
+
+    if (!is_winner) {
+        TRACE_LAP(self, N, -1, TracePhase::Replay);
+        return result;  // wrong type or lost the race: map updated, nothing to build
+    }
+
+    // (e) Winner only: assemble the shared argument Tensors (identical for every
+    // active lane of a multi-core task — they share the task tensors, each lane
+    // writing its designated output per the kernels). Inputs are copied from the
+    // args; outputs are the materialized heap-addressed descriptors. Done AFTER
+    // the claim so the ~2/3 of cores that fail type_match / lose the race never
+    // pay these tc x sizeof(Tensor) copies.
+    const uint64_t *scalars = args.scalars();
+    const int32_t sc = args.scalar_count();
+    Tensor built[MAX_TENSOR_ARGS];
+    {
+        uint32_t bo = 0;
+        for (int32_t i = 0; i < tc; i++) {
+            if (args.tag(i) == TensorArgType::OUTPUT) {
+                built[i].copy(result.get_ref(bo));
+                bo++;
+            } else {
+                built[i].copy(args.tensor(i).ref());
+            }
+        }
+    }
+
+    // ---- Winner = owner (single-core) / anchor (multi-core). ----
+    // The real per-task build work (claim + fan-in lookup + built[] assembly)
+    // ends here; the two back-pressure spins below are WAITING, not work, so
+    // close the Build span now and time the spins separately as RingBp. Without
+    // this split the spin time was misattributed to "build" (it dominated build
+    // under a small ring / few blocks — it is dependency/slot wait, not cost).
+    TRACE_LAP(self, N, -1, TracePhase::Build);
+
+    // Back-pressure for self-claimed work: wait until the ring has a non-reserved
+    // slot free, draining block.won deposits + ready tasks meanwhile. The reserve
+    // guarantees a follower can still pull its (ready) deposits when the rest of
+    // the ring is full of not-yet-ready consumers (no priority inversion).
+    uint64_t wd_self = 0;
+#if DIST_TRACE_ENABLED
+    // Swimlane (slot-release edges): if we are about to actually wait, snapshot the
+    // tasks currently occupying our ring — those are what must execute to free a
+    // slot, i.e. what this ringbp truly waits on. The ring only shrinks during the
+    // wait, so the entry snapshot is the complete set.
+    if (g_trace_on && self->occupied_count >= kPrivateSlots - kWonReserve) {
+        for (int32_t i = 0; i < kPrivateSlots; i++) {
+            const RingSlot &rs = self->slots[i];
+            if (rs.occupied && rs.built) self->slot_edges.push_back({N, rs.task_id});
+        }
+    }
+#endif
+    while (self->occupied_count >= kPrivateSlots - kWonReserve && !fatal_set()) {
+        drain_block_won(self);
+        if (drain_phase_b(self) == 0) {
+            SPIN_WAIT_HINT();
+            watchdog(wd_self);
+        }
+    }
+    if (fatal_set()) return result;
+
+    // Heap reclaim back-pressure (§9.5/§11.4): this owner is about to build (and
+    // later write) task N's outputs at deterministic physical offsets. Recycling a
+    // ring region is safe only once its previous occupant's task id <= R = F - H
+    // (all that occupant's consumers, which have id <= occupant+H, are done). The
+    // equivalent global-derivable test is: the live virtual window (heap_next minus
+    // vend[R]) must fit in the ring. Spin (draining + advancing F) until it does.
+    if (g_dist.heap_base != nullptr) {
+        const size_t ring = g_dist.heap_size;
+        uint64_t wd_heap = 0;
+        while (!fatal_set()) {
+            const int32_t f = g_dist.frontier.load(std::memory_order_acquire);
+            const int32_t R = f - g_dist.H;
+            const uint64_t vstart_live = (R < 0) ? 0 : g_dist.vend[R].load(std::memory_order_relaxed);
+            if (self->heap_next - vstart_live <= ring) break;  // window fits — region free
+            if (f >= N - 1) {  // every predecessor done yet H-window still overflows the ring
+                set_fatal();
+                fprintf(
+                    stderr,
+                    "[dist_engine] heap ring %zu B too small for H=%d window at task %d (live=%llu B); "
+                    "enlarge PTO_DIST_HEAP_MB or reduce PTO_DIST_H\n",
+                    ring, g_dist.H, N, (unsigned long long)(self->heap_next - vstart_live)
+                );
+                return result;
+            }
+            drain_block_won(self);
+            if (drain_phase_b(self) == 0) {
+                SPIN_WAIT_HINT();
+                watchdog(wd_heap);
+            }
+        }
+        if (fatal_set()) return result;
+    }
+    // Time spent in the two back-pressure spins above (ring-slot wait + heap
+    // reclaim wait) — dependency/slot WAITING, kept separate from Build.
+    TRACE_LAP(self, N, -1, TracePhase::RingBp);
+
+    int32_t si = alloc_ring_slot(self);
+    if (si < 0) {  // should not happen given the back-pressure gate above
+        set_fatal();
+        fprintf(stderr, "[dist_engine] no free private-ring slot after back-pressure at task %d\n", N);
+        return result;
+    }
+    // Reserve so concurrent drains (including the block.won back-pressure loop
+    // below, which calls drain_phase_b) do not reuse this slot. Mark it unbuilt
+    // so Phase B skips it until build_ring_slot populates it (avoids re-executing
+    // the prior occupant's stale task_id/fanin/won linkage).
+    self->slots[si].occupied = true;
+    self->slots[si].built = false;
+
+    int32_t own_lane;
+    int32_t won_block = -1;
+    int32_t won_slot = -1;
+    bool is_multicore = (pc > 1);
+
+    if (!is_multicore) {
+        // Single core (1C / 1V): the one active lane is the only subtask. For 1V
+        // the winner may be physically AIV0 or AIV1, but the active lane/kernel is
+        // AIV0 (rt_submit_aiv fills aiv0). Find the single active lane.
+        own_lane = has_aic ? LANE_AIC : LANE_AIV0;
+    } else {
+        // Multi-core (MIX / 2V): we are the anchor. Our own physical lane subtask
+        // goes to our private ring; the remaining active lanes are deposited into
+        // block.won for our same-block followers to drain (§3.1).
+        own_lane = self->lane;
+        won_block = self->block_id;
+        won_slot = alloc_won_slot(won_block);
+        uint64_t wd_won = 0;
+        while (won_slot < 0 && !fatal_set()) {  // block.won full → back-pressure (drain, then retry)
+            drain_block_won(self);
+            if (drain_phase_b(self) == 0) {
+                SPIN_WAIT_HINT();
+                watchdog(wd_won);
+            }
+            won_slot = alloc_won_slot(won_block);
+        }
+        if (fatal_set()) return result;
+        WonSlot &w = g_dist.blocks[won_block].slots[won_slot];
+        w.task_id = N;
+        w.remaining.store(pc, std::memory_order_relaxed);
+        for (int32_t L = 0; L < PTO2_SUBTASK_SLOT_COUNT; L++) {
+            w.drained[L].store(0, std::memory_order_relaxed);
+            w.lane[L].present = false;
+        }
+        for (int32_t L = 0; L < PTO2_SUBTASK_SLOT_COUNT; L++) {
+            if (L == own_lane || !lane_active(M, L)) continue;
+            BuiltSubtask &b = w.lane[L];
+            b.present = true;
+            b.func_id = kernel_id_for_lane(mixed, L);
+            b.function_bin_addr = resolve_kernel_addr(runtime, kernel_id_for_lane(mixed, L));
+            b.tensor_count = tc;
+            b.scalar_count = sc;
+            for (int32_t i = 0; i < tc; i++)
+                b.tensors[i].copy(built[i]);
+            for (int32_t j = 0; j < sc; j++)
+                b.scalars[j] = scalars[j];
+            b.fanin_count = fc;
+            for (int32_t k = 0; k < fc; k++)
+                b.fanin[k] = fanin[k];
+            b.sub_block_id = (L == LANE_AIV1) ? 1 : 0;
+        }
+        std::atomic_thread_fence(std::memory_order_release);
+        g_dist.blocks[won_block].any_pub.store(1, std::memory_order_release);  // enable follower drains
+        w.state.store(1, std::memory_order_release);                           // publish the deposits to followers
+    }
+
+    const int32_t own_sub_block = (own_lane == LANE_AIV1) ? 1 : 0;
+    const int32_t own_func_id = kernel_id_for_lane(mixed, own_lane);
+    build_ring_slot(
+        self->slots[si], N, own_func_id, resolve_kernel_addr(runtime, own_func_id), built, tc, scalars, sc, fanin, fc,
+        own_sub_block, is_multicore, won_block, won_slot
+    );
+    self->occupied_count++;
+    self->owned_total++;
+
+#if DIST_TRACE_ENABLED
+    if (g_trace_on) {
+        for (int32_t k = 0; k < fc; k++)
+            self->dep_edges.push_back({N, fanin[k]});
+    }
+#endif
+    TRACE_LAP(self, N, -1, TracePhase::Commit);
+    return result;
+}
+
+// -----------------------------------------------------------------------------
+// Remaining ops — minimal stubs (bgemm exercises submit/scope/log only).
+// -----------------------------------------------------------------------------
+void dist_scope_begin(PTO2Runtime *) {}
+void dist_scope_end(PTO2Runtime *) {}
+void dist_orchestration_done(PTO2Runtime *) {}
+bool dist_is_fatal(PTO2Runtime *) { return fatal_set(); }
+
+void dist_report_fatal(PTO2Runtime *, int32_t code, const char *func, const char *fmt, ...) {
+    set_fatal();
+    va_list ap;
+    va_start(ap, fmt);
+    fprintf(stderr, "[dist_engine][FATAL][%s] code=%d: ", func ? func : "?", code);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\n");
+    va_end(ap);
+}
+
+void dist_log_error(const char *func, const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    fprintf(stderr, "[dist_engine][E][%s] ", func ? func : "?");
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\n");
+    va_end(ap);
+}
+void dist_log_warn(const char *, const char *, ...) {}
+void dist_log_debug(const char *, const char *, ...) {}
+void dist_log_info_v(const char *, int, const char *, ...) {}
+
+// Orchestration-side tensor data access (get/set_tensor_data). Replay runs on the
+// AICore worker and reads/writes real GM, so these are genuine memory accesses.
+// The only subtlety is read-after-write across tasks: if the region has a producer
+// in this core's map, wait until that producer's completion flag is set (draining
+// this core's own ring meanwhile so an owned producer actually runs). External
+// tensors (no producer) are accessed immediately. Consumer (WAR) tracking is not
+// modeled, mirroring the centralized runtime's documented INPUT-reader limitation.
+void wait_producer_ready(DistCore *self, const Tensor &t) {
+    // Cold path (get/set_tensor_data); uses the map's current alive_floor.
+    const int32_t p = self->map.lookup(t);
+    if (p < 0) return;
+    uint64_t wd = 0;
+    while (!fatal_set()) {
+        if (g_dist.flags[p & (kFlagCap - 1)].load(std::memory_order_acquire) != 0) break;
+        drain_block_won(self);
+        if (drain_phase_b(self) == 0) {
+            SPIN_WAIT_HINT();
+            watchdog(wd);
+        }
+    }
+}
+
+uint64_t dist_get_tensor_data(PTO2Runtime *, const Tensor &tensor, uint32_t ndims, const uint32_t *indices) {
+    if (tensor.buffer.addr == 0) return 0;
+    DistCore *self = g_self;
+    if (self != nullptr) wait_producer_ready(self, tensor);
+    const uint64_t flat = tensor.compute_flat_offset(indices, ndims);
+    const uint64_t esz = get_element_size(tensor.dtype);
+    uint64_t result = 0;
+    memcpy(&result, reinterpret_cast<const void *>(tensor.buffer.addr + flat * esz), esz);
+    return result;
+}
+
+void dist_set_tensor_data(
+    PTO2Runtime *, const Tensor &tensor, uint32_t ndims, const uint32_t *indices, uint64_t value
+) {
+    if (tensor.buffer.addr == 0) return;
+    DistCore *self = g_self;
+    if (self != nullptr) wait_producer_ready(self, tensor);
+    const uint64_t flat = tensor.compute_flat_offset(indices, ndims);
+    const uint64_t esz = get_element_size(tensor.dtype);
+    memcpy(reinterpret_cast<void *>(tensor.buffer.addr + flat * esz), &value, esz);
+}
+
+// alloc_tensors — a kernel-less "hidden task" that only reserves GM output
+// buffers (no compute). It consumes one task id, allocates its outputs on the
+// deterministic heap exactly like dist_submit_impl step (a), registers itself as
+// their producer, and completes INLINE (sets its own flag immediately) since no
+// kernel runs. A later writer (INOUT / OUTPUT_EXISTING) becomes the new producer
+// of the region, so real consumers depend on the writer, not on this alloc. Every
+// core replays it identically, keeping heap addresses + maps consistent.
+TaskOutputTensors dist_alloc_tensors(PTO2Runtime *, const L0TaskArgs &args) {
+    DistCore *self = g_self;
+    if (self == nullptr) return TaskOutputTensors{};
+    // EXECUTE-FIRST (docs §6 step 0+1, §6.1): every submit point first seeks an
+    // execution opportunity before advancing the deterministic replay below.
+    TRACE_LAP_RESET(self);  // exclude the inter-submit orch round-trip (user code) from runtime spans
+    if (!fatal_set()) {
+        drain_block_won(self);
+        drain_phase_b(self);
+    }
+    TRACE_LAP(self, self->local_index, -1, TracePhase::EfDrain);
+    const int32_t N = self->local_index++;
+    const int32_t tc = args.tensor_count();
+    if (N >= kFlagCap) {
+        set_fatal();
+        fprintf(stderr, "[dist_engine] alloc task id %d exceeds kFlagCap %d\n", N, kFlagCap);
+        return TaskOutputTensors{};
+    }
+
+    // Deterministic GM heap allocation + straddle-padding (identical to submit (a)).
+    const size_t ring = g_dist.heap_size;
+    uint64_t total = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        total += PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+    }
+    uint64_t task_base = PTO2_ALIGN_UP(self->heap_next, PTO2_PACKED_OUTPUT_ALIGN);
+    if (total > 0 && g_dist.heap_base != nullptr) {
+        if (total > ring) {
+            set_fatal();
+            fprintf(
+                stderr, "[dist_engine] alloc task %d outputs %llu B exceed heap ring %zu B\n", N,
+                (unsigned long long)total, ring
+            );
+            return TaskOutputTensors{};
+        }
+        if ((task_base % ring) + total > ring) task_base = ((task_base / ring) + 1) * ring;
+    }
+
+    // (a) Materialize outputs + publish the deterministic heap layout — EVERY core
+    // (like dist_submit_impl step (a)), so duplicate maps and vend[] stay identical.
+    uint64_t off = 0;
+    TaskOutputTensors result;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        const TensorCreateInfo &ci = args.tensor(i).create_info();
+        const uint64_t logical = ci.buffer_size_bytes();
+        const uint64_t sz = PTO2_ALIGN_UP(logical, PTO2_PACKED_OUTPUT_ALIGN);
+        if (g_dist.heap_base == nullptr) {
+            set_fatal();
+            fprintf(stderr, "[dist_engine] GM output heap not allocated at alloc %d\n", N);
+            return result;
+        }
+        const uint64_t phys = (task_base + off) % ring;
+        Tensor &slot_t = self->outpool[self->outpool_head];
+        self->outpool_head = (self->outpool_head + 1) % kOutPoolSlots;
+        init_tensor_from_create_info(slot_t, ci, g_dist.heap_base + phys, logical);
+        result.materialize_output(slot_t);
+        off += sz;
+    }
+    self->heap_next = task_base + off;
+    if (N >= 0 && N < kFlagCap) g_dist.vend[N].store(self->heap_next, std::memory_order_relaxed);
+    if (fatal_set()) return result;
+
+    // (b) Register this alloc as producer of each output — EVERY core (map parity).
+    self->map.advance_retire(N, g_dist.H);
+    uint32_t out_idx = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        self->map.insert(result.get_ref(out_idx), N);
+        out_idx++;
+    }
+
+    // (c) Single-owner election (mirrors dist_submit_impl's claim). The first core
+    // to reach this alloc id wins; that core is by construction at/ahead of the
+    // completion frontier (N is not yet done, so F < N), hence the winner-only
+    // back-pressure below can never see heap_next < vend[F-H] and never underflows.
+    // Losers have finished the deterministic bookkeeping above and return — the
+    // winner alone paces reclaim and publishes the completion flag (the leading
+    // core was the one gating completion before this change too, so timing is
+    // unchanged; this only drops the lagging cores' redundant pass).
+    bool is_winner = claim(g_dist.alloc_cursor[N % kCursorShards].v, N);
+    if (!is_winner) {
+        TRACE_LAP(self, N, -1, TracePhase::Replay);
+        return result;
+    }
+
+    // (d) Winner-only heap reclaim back-pressure: drain this core's ring while the
+    // live virtual window [vend[F-H], heap_next) would overflow the physical ring.
+    if (total > 0 && g_dist.heap_base != nullptr) {
+        uint64_t wd_heap = 0;
+        while (!fatal_set()) {
+            const int32_t f = g_dist.frontier.load(std::memory_order_acquire);
+            const int32_t R = f - g_dist.H;
+            const uint64_t vstart_live = (R < 0) ? 0 : g_dist.vend[R].load(std::memory_order_relaxed);
+            if (self->heap_next - vstart_live <= ring) break;  // window fits — region free
+            if (f >= N - 1) {
+                set_fatal();
+                fprintf(
+                    stderr, "[dist_engine] heap ring %zu B too small for H=%d window at alloc %d (live=%llu B)\n", ring,
+                    g_dist.H, N, (unsigned long long)(self->heap_next - vstart_live)
+                );
+                return result;
+            }
+            drain_block_won(self);
+            if (drain_phase_b(self) == 0) {
+                SPIN_WAIT_HINT();
+                watchdog(wd_heap);
+            }
+        }
+        if (fatal_set()) return result;
+    }
+
+    // (e) Winner completes inline (no kernel runs).
+    g_dist.flags[N & (kFlagCap - 1)].store(1, std::memory_order_release);
+    advance_frontier();
+    TRACE_LAP(self, N, -1, TracePhase::Alloc);
+    return result;
+}
+
+TaskOutputTensors dist_submit_dummy(PTO2Runtime *, const L0TaskArgs &) { return TaskOutputTensors{}; }
+void dist_scope_set_site(const char *, int) {}
+
+const PTO2RuntimeOps g_dist_ops = {
+    dist_submit_impl,     dist_scope_begin,     dist_scope_end,     dist_orchestration_done, dist_is_fatal,
+    dist_report_fatal,    dist_log_error,       dist_log_warn,      dist_log_debug,          dist_log_info_v,
+    dist_get_tensor_data, dist_set_tensor_data, dist_alloc_tensors, dist_submit_dummy,       dist_scope_set_site,
+};
+
+// -----------------------------------------------------------------------------
+// Deadlock diagnostics: dump the full engine state on SIGUSR1. Sim runs every
+// core as a pthread in one process, so a single handler can walk g_dist. Used to
+// debug hangs (kill -USR1 <pid>); compiled in but inert unless signalled.
+// -----------------------------------------------------------------------------
+void dist_dump_state(int) {
+    fprintf(stderr, "\n===== DIST STATE DUMP =====\n");
+    fprintf(
+        stderr, "frontier=%d H=%d ring=%zuB replay_done=%d/%d num_blocks=%d fatal=%d\n", g_dist.frontier.load(),
+        g_dist.H, g_dist.heap_size, g_dist.replay_done.load(), g_dist.num_workers, g_dist.num_blocks,
+        g_dist.fatal.load()
+    );
+    fprintf(stderr, "cube_cursor[%d]=", kCursorShards);
+    for (int32_t s = 0; s < kCursorShards; s++)
+        fprintf(stderr, "%d%s", g_dist.cube_cursor[s].v.load(), s + 1 < kCursorShards ? "," : "");
+    fprintf(stderr, " vector_cursor[%d]=", kCursorShards);
+    for (int32_t s = 0; s < kCursorShards; s++)
+        fprintf(stderr, "%d%s", g_dist.vector_cursor[s].v.load(), s + 1 < kCursorShards ? "," : "");
+    fprintf(stderr, "\n");
+    for (int32_t c = 0; c < g_dist.num_workers && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        fprintf(
+            stderr, "core %d role=%d blk=%d lane=%d replayed=%d occ=%d owned=%d\n", c, static_cast<int>(co.role),
+            co.block_id, co.lane, co.local_index, co.occupied_count, co.owned_total
+        );
+        for (int32_t i = 0; i < kPrivateSlots; i++) {
+            RingSlot &s = co.slots[i];
+            if (!s.occupied) continue;
+            int32_t unmet = -1;
+            for (int32_t f = 0; f < s.fanin_count; f++)
+                if (g_dist.flags[s.fanin[f] & (kFlagCap - 1)].load() == 0) {
+                    unmet = s.fanin[f];
+                    break;
+                }
+            fprintf(
+                stderr, "    slot%d tid=%d built=%d mc=%d won=(%d,%d) fanin=%d unmet=%d\n", i, s.task_id, s.built,
+                s.is_multicore, s.won_block, s.won_slot, s.fanin_count, unmet
+            );
+        }
+    }
+    for (int32_t b = 0; b < g_dist.num_blocks; b++) {
+        for (int32_t i = 0; i < kPrivateSlots; i++) {
+            WonSlot &w = g_dist.blocks[b].slots[i];
+            int32_t st = w.state.load();
+            if (st == 0) continue;
+            fprintf(
+                stderr, "  won blk%d slot%d state=%d tid=%d remaining=%d drained=[%d,%d,%d] present=[%d,%d,%d]\n", b, i,
+                st, w.task_id, w.remaining.load(), w.drained[0].load(), w.drained[1].load(), w.drained[2].load(),
+                w.lane[0].present, w.lane[1].present, w.lane[2].present
+            );
+        }
+    }
+    fprintf(stderr, "===== END DUMP =====\n");
+}
+
+// -----------------------------------------------------------------------------
+// Per-core entry point invoked by each AICore worker thread.
+// -----------------------------------------------------------------------------
+void dist_core_main(void *runtime_v, int core_idx, int core_type_int) {
+    if (core_idx < 0 || core_idx >= RUNTIME_MAX_WORKER) return;
+    Runtime *runtime = reinterpret_cast<Runtime *>(runtime_v);
+    DistCore *self = &g_dist.cores[core_idx];
+    const CoreType role = static_cast<CoreType>(core_type_int);
+
+    // sub_block lane: only meaningful for AIV in MIX tasks (M3). bgemm's 1V add
+    // ignores it, so 0 is correct for the M2 single-core scope.
+    const CoreLayout lay = g_dist.layout[core_idx];
+    self->reset(role, lay.block_id, lay.lane);
+    self->core_idx = core_idx;
+    g_self = self;
+    if (dist_trace())
+        fprintf(
+            stderr, "[dist] core %d role=%d block=%d lane=%d START\n", core_idx, core_type_int, lay.block_id, lay.lane
+        );
+
+    // Startup barrier: wait until every worker thread has been scheduled in and
+    // reached this point before anyone begins replay. In sim the OS brings the
+    // host threads up one at a time, so without this the cores that start early
+    // race ahead and the swimlane's first-task stagger reflects thread-wakeup
+    // skew rather than engine scheduling. Bare spin (no yield) per the AICPU
+    // spin-wait convention. Skipped under fatal so a failed run still tears down.
+    if (!fatal_set()) {
+        g_dist.started_count.fetch_add(1, std::memory_order_acq_rel);
+        uint64_t wd_start = 0;
+        while (g_dist.started_count.load(std::memory_order_acquire) < g_dist.num_workers && !fatal_set()) {
+            SPIN_WAIT_HINT();
+            watchdog(wd_start);
+        }
+    }
+
+    // Replay the full orchestration submit stream: build the per-core map and
+    // claim/build owned tasks into the private ring (back-pressure inline). MIX
+    // anchors deposit follower subtasks into block.won during this replay.
+    TRACE_LAP_RESET(self);  // origin for the first lap span (post-barrier, pre-replay)
+    if (g_dist.orch_func != nullptr && g_dist.orch_args != nullptr && !fatal_set()) {
+        g_dist.orch_func(*g_dist.orch_args);
+    }
+
+    // Publish "my replay is done" so followers can eventually conclude that no
+    // further block.won deposits will arrive for them (§7 tail-idle).
+    g_dist.replay_done.fetch_add(1, std::memory_order_acq_rel);
+
+    // Drain to completion: pull any follower deposits addressed to my lane, run
+    // ready tasks, and only finish once every core has finished replay (no more
+    // pushes), my private ring is empty, and there is no undrained deposit left
+    // for my lane.
+    uint64_t wd_drain = 0;
+    while (!fatal_set()) {
+        drain_block_won(self);
+        int32_t freed = drain_phase_b(self);
+        const bool all_replayed = g_dist.replay_done.load(std::memory_order_acquire) >= g_dist.num_workers;
+        const bool ring_empty = (self->occupied_count == 0);
+        const bool pending = has_pending_won(self);
+        if (all_replayed && ring_empty && !pending) break;
+        if (freed == 0) {
+            SPIN_WAIT_HINT();
+            watchdog(wd_drain);
+        }
+    }
+
+    if (dist_trace() || fatal_set()) {
+        fprintf(
+            stderr, "[dist] core %d role=%d DONE replayed=%d owned=%d fatal=%d\n", core_idx, core_type_int,
+            self->local_index, self->owned_total, fatal_set() ? 1 : 0
+        );
+    }
+    g_self = nullptr;
+    __atomic_add_fetch(&runtime->dist.done_count, 1, __ATOMIC_ACQ_REL);
+}
+
+}  // namespace
+
+void *dist_engine_register(
+    PTO2Runtime *rt, DistOrchFunc orch_func, const L2TaskArgs *orch_args, int num_workers, Runtime *runtime
+) {
+    // GM output heap: a BOUNDED ring reclaimed by the completion frontier (M4).
+    // Size from PTO_DIST_HEAP_MB (MiB) else kHeapRingDefault. Allocated once per
+    // process; if a later run needs a different size, free + realloc.
+    {
+        size_t want = kHeapRingDefault;
+        if (const char *e = getenv("PTO_DIST_HEAP_MB")) {
+            const long mb = atol(e);
+            if (mb > 0) want = static_cast<size_t>(mb) << 20;
+        }
+        if (g_dist.heap_base != nullptr && g_dist.heap_size != want) {
+            free(g_dist.heap_base);
+            g_dist.heap_base = nullptr;
+        }
+        if (g_dist.heap_base == nullptr) {
+            g_dist.heap_base = static_cast<uint8_t *>(malloc(want));
+            g_dist.heap_size = (g_dist.heap_base != nullptr) ? want : 0;
+        }
+        // Zero the heap each run so freshly-allocated output regions read as 0,
+        // matching the centralized runtime's zero-initialized GM. Kernels that
+        // read a padded tile (e.g. softmax/PV where valid_len < tile width) rely
+        // on the unwritten remainder being zero; an uninitialized (malloc) or
+        // recycled heap would otherwise yield nondeterministic results.
+        if (g_dist.heap_base != nullptr) memset(g_dist.heap_base, 0, g_dist.heap_size);
+    }
+    // Dependency-span bound H (R = F - H). Env override for graphs with longer
+    // heap spans; default kHDefault.
+    g_dist.H = kHDefault;
+    if (const char *e = getenv("PTO_DIST_H")) {
+        const long h = atol(e);
+        if (h >= 0) g_dist.H = static_cast<int32_t>(h);
+    }
+    // The producer map recycles a task's entry-head slot kTaskWindow tasks later;
+    // cleanup retires a task once it leaves the H span, so H must stay below the
+    // window (with margin) or a slot could be reused before its task is cleaned.
+    always_assert(g_dist.H < kTaskWindow - 1);
+#if DIST_TRACE_ENABLED
+    // Swimlane tracing gate. Capture the epoch now so every core's event ts is
+    // relative to the same run start.
+    g_trace_on = (getenv("PTO_DIST_SWIMLANE") != nullptr);
+    g_trace_epoch_ns = now_ns();
+    // Per-core span reserve: 0 when off (reset never reserves → zero overhead on a
+    // normal run); a generous bound when on so push_back never reallocs for the
+    // sizes we actually analyze (a realloc would perturb heap layout + add timing
+    // noise to the very gaps we measure). Best-effort: a huge trace may still grow.
+    g_trace_reserve = g_trace_on ? (1 << 16) : 0;
+#endif
+#if DIST_SIM_HOST_CLOCK
+    // Overhead-isolation gate (skip incore kernel calls, keep all bookkeeping).
+    g_skip_exec = (getenv("PTO_DIST_SKIP_EXEC") != nullptr);
+#endif
+
+    for (int32_t s = 0; s < kCursorShards; s++) {
+        g_dist.cube_cursor[s].v.store(-1, std::memory_order_relaxed);
+        g_dist.vector_cursor[s].v.store(-1, std::memory_order_relaxed);
+        g_dist.alloc_cursor[s].v.store(-1, std::memory_order_relaxed);
+    }
+    g_dist.frontier.store(-1, std::memory_order_relaxed);
+    for (int32_t i = 0; i < kFlagCap; i++)
+        g_dist.flags[i].store(0, std::memory_order_relaxed);
+    g_dist.fatal.store(0, std::memory_order_relaxed);
+    g_dist.replay_done.store(0, std::memory_order_relaxed);
+    g_dist.started_count.store(0, std::memory_order_relaxed);
+    g_dist.orch_func = orch_func;
+    g_dist.orch_args = orch_args;
+    g_dist.rt = rt;
+    g_dist.runtime = runtime;
+
+    // Derive the physical-block topology (1 AIC + 2 AIV per block) the same way
+    // the centralized scheduler discovers clusters: AIC/AIV cores in worker-index
+    // order, AIC[b] paired with AIV[2b] (AIV0) and AIV[2b+1] (AIV1). Followers and
+    // anchors use this to address block.won deposits. See §3.1.
+    g_dist.num_workers = num_workers;
+    int32_t aic_ids[RUNTIME_MAX_WORKER];
+    int32_t aiv_ids[RUNTIME_MAX_WORKER];
+    int32_t naic = 0, naiv = 0;
+    for (int32_t i = 0; i < num_workers && i < RUNTIME_MAX_WORKER; i++) {
+        g_dist.layout[i].block_id = -1;
+        g_dist.layout[i].lane = LANE_NONE;
+        if (runtime->workers[i].core_type == CoreType::AIC) {
+            aic_ids[naic++] = i;
+        } else {
+            aiv_ids[naiv++] = i;
+        }
+    }
+    g_dist.num_blocks = naic;
+    for (int32_t b = 0; b < naic; b++) {
+        g_dist.layout[aic_ids[b]] = CoreLayout{b, LANE_AIC};
+        if (2 * b < naiv) g_dist.layout[aiv_ids[2 * b]] = CoreLayout{b, LANE_AIV0};
+        if (2 * b + 1 < naiv) g_dist.layout[aiv_ids[2 * b + 1]] = CoreLayout{b, LANE_AIV1};
+        g_dist.blocks[b].any_pub.store(0, std::memory_order_relaxed);
+        for (int32_t s = 0; s < kPrivateSlots; s++) {
+            g_dist.blocks[b].slots[s].state.store(0, std::memory_order_relaxed);
+        }
+    }
+
+    if (dist_trace()) {
+        fprintf(
+            stderr, "[dist] register: num_workers=%d heap_base=%p heap_size=%zu\n", num_workers,
+            (void *)g_dist.heap_base, g_dist.heap_size
+        );
+    }
+
+    // Install the SIGUSR1 deadlock dumper once, but only when diagnostics are
+    // opted in (PTO_DIST_WATCHDOG set) — default runs install no signal handler.
+    static bool handler_installed = false;
+    if (!handler_installed && getenv("PTO_DIST_WATCHDOG") != nullptr) {
+        signal(SIGUSR1, dist_dump_state);
+        handler_installed = true;
+    }
+
+    // Publish all of the above before any worker observes Runtime::dist.go.
+    std::atomic_thread_fence(std::memory_order_release);
+    rt->ops = &g_dist_ops;
+    return reinterpret_cast<void *>(&dist_core_main);
+}
+
+#if DIST_TRACE_ENABLED
+void dist_engine_dump_trace() {
+    if (!g_trace_on) return;
+    const char *path = getenv("PTO_DIST_SWIMLANE");
+    if (path == nullptr || path[0] == '\0') return;
+    FILE *f = fopen(path, "w");
+    if (f == nullptr) {
+        fprintf(stderr, "[dist_engine] cannot open swimlane file %s for write\n", path);
+        return;
+    }
+
+    auto lane_name = [](int32_t lane) -> const char * {
+        switch (lane) {
+        case LANE_AIC:
+            return "AIC";
+        case LANE_AIV0:
+            return "AIV0";
+        case LANE_AIV1:
+            return "AIV1";
+        default:
+            return "?";
+        }
+    };
+
+    // Chrome Trace Event Format (https://ui.perfetto.dev / chrome://tracing).
+    // Two process groups: pid = block_id is the WALL-clock swimlane; pid =
+    // block_id + kCpuPid is a parallel CPU-time swimlane (same spans, width =
+    // cpu_us). process_sort_index forces all wall groups above all cpu groups.
+    // Dependency arrows (flow events) are emitted only in the cpu group so they
+    // stay within the cpu lanes instead of tangling across the wall lanes.
+    constexpr int32_t kCpuPid = 1000;
+    fprintf(f, "{\n  \"displayTimeUnit\": \"ns\",\n  \"traceEvents\": [\n");
+    bool first = true;
+    const int32_t nw = g_dist.num_workers;
+
+    // Lane/process name + sort metadata first (so idle lanes still appear).
+    for (int32_t c = 0; c < nw && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        if (co.block_id < 0 || co.lane < 0) continue;
+        if (!first) fprintf(f, ",\n");
+        first = false;
+        fprintf(
+            f, "    {\"ph\":\"M\",\"name\":\"process_name\",\"pid\":%d,\"args\":{\"name\":\"block%d (wall)\"}}",
+            co.block_id, co.block_id
+        );
+        fprintf(
+            f, ",\n    {\"ph\":\"M\",\"name\":\"process_sort_index\",\"pid\":%d,\"args\":{\"sort_index\":%d}}",
+            co.block_id, co.block_id
+        );
+        fprintf(
+            f,
+            ",\n    {\"ph\":\"M\",\"name\":\"thread_name\",\"pid\":%d,\"tid\":%d,"
+            "\"args\":{\"name\":\"%s (core%d)\"}}",
+            co.block_id, co.lane, lane_name(co.lane), c
+        );
+        fprintf(
+            f, ",\n    {\"ph\":\"M\",\"name\":\"process_name\",\"pid\":%d,\"args\":{\"name\":\"block%d (cpu)\"}}",
+            co.block_id + kCpuPid, co.block_id
+        );
+        fprintf(
+            f, ",\n    {\"ph\":\"M\",\"name\":\"process_sort_index\",\"pid\":%d,\"args\":{\"sort_index\":%d}}",
+            co.block_id + kCpuPid, co.block_id + kCpuPid
+        );
+        fprintf(
+            f,
+            ",\n    {\"ph\":\"M\",\"name\":\"thread_name\",\"pid\":%d,\"tid\":%d,"
+            "\"args\":{\"name\":\"%s (core%d)\"}}",
+            co.block_id + kCpuPid, co.lane, lane_name(co.lane), c
+        );
+        // CPU-group kernel sub-lane (tid = lane + 3): kernel spans live here so a
+        // ringbp bar that time-contains its releasing kernel does not nest+hide it.
+        fprintf(
+            f,
+            ",\n    {\"ph\":\"M\",\"name\":\"thread_name\",\"pid\":%d,\"tid\":%d,"
+            "\"args\":{\"name\":\"%s·kernel (core%d)\"}}",
+            co.block_id + kCpuPid, co.lane + 3, lane_name(co.lane), c
+        );
+    }
+
+    auto phase_name = [](TracePhase p) -> const char * {
+        switch (p) {
+        case TracePhase::Kernel:
+            return "kernel";
+        case TracePhase::Alloc:
+            return "alloc";
+        case TracePhase::Build:
+            return "build";
+        case TracePhase::DrainWon:
+            return "drain_won";
+        case TracePhase::Replay:
+            return "replay";
+        case TracePhase::RingBp:
+            return "ringbp";
+        case TracePhase::EfDrain:
+            return "efdrain";
+        case TracePhase::Commit:
+            return "commit";
+        default:
+            return "?";
+        }
+    };
+
+    // Index: task_id -> its kernel span location in the CPU group, so a dep edge
+    // can anchor an arrow at the producer's and consumer's actual spans.
+    struct SpanLoc {
+        int32_t pid;
+        int32_t tid;
+        double ts_us;
+        double dur_us;
+    };
+    // In the CPU group, kernel spans go on a SEPARATE sub-lane (tid = lane +
+    // kCpuKernelLane) from the build/ringbp/replay/alloc spans (tid = lane). A
+    // ringbp span time-contains the kernel that ends its wait, so on one lane
+    // perfetto would nest the kernel inside the ringbp bar and hide it; splitting
+    // the kernel onto its own row keeps both visible.
+    constexpr int32_t kCpuKernelLane = 3;
+    std::vector<SpanLoc> kloc(static_cast<size_t>(kFlagCap), SpanLoc{-1, -1, 0.0, 0.0});
+    for (int32_t c = 0; c < nw && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        if (co.block_id < 0 || co.lane < 0) continue;
+        for (const TraceEvent &e : co.trace) {
+            if (e.phase != TracePhase::Kernel || e.task_id < 0 || e.task_id >= kFlagCap) continue;
+            kloc[static_cast<size_t>(e.task_id)] =
+                SpanLoc{co.block_id + kCpuPid, co.lane + kCpuKernelLane, e.ts_ns / 1000.0, e.cpu_ns / 1000.0};
+        }
+    }
+    // Index: task_id -> its ringbp span in the CPU group (the arrow head for a
+    // slot-release edge anchors at the ringbp's END = when the wait was satisfied).
+    std::vector<SpanLoc> rbloc(static_cast<size_t>(kFlagCap), SpanLoc{-1, -1, 0.0, 0.0});
+    for (int32_t c = 0; c < nw && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        if (co.block_id < 0 || co.lane < 0) continue;
+        for (const TraceEvent &e : co.trace) {
+            if (e.phase != TracePhase::RingBp || e.task_id < 0 || e.task_id >= kFlagCap) continue;
+            rbloc[static_cast<size_t>(e.task_id)] =
+                SpanLoc{co.block_id + kCpuPid, co.lane, e.ts_ns / 1000.0, e.cpu_ns / 1000.0};
+        }
+    }
+
+    // Duration events: kernel + non-kernel overhead spans, emitted once in the
+    // wall group (pid=block) and once in the cpu group (pid=block+kCpuPid).
+    for (int32_t c = 0; c < nw && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        if (co.block_id < 0 || co.lane < 0) continue;
+        for (const TraceEvent &e : co.trace) {
+            const char *ph = phase_name(e.phase);
+            char name[64];
+            if (e.phase != TracePhase::Kernel) {
+                snprintf(name, sizeof(name), "%s#%d", ph, e.task_id);
+            } else if (e.func_id >= 0) {
+                snprintf(name, sizeof(name), "f%d#%d", e.func_id, e.task_id);
+            } else {
+                snprintf(name, sizeof(name), "task#%d", e.task_id);
+            }
+            if (!first) fprintf(f, ",\n");
+            first = false;
+            // Convert raw ns -> us (swimlane unit) here, at dump time — never on the
+            // hot path (see TraceEvent).
+            const double ts_us = e.ts_ns / 1000.0;
+            const double dur_us = e.dur_ns / 1000.0;
+            const double cpu_us = e.cpu_ns / 1000.0;
+            fprintf(
+                f,
+                "    {\"ph\":\"X\",\"name\":\"%s\",\"pid\":%d,\"tid\":%d,\"ts\":%.3f,\"dur\":%.3f,"
+                "\"args\":{\"phase\":\"%s\",\"task_id\":%d,\"func_id\":%d,\"core\":%d,\"mc\":%d,\"cpu_us\":%.3f}}",
+                name, co.block_id, co.lane, ts_us, dur_us, ph, e.task_id, e.func_id, c, e.multicore, cpu_us
+            );
+            fprintf(
+                f,
+                ",\n    {\"ph\":\"X\",\"name\":\"%s\",\"pid\":%d,\"tid\":%d,\"ts\":%.3f,\"dur\":%.3f,"
+                "\"args\":{\"phase\":\"%s\",\"task_id\":%d,\"func_id\":%d,\"wall_us\":%.3f}}",
+                name, co.block_id + kCpuPid, e.phase == TracePhase::Kernel ? co.lane + kCpuKernelLane : co.lane, ts_us,
+                cpu_us, ph, e.task_id, e.func_id, dur_us
+            );
+        }
+    }
+
+    // Flow events: the full static dependency graph. One arrow per dep edge, in
+    // the cpu group, from the PRODUCER kernel span's end to the CONSUMER kernel
+    // span's start (time always forward: a producer completes before its consumer
+    // runs). Click any task and follow arrows backward hop-by-hop to walk the
+    // chain "what was this waiting on, and what was THAT waiting on".
+    int32_t flow_id = 0;
+    for (int32_t c = 0; c < nw && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        if (co.block_id < 0 || co.lane < 0) continue;
+        for (const DistCore::DepEdge &de : co.dep_edges) {
+            if (de.producer_task < 0 || de.producer_task >= kFlagCap) continue;
+            if (de.consumer_task < 0 || de.consumer_task >= kFlagCap) continue;
+            const SpanLoc &pr = kloc[static_cast<size_t>(de.producer_task)];
+            const SpanLoc &cs = kloc[static_cast<size_t>(de.consumer_task)];
+            if (pr.pid < 0 || cs.pid < 0) continue;  // need both kernel spans
+            fprintf(
+                f, ",\n    {\"ph\":\"s\",\"name\":\"dep\",\"cat\":\"dep\",\"id\":%d,\"pid\":%d,\"tid\":%d,\"ts\":%.3f}",
+                flow_id, pr.pid, pr.tid, pr.ts_us + pr.dur_us
+            );
+            fprintf(
+                f,
+                ",\n    {\"ph\":\"f\",\"name\":\"dep\",\"cat\":\"dep\",\"id\":%d,\"bp\":\"e\",\"pid\":%d,\"tid\":%d,"
+                "\"ts\":%.3f}",
+                flow_id, cs.pid, cs.tid, cs.ts_us
+            );
+            flow_id++;
+        }
+    }
+
+    // Flow events (cat="slot"): slot-release edges that explain a ringbp's stall.
+    // From the END of the occupant kernel's span (the moment it frees the slot) to
+    // the END of the waiting ringbp span. Chains with the dep arrows: ringbp
+    // --slot--> occupant kernel --dep--> the occupant's fan-in kernels.
+    for (int32_t c = 0; c < nw && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        if (co.block_id < 0 || co.lane < 0) continue;
+        for (const DistCore::DepEdge &se : co.slot_edges) {
+            if (se.producer_task < 0 || se.producer_task >= kFlagCap) continue;  // occupant
+            if (se.consumer_task < 0 || se.consumer_task >= kFlagCap) continue;  // ringbp waiter
+            const SpanLoc &occ = kloc[static_cast<size_t>(se.producer_task)];
+            const SpanLoc &rb = rbloc[static_cast<size_t>(se.consumer_task)];
+            if (occ.pid < 0 || rb.pid < 0) continue;
+            double tail = occ.ts_us + occ.dur_us;      // occupant kernel end (slot freed)
+            const double head = rb.ts_us + rb.dur_us;  // ringbp end (wait satisfied)
+            if (tail > head) tail = head;              // keep forward in time
+            fprintf(
+                f,
+                ",\n    {\"ph\":\"s\",\"name\":\"slot\",\"cat\":\"slot\",\"id\":%d,\"pid\":%d,\"tid\":%d,\"ts\":%.3f}",
+                flow_id, occ.pid, occ.tid, tail
+            );
+            fprintf(
+                f,
+                ",\n    {\"ph\":\"f\",\"name\":\"slot\",\"cat\":\"slot\",\"id\":%d,\"bp\":\"e\",\"pid\":%d,\"tid\":%d,"
+                "\"ts\":%.3f}",
+                flow_id, rb.pid, rb.tid, head
+            );
+            flow_id++;
+        }
+    }
+
+    fprintf(f, "\n  ]\n}\n");
+    fclose(f);
+    fprintf(stderr, "[dist_engine] swimlane trace written to %s\n", path);
+}
+#else   // !DIST_TRACE_ENABLED
+// Tracing compiled out: keep the public symbol so aicpu_executor.cpp still links.
+void dist_engine_dump_trace() {}
+#endif  // DIST_TRACE_ENABLED
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h
new file mode 100644
index 000000000..c68c3c398
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * fully_distributed_within_core engine — public wiring entry.
+ *
+ * The distributed runtime moves orchestration + scheduling + execution onto the
+ * AI cores in SPMD fashion (see docs/fully_distributed_within_core.md). The
+ * engine itself (per-core TensorMap, claim race over global cursors, private
+ * task ring, run-ahead loop, completion-flag ring, deterministic GM output
+ * heap) lives in dist_engine.cpp and is compiled into the AICPU .so so it can
+ * reuse the full submit-side type set (TensorMap, MixedKernels, L0TaskArgs,
+ * kernel-address resolution).
+ *
+ * The AICPU "stub" thread does dlopen + arena setup, then calls
+ * dist_engine_register() once and publishes the returned per-core entry pointer
+ * via Runtime::dist.core_main_fn. Each AICore worker thread invokes that entry,
+ * which runs the orchestration entry (replaying the full submit stream) and
+ * executes the tasks it wins.
+ */
+
+#pragma once
+
+struct PTO2Runtime;
+struct L2TaskArgs;
+class Runtime;
+
+// Orchestration entry signature (matches DeviceOrchestrationFunc in the AICPU
+// executor): the dlopen'd user orchestration function the cores replay.
+typedef void (*DistOrchFunc)(const L2TaskArgs &);
+
+/**
+ * Wire the distributed engine for one run.
+ *
+ * Resets the global claim cursors + completion-flag ring, (re)acquires the GM
+ * output heap, stores the orchestration entry / args / PTO2Runtime, and points
+ * rt->ops at the distributed ops table so the cores route rt_submit_* into the
+ * distributed submit path. Must be called once on the AICPU orchestrator thread
+ * before publishing Runtime::dist.go.
+ *
+ * Returns the address of the per-core entry function
+ * (signature: void(void *runtime, int core_idx, int core_type)) to store into
+ * Runtime::dist.core_main_fn. Returned as void* to keep this header light.
+ */
+void *dist_engine_register(
+    PTO2Runtime *rt, DistOrchFunc orch_func, const L2TaskArgs *orch_args, int num_workers, Runtime *runtime
+);
+
+/**
+ * Dump a per-core execution swimlane as a Chrome Trace Event JSON.
+ *
+ * Self-gated on the PTO_DIST_SWIMLANE env var (output file path); a no-op when
+ * unset. Each executed (sub)task is one duration event laid out by physical
+ * block (pid) and lane AIC/AIV0/AIV1 (tid), so the trace shows how the
+ * execute-first claim race spreads work across cores (load balance, docs §6.1).
+ * Must be called AFTER all workers have finished a run (single-threaded), e.g.
+ * by the AICPU stub once Runtime::dist.done_count == num_workers.
+ */
+void dist_engine_dump_trace();
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h
new file mode 100644
index 000000000..e1bb3465e
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file pto2_dispatch_payload.h
+ * @brief Per-core dispatch payload for AICore kernel execution
+ *
+ * PTO2DispatchPayload holds the kernel function address, a per-core args[]
+ * array, and embedded SPMD context (LocalContext + GlobalContext).  AICPU
+ * maintains a static array of these (one per core).
+ *
+ * GlobalContext (sub_block_id) is initialized once at runtime startup via
+ * init_global_context() and never modified afterwards.
+ *
+ * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload()
+ * before each dispatch.  Both context struct pointers are written into the
+ * args[] suffix on every dispatch (since args[] is rebuilt entirely each time).
+ *
+ * AICore caches a pointer to its per-core slot at startup and reads from
+ * it on each dispatch.  The struct is cache-line aligned to avoid false
+ * sharing across concurrently dispatched cores.
+ *
+ * The DATA_MAIN_BASE register protocol is unchanged from the base runtime:
+ * a monotonically increasing reg_task_id signals new work to AICore.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "arg_direction.h"
+#include "intrinsic.h"
+
+/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */
+#ifndef PTO2_DISPATCH_MAX_ARGS
+#define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT)
+#endif
+
+#ifndef PTO2_ALIGN_UP
+#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1))
+#endif
+
+// Verify hardcoded indices in intrinsic.h match the computed values.
+static_assert(
+    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h"
+);
+static_assert(
+    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX,
+    "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h"
+);
+
+/**
+ * Per-core dispatch payload: function address + args[] + SPMD context.
+ *
+ * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER].
+ * AICore caches a pointer to its per-core slot at startup (via Handshake.task)
+ * and reads from it on each dispatch.
+ *
+ * The struct is cache-line aligned to prevent false sharing across
+ * concurrently dispatched cores.
+ */
+struct alignas(64) PTO2DispatchPayload {
+    uint64_t function_bin_addr;            /**< Kernel entry address in GM (set by Scheduler) */
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */
+
+    /** Per-dispatch context: block_idx and block_num.
+     *  Written by build_payload() before each dispatch.
+     *  args[SPMD_LOCAL_CONTEXT_INDEX] points here. */
+    LocalContext local_context;
+
+    /** Per-core global context: sub_block_id (AIV lane identity).
+     *  Initialized once by init_global_context() at runtime startup.
+     *  args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */
+    GlobalContext global_context;
+
+    /** Speculative early-dispatch gate. 0 = ready: AICore executes on pickup.
+     *  1 = not-ready: AICore waits until AICPU rings the doorbell
+     *  (DATA_MAIN_BASE high 32 == this dispatch's reg_task_id) before executing. */
+    volatile uint32_t not_ready;
+    uint8_t reserved_payload_abi_pad[4];
+
+    static_assert(sizeof(args[0]) == 8);
+    static_assert(
+        PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) ==
+        (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0])
+    );
+};
+
+static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift");
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h
new file mode 100644
index 000000000..cf6eb4790
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PTO_ASYNC_KERNEL_API_H
+#define PTO_ASYNC_KERNEL_API_H
+
+#include <stdint.h>
+
+#include <pto/comm/comm_types.hpp>
+#include <pto/comm/pto_comm_inst.hpp>
+
+#include "intrinsic.h"
+#include "aicore_completion_mailbox_types.h"
+#include "pto_completion_token.h"
+#include "pto_runtime_status.h"
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+#ifndef __gm__
+#define __gm__
+#endif
+
+// Public surface: get_async_ctx, async_ctx_is_deferred,
+// register_completion_condition, send_notification,
+// save_expected_notification_counter. Everything else lives in
+// pto2::detail and is reserved for backend adapters / internal use.
+namespace pto2::detail {
+
+inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
+    if (ctx.completion_count == nullptr) return;
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    dcci((__gm__ int32_t *)line, SINGLE_CACHE_LINE);
+#else
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
+inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) {
+    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) {
+        *ctx.completion_error_code = error_code;
+    }
+}
+
+inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) {
+    if (addr == nullptr || size_bytes == 0) return;
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    uintptr_t end =
+        (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) {
+        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
+    }
+#else
+    (void)addr;
+    (void)size_bytes;
+#endif
+}
+
+inline __aicore__ void defer_flush(AsyncCtx &ctx) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return;
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uint32_t count = *ctx.completion_count;
+    if (count > ctx.completion_capacity) {
+        count = ctx.completion_capacity;
+    }
+    uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count));
+    if (ctx.completion_error_code != nullptr) {
+        flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
+    }
+    if (ctx.completion_entries != nullptr) {
+        flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
+    }
+    defer_flush_range(ctx.completion_count, flush_bytes);
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+    pipe_barrier(PIPE_ALL);
+#else
+    (void)ctx;
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
+}  // namespace pto2::detail
+
+inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
+    __gm__ LocalContext *lc =
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
+    AsyncCtx ctx{};
+    ctx.completion_count = lc->async_ctx.completion_count;
+    ctx.completion_error_code = lc->async_ctx.completion_error_code;
+    ctx.completion_entries = lc->async_ctx.completion_entries;
+    ctx.completion_capacity = lc->async_ctx.completion_capacity;
+    ctx.task_token.raw = lc->async_ctx.task_token.raw;
+    pto2::detail::defer_load_slab(ctx);
+    return ctx;
+}
+
+inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); }
+
+// Canonical writer: backend submit handlers build a CompletionToken and pass
+// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and
+// bumps completion_count. Returns false on overflow (also stores
+// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is
+// not currently a deferred context.
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+        return false;
+    }
+
+    uint32_t idx = *ctx.completion_count;
+    if (idx >= ctx.completion_capacity) {
+        if (ctx.completion_error_code != nullptr) {
+            *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+        }
+        return false;
+    }
+
+    volatile __gm__ DeferredCompletionEntry *slot = &ctx.completion_entries[idx];
+    slot->addr = token.addr;
+    slot->expected_value = token.expected_value;
+    slot->engine = token.engine;
+    slot->completion_type = token.completion_type;
+    slot->_pad = 0;
+    *ctx.completion_count = idx + 1;
+    return true;
+}
+
+inline __aicore__ void
+send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) {
+    __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr));
+    pto::comm::Signal signal(counter);
+    pto::comm::TNOTIFY(signal, value, notify_op);
+}
+
+inline __aicore__ void
+save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) {
+    CompletionToken token{
+        reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0
+    };
+    (void)register_completion_condition(ctx, token);
+    pto2::detail::defer_flush(ctx);
+}
+
+#endif  // PTO_ASYNC_KERNEL_API_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h
new file mode 100644
index 000000000..65608ad2f
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PTO_ASYNC_WAIT_H
+#define PTO_ASYNC_WAIT_H
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+
+#include "aicpu/platform_regs.h"
+#include "backend/sdma/sdma_completion_scheduler.h"
+#include "intrinsic.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_completion_token.h"
+#include "pto_runtime2_types.h"
+
+struct PTO2SchedulerState;
+struct PTO2LocalReadyBuffer;
+struct CompletionStats;
+
+inline constexpr int32_t MAX_ASYNC_WAITS = 64;
+
+// The mailbox transport (has_pending / try_push_condition /
+// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member
+// functions in aicore_completion_mailbox.h. This file only holds the
+// application layer: translating drained messages into wait-list state.
+
+inline uintptr_t mailbox_cache_line(const volatile void *addr) {
+    return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+}
+
+struct CompletionCondition;
+
+using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &);
+using CompletionRetireFn = void (*)(CompletionCondition &);
+
+struct CompletionBackendOps {
+    CompletionPollFn poll;
+    CompletionRetireFn retire;
+};
+
+struct CompletionCondition {
+    AsyncEngine engine{ASYNC_ENGINE_SDMA};
+    int32_t completion_type{COMPLETION_TYPE_COUNTER};
+    bool satisfied{false};
+    bool retired{false};
+    volatile uint32_t *counter_addr{nullptr};
+    uint64_t addr{0};
+    uint32_t expected_value{0};
+
+    CompletionPollResult test() const;
+    void retire();
+};
+
+// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in
+// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin
+// glue mapping CompletionCondition.addr into the backend's raw-addr helpers.
+inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) {
+    if (cond.counter_addr == nullptr) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    return {
+        *cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING,
+        PTO2_ERROR_NONE
+    };
+}
+
+inline void counter_retire_op(CompletionCondition & /*cond*/) {}
+
+inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) {
+    return poll_sdma_event_record(cond.addr);
+}
+
+inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); }
+
+inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) {
+    static const CompletionBackendOps kOps[] = {
+        {counter_poll_op, counter_retire_op},                      // COMPLETION_TYPE_COUNTER = 0
+        {sdma_event_record_poll_op, sdma_event_record_retire_op},  // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1
+    };
+    constexpr int kOpsCount = static_cast<int>(sizeof(kOps) / sizeof(kOps[0]));
+    if (completion_type < 0 || completion_type >= kOpsCount) return nullptr;
+    return &kOps[completion_type];
+}
+
+inline CompletionPollResult CompletionCondition::test() const {
+    if (satisfied) {
+        return {CompletionPollState::READY, PTO2_ERROR_NONE};
+    }
+    const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
+    if (ops == nullptr || ops->poll == nullptr) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    return ops->poll(*this);
+}
+
+inline void CompletionCondition::retire() {
+    if (retired) return;
+    const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
+    if (ops != nullptr && ops->retire != nullptr) {
+        ops->retire(*this);
+    }
+    retired = true;
+}
+
+struct AsyncWaitEntry {
+    PTO2TaskSlotState *slot_state{nullptr};
+    PTO2TaskId task_token{PTO2TaskId::invalid()};
+    CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK];
+    int32_t condition_count{0};
+    int32_t waiting_completion_count{0};
+    bool normal_done{false};
+};
+
+struct AsyncPollResult {
+    int32_t completed{0};
+    int32_t error_code{PTO2_ERROR_NONE};
+    PTO2TaskSlotState *failed_slot_state{nullptr};
+};
+
+inline const char *async_engine_name(AsyncEngine engine) {
+    switch (engine) {
+    case ASYNC_ENGINE_SDMA:
+        return "SDMA";
+    case ASYNC_ENGINE_ROCE:
+        return "ROCE";
+    case ASYNC_ENGINE_URMA:
+        return "URMA";
+    case ASYNC_ENGINE_CCU:
+        return "CCU";
+    default:
+        return "UNKNOWN";
+    }
+}
+
+struct AsyncWaitList {
+    std::atomic<int32_t> busy{0};
+    AsyncWaitEntry entries[MAX_ASYNC_WAITS];
+    int32_t count{0};
+    // Diagnostic: counts every FIN-side try_push that hit a full mailbox.
+    // Expected to stay zero on real workloads (ring is 4096 entries); a
+    // non-zero value means consumers are too slow or the ring is undersized.
+    // Read by scheduler shutdown / l2 perf summary; not on the hot path.
+    std::atomic<uint64_t> mpsc_skipped_count{0};
+
+    bool try_lock() {
+        int32_t expected = 0;
+        return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed);
+    }
+
+    void unlock() { busy.store(0, std::memory_order_release); }
+
+    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) {
+        for (int32_t i = 0; i < count; i++) {
+            if (entries[i].task_token == token) return &entries[i];
+        }
+        return nullptr;
+    }
+
+    // Captures the side-channel a scheduler-aware drain needs to complete
+    // NotDeferred tasks inline (without storing a transient entry in
+    // entries[]).
+    struct DrainCompletionSink {
+        PTO2SchedulerState *sched{nullptr};
+        PTO2LocalReadyBuffer *local_bufs{nullptr};
+        PTO2TaskSlotState **deferred_release_slot_states{nullptr};
+        int32_t *deferred_release_count{nullptr};
+        int32_t deferred_release_capacity{0};
+        int32_t inline_completed{0};
+#if PTO2_SCHED_PROFILING
+        int32_t thread_idx{0};
+#endif
+
+        bool can_inline_complete() const { return sched != nullptr; }
+    };
+
+    // Inline-complete a NotDeferred task during drain. Returns false on
+    // deferred_release_slot_states overflow.
+    bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
+
+    // Single-consumer drain: pop each published message in tail order and
+    // translate it into wait-list state. An empty sink (sched == nullptr) just
+    // materializes entries; a sched-aware sink additionally inline-completes
+    // lonely NotDeferred NORMAL_DONEs without ever growing entries[].
+    int32_t drain_aicore_completion_mailbox_locked(
+        AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code
+    ) {
+        error_code = PTO2_ERROR_NONE;
+        if (aicore_mailbox == nullptr) return 0;
+
+        int32_t drained = 0;
+        AICoreCompletionMsgView msg;
+        // try_pop is the transport layer (seq-gated, in-order dequeue); this
+        // loop is the application layer (translate each message into wait-list
+        // state). try_pop returns false at the first gap or when empty.
+        while (aicore_mailbox->try_pop(msg)) {
+            drained++;
+            if (msg.kind == MSG_KIND_CONDITION) {
+                AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
+                if (entry == nullptr) {
+                    // First message for this task — materialize the entry here.
+                    // slot_state stays null until the matching TASK_NORMAL_DONE
+                    // sentinel arrives.
+                    if (count >= MAX_ASYNC_WAITS) {
+                        error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+                        return drained;
+                    }
+                    entry = &entries[count++];
+                    entry->task_token = msg.task_token;
+                    entry->slot_state = nullptr;
+                    entry->condition_count = 0;
+                    entry->waiting_completion_count = 0;
+                    entry->normal_done = false;
+                }
+                if (!append_condition_locked(
+                        *entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type,
+                        error_code
+                    )) {
+                    return drained;
+                }
+            } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) {
+                PTO2TaskSlotState *slot_state_ptr =
+                    reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
+                AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
+                if (entry == nullptr) {
+                    // Producers strictly order: all CONDITIONs for token T are
+                    // pushed before the matching NORMAL_DONE (the acq_rel on
+                    // on_subtask_complete enforces this across producers). So
+                    // observing NORMAL_DONE first => the task registered no
+                    // conditions => NotDeferred. Complete it inline when the
+                    // sink allows; otherwise fall back to the entry-store path.
+                    if (sink.can_inline_complete()) {
+                        (void)try_inline_complete_locked(sink, *slot_state_ptr);
+                        continue;
+                    }
+                    if (count >= MAX_ASYNC_WAITS) {
+                        error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+                        return drained;
+                    }
+                    entry = &entries[count++];
+                    entry->task_token = msg.task_token;
+                    entry->slot_state = slot_state_ptr;
+                    entry->condition_count = 0;
+                    entry->waiting_completion_count = 0;
+                    entry->normal_done = true;
+                } else {
+                    if (entry->slot_state == nullptr) {
+                        entry->slot_state = slot_state_ptr;
+                    }
+                    entry->normal_done = true;
+                }
+            } else {
+                error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
+                return drained;
+            }
+        }
+        return drained;
+    }
+
+    bool append_condition_locked(
+        AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type,
+        int32_t &error_code
+    ) {
+        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) {
+            error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
+            return false;
+        }
+        CompletionCondition &cond = entry.conditions[entry.condition_count++];
+        cond.engine = engine;
+        cond.completion_type = completion_type;
+        cond.satisfied = false;
+        cond.retired = false;
+        cond.addr = addr;
+        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ?
+                                reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) :
+                                nullptr;
+        cond.expected_value = expected_value;
+        entry.waiting_completion_count++;
+        return true;
+    }
+
+    template <bool Profiling>
+    AsyncPollResult poll_and_complete(
+        AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+        PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count,
+        int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+        ,
+        int thread_idx
+#endif
+    );
+};
+
+#endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h
new file mode 100644
index 000000000..c5a8c345f
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
+
+#include <stdint.h>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_runtime_status.h"
+
+// CompletionToken is the runtime-internal POD that backend submit handlers
+// produce and the generic register_completion_condition() consumes. It is the
+// ABI contract for "this is one completion to wait on" — independent of which
+// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's
+// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by
+// completion_type.
+struct CompletionToken {
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint64_t backend_cookie;
+};
+
+enum class CompletionPollState : uint8_t {
+    PENDING = 0,
+    READY = 1,
+    FAILED = 2,
+};
+
+struct CompletionPollResult {
+    CompletionPollState state{CompletionPollState::PENDING};
+    int32_t error_code{PTO2_ERROR_NONE};
+};
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h
new file mode 100644
index 000000000..07251cc39
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
+
+#define PTO2_ALIGN_SIZE 64             // Cache line alignment
+#define PTO2_PACKED_OUTPUT_ALIGN 1024  // Each output in packed buffer aligned to 1024B; gap is padding
+#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h
new file mode 100644
index 000000000..cf68a2617
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file pto_dep_compute.h
+ * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay.
+ *
+ * Two header-only template entry points:
+ *
+ *   compute_task_fanin     — STEP 3 in submit_task: per-tensor creator retention (Step A)
+ *                            + tensormap.lookup for INPUT/INOUT (Step B). Calls back into
+ *                            user-supplied `emit` for each producer it identifies.
+ *
+ *   register_task_outputs  — STEP 4 in submit_task: tensormap.insert for INOUT and
+ *                            OUTPUT_EXISTING tensors. No callbacks.
+ *
+ * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its
+ * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the
+ * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would
+ * require two emit semantics or a marginal behavior change in transients — not worth
+ * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own.
+ *
+ * The Emit callback contract:
+ *   bool emit(PTO2TaskId producer);
+ *     - return true to continue (whether or not the producer was actually recorded —
+ *       producer-not-alive / dedup-hit / etc. all return true silently)
+ *     - return false to signal fatal (e.g. fanin spill overflow); caller bails
+ *
+ * Performance: Emit is a template parameter, not std::function. Both runtime
+ * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge
+ * vector) instantiate at the call site and inline through. Do NOT replace with
+ * std::function — it would break the inlining and add ~5 ns/call to the orch hot path.
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
+
+#include <cstdint>
+
+#include "pto_task_id.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"  // TensorRef
+#include "tensor.h"
+
+/**
+ * View struct for inputs to compute_task_fanin / register_task_outputs.
+ *
+ * Both runtime and replay assemble one of these from their own data sources
+ * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All
+ * pointer arrays must remain valid for the duration of the call.
+ */
+struct DepInputs {
+    int32_t tensor_count;
+    const TensorRef *tensors;        // length = tensor_count (union; OUTPUT slots' .ptr is unused)
+    const TensorArgType *arg_types;  // length = tensor_count
+    int32_t explicit_dep_count;
+    const PTO2TaskId *explicit_deps;  // length = explicit_dep_count (validity checked by caller)
+};
+
+/**
+ * Compute fanin for a task being submitted (STEP 3: Step A creator retention +
+ * Step B tensormap modifier lookup).
+ *
+ * For each non-OUTPUT tensor:
+ *   - If owner_task_id is valid, emit(owner)
+ *   - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit
+ *     each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry).
+ *
+ * @return true on success (or producer-skipped-silently); false if emit signaled
+ *         fatal — caller should propagate (after any fatal bookkeeping done by emit).
+ */
+template <typename Emit>
+[[nodiscard]] inline bool
+compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) {
+    if (in_manual_scope) {
+        return true;
+    }
+
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::OUTPUT) {
+            // Runtime-created OUTPUT tensors are not looked up in the TensorMap since
+            // they have no dependencies.
+            continue;
+        }
+
+        const Tensor *tensor = &inputs.tensors[i].ref();
+
+        // Step A: creator retention — all existing tensors extend their creator lifetime.
+        PTO2TaskId owner = tensor->owner_task_id;
+        if (owner.is_valid()) {
+            if (!emit(owner)) {
+                return false;
+            }
+        }
+
+        // Step B: only INPUT/INOUT need modifier dependency lookup.
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
+            continue;
+        }
+        if (tensor->manual_dep) {
+            continue;
+        }
+
+        bool fatal = false;
+        tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
+            if (!emit(entry.producer_task_id)) {
+                fatal = true;
+                return false;  // stop iteration
+            }
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
+                tensor_map.remove_entry(entry);
+            }
+            return true;
+        });
+        if (fatal) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Register a task's outputs in the tensormap (STEP 4 in submit_task).
+ *
+ * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the
+ * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer.
+ *
+ * No-op when in_manual_scope.
+ */
+inline void
+register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) {
+    if (in_manual_scope) {
+        return;
+    }
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
+            const Tensor *tensor = &inputs.tensors[i].ref();
+            if (!tensor->manual_dep) {
+                tensor_map.insert(*tensor, task_id);
+            }
+        }
+    }
+}
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp
new file mode 100644
index 000000000..2043c116b
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp
@@ -0,0 +1,972 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Orchestrator Implementation
+ *
+ * Implements orchestrator state management, scope handling, and task submission.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_orchestrator.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/dep_gen.h"
+#include "common/unified_log.h"
+#include "pto_dep_compute.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"
+#include "tensor.h"
+
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#endif
+
+// Verify the captured Tensor blob size in DepGenRecord matches the runtime
+// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
+// including runtime/tensor.h, so this check lives at the orch callsite.
+static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)");
+// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime
+// imposes no hard cap on explicit dep count. If a submit exceeds this cap,
+// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is
+// unaffected, only the captured replay record is truncated.
+
+// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in
+// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay)
+// link these no-op stubs so the runtime translation unit is self-contained.
+// Visibility is hidden so the HOST .so doesn't export them into the global
+// dynamic symbol table where they'd shadow the AICPU .so's strong symbols
+// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }
+__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(
+    uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, const int32_t[3]
+) {}
+
+// Scope_stats enable gate, queried via the same predicate idiom as
+// is_dep_gen_enabled above. The AICPU collector links the strong definition;
+// host builds fall back to this weak `false`. Gating here still skips the
+// cross-agent occupancy reads that feed the sample when scope_stats is disabled.
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
+
+// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each
+// wrap. Strong definition lives in the AICPU collector; host builds fall back to
+// this weak no-op so the runtime translation unit stays self-contained.
+extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
+
+// =============================================================================
+// Orchestrator Profiling (compile-time toggle)
+// =============================================================================
+#if PTO2_ORCH_PROFILING
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+// Weak fallback for builds that don't link device_time.cpp (e.g. host).
+// The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
+//
+// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from
+// exporting this weak fallback into the global dynamic symbol table via
+// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry
+// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's
+// weak definition first (already in global table) and uses it — returning 0.
+// With hidden visibility, the HOST .so does not export this symbol globally,
+// so the AICPU .so's PLT resolves to its own strong definition from
+// device_time.cpp.
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
+// The strong symbol from the AICPU build wins when profiling is available.
+// Also hidden to prevent HOST .so from polluting the global symbol table.
+__attribute__((weak, visibility("hidden"))) void
+l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
+// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
+static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
+static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
+static uint64_t g_orch_args_cycle = 0;       // param copy
+static uint64_t g_orch_lookup_cycle = 0;     // tensormap lookup + dep building
+static uint64_t g_orch_insert_cycle = 0;     // tensormap insert
+static uint64_t g_orch_fanin_cycle = 0;      // fanin list + early-return check
+static uint64_t g_orch_scope_end_cycle = 0;  // scope_end overhead
+static int64_t g_orch_submit_count = 0;
+static uint32_t g_orch_submit_idx = 0;
+uint64_t g_orch_alloc_wait_cycle = 0;
+uint64_t g_orch_fanin_wait_cycle = 0;
+uint64_t g_orch_alloc_atomic_count = 0;
+uint64_t g_orch_args_atomic_count = 0;
+uint64_t g_orch_scope_end_atomic_count = 0;
+// Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what
+// the flag is for) and feeds the per-sub-step `g_orch_*_cycle` cumulatives
+// printed in the cold-path log.
+//
+// Per-submit ORCH_SUBMIT record is the only swim-lane emit on the orch
+// path — one record per submit_task() / alloc_tensors() call spanning
+// the entire [start, end] window. Per-sub-step phase records were dropped
+// in favour of the cumulatives + per-submit envelope; the dispatcher
+// already inserts one record at the end of each submit path via
+// CYCLE_COUNT_ORCH_SUBMIT_RECORD.
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                                       \
+    uint64_t _submit_start_ts = _t0
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
+    do {                                                                                          \
+        if (_prof_active) {                                                                       \
+            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
+        }                                                                                         \
+    } while (0)
+#elif PTO2_PROFILING
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+__attribute__((weak, visibility("hidden"))) void
+l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
+// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
+static uint32_t g_orch_submit_idx = 0;
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
+    uint64_t _submit_start_ts = _t0
+#define CYCLE_COUNT_LAP(acc) \
+    do {                     \
+    } while (0)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
+    do {                                                                                          \
+        if (_prof_active) {                                                                       \
+            _t1 = get_sys_cnt_aicpu();                                                            \
+            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
+        }                                                                                         \
+    } while (0)
+#else
+#define CYCLE_COUNT_START()
+#define CYCLE_COUNT_LAP(acc)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)
+#endif
+
+static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) {
+    always_assert(orch != nullptr);
+    orch->fatal = true;
+    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) {
+        return PTO2_ERROR_NONE;
+    }
+
+    int32_t expected = PTO2_ERROR_NONE;
+    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
+    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
+        return error_code;
+    }
+    return expected;
+}
+
+static void
+orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
+    int32_t latched_code = orch_mark_fatal(orch, error_code);
+
+#if PTO2_PROFILING
+    // Flush the current scope's peaks BEFORE the FATAL log line, so the
+    // diagnostic context (which pool/window filled up) appears right next to
+    // the failure reason. on_fatal is latched, so duplicate fatals from
+    // different layers don't print multiple stats lines.
+    scope_stats_on_fatal();
+#endif
+
+    if (fmt == nullptr || fmt[0] == '\0') {
+        if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
+            unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
+        } else {
+            unified_log_error(func, "FATAL(code=%d)", error_code);
+        }
+        return;
+    }
+
+    char message[1024];
+    vsnprintf(message, sizeof(message), fmt, args);
+    if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
+        unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message);
+        return;
+    }
+    unified_log_error(func, "FATAL(code=%d): %s", error_code, message);
+}
+
+void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) {
+    auto *orch = this;
+    va_list args;
+    va_start(args, fmt);
+    orch_report_fatal_v(orch, error_code, func, fmt, args);
+    va_end(args);
+}
+
+static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) {
+    uint32_t next = orch->fanin_seen_current_epoch + 1;
+    if (next == 0) {
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            memset(
+                orch->fanin_seen_epoch[r], 0,
+                static_cast<size_t>(orch->sm_header->rings[r].task_window_size) * sizeof(uint32_t)
+            );
+        }
+        next = 1;
+    }
+    orch->fanin_seen_current_epoch = next;
+    return next;
+}
+
+struct PTO2FaninBuilder {
+    PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) :
+        count(0),
+        spill_start(0),
+        orch(orch),
+        seen_epoch(seen_epoch),
+        spill_pool(spill_pool) {}
+    int32_t count{0};
+    int32_t spill_start{0};
+    PTO2OrchestratorState *orch{nullptr};
+    uint32_t seen_epoch{0};
+    PTO2FaninPool &spill_pool;
+    PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP];
+
+    template <typename Fn>
+    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const {
+        return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast<Fn &&>(fn));
+    }
+
+    bool mark_seen(uint8_t prod_ring, int32_t prod_slot) {
+        if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) {
+            return false;
+        }
+        uint32_t *seen = orch->fanin_seen_epoch[prod_ring];
+        uint32_t slot = static_cast<uint32_t>(prod_slot);
+        if (seen[slot] == seen_epoch) {
+            return true;
+        }
+        seen[slot] = seen_epoch;
+        return false;
+    }
+};
+
+static bool append_fanin_or_fail(
+    PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state,
+    PTO2FaninBuilder *fanin_builder, uint8_t ring_id
+) {
+    if (fanin_builder->mark_seen(prod_ring, prod_slot)) {
+        return true;
+    }
+
+    if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) {
+        fanin_builder->inline_slots[fanin_builder->count++] = prod_state;
+        return true;
+    }
+
+    PTO2FaninPool &fanin_pool = fanin_builder->spill_pool;
+    if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
+        return false;
+    }
+    int32_t spill_idx = fanin_pool.top;
+    PTO2FaninSpillEntry *entry = fanin_pool.alloc();
+    if (entry == nullptr) {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
+        return false;
+    }
+    if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) {
+        fanin_builder->spill_start = spill_idx;
+    }
+    entry->slot_state = prod_state;
+    fanin_builder->count++;
+    return true;
+}
+
+static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
+
+struct PTO2PreparedTask {
+    PTO2TaskId task_id = PTO2TaskId::invalid();
+    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
+    PTO2TaskDescriptor *task = nullptr;
+    PTO2TaskPayload *payload = nullptr;
+    PTO2TaskSlotState *slot_state = nullptr;
+};
+
+static PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args) {
+    PTO2OutputLayout layout;
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {
+            continue;
+        }
+        layout.offsets[i] = layout.total_output_size;
+        layout.buffer_sizes[i] =
+            PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+        layout.total_output_size += layout.buffer_sizes[i];
+    }
+    return layout;
+}
+
+static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) {
+    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
+
+    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
+    if (scope_task_count < allocator.window_size() - 1) {
+        return true;
+    }
+
+    int32_t active_count = allocator.active_count();
+
+    LOG_ERROR("========================================");
+    LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id);
+    LOG_ERROR("========================================");
+    LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size());
+    LOG_ERROR("  scope_depth:        %d", orch->scope_stack_top + 1);
+    LOG_ERROR("  ring_id:            %d", ring_id);
+    LOG_ERROR("  scope_task_count:   %d", scope_task_count);
+    LOG_ERROR("  active_tasks:       %d / %d", active_count, allocator.window_size());
+    LOG_ERROR("Root Cause:");
+    LOG_ERROR("  Tasks within a scope hold a fanout_count reference that is only");
+    LOG_ERROR("  released at scope_end. When scope task count >= window_size,");
+    LOG_ERROR("  no slots can be reclaimed -> deadlock.");
+    LOG_ERROR("Solution:");
+    LOG_ERROR("  1. Reduce tasks per scope (use batching/unroll)");
+    LOG_ERROR("  2. Increase task window (current: %d)", allocator.window_size());
+    LOG_ERROR("     Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
+    LOG_ERROR("     Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2>");
+    LOG_ERROR("  3. Split work across multiple scopes");
+    LOG_ERROR("========================================");
+    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
+    return false;
+}
+
+static bool prepare_task(
+    PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask,
+    PTO2PreparedTask *out
+) {
+    uint8_t ring_id = orch->current_ring_id();
+    auto &allocator = orch->rings[ring_id].task_allocator;
+
+    if (!check_scope_can_accept_task(orch, allocator, ring_id)) {
+        return false;
+    }
+
+    out->alloc_result = allocator.alloc(total_output_size);
+    if (out->alloc_result.failed()) {
+        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
+        return false;
+    }
+
+    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
+    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
+    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
+    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
+
+    out->payload->prefetch(args.tensor_count(), args.scalar_count());
+
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init() skip the O(window_size) bind loop.
+    // Both writes hit the same 64B slot_state cache line we're about to
+    // dirty below, so the extra cost is two stores on an already-hot line.
+    // Must precede the scheduler wiring.queue.push at the end of
+    // submit_task_common — that push is the first read of slot_state->task /
+    // slot_state->payload by another thread.
+    out->slot_state->bind_buffers(out->payload, out->task);
+
+    // prepare_task does NO payload writes: all payload content (tensors/scalars +
+    // early-dispatch spec fields) is initialized in PTO2TaskPayload::init, the
+    // single payload-init point, which runs before the scheduler wiring push.
+
+    // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
+    //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
+    //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
+    // Fields immutable after RingSchedState::init():
+    //   ring_id
+    // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
+    // observers); set to PENDING here when orchestrator actually reuses the slot.
+    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    int16_t block_num = args.launch_spec.block_num();
+    out->slot_state->total_required_subtasks =
+        static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
+    out->slot_state->logical_block_num = block_num;
+    out->slot_state->active_mask = active_mask;
+    // fanin_count is set by scheduler during wiring
+    scope_tasks_push(orch, out->slot_state);
+
+    return true;
+}
+
+// =============================================================================
+// Scope Management
+// =============================================================================
+
+static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
+    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
+        // scope_tasks lives in the per-Worker arena (single backing allocation),
+        // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP ==
+        // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot
+        // budget — hitting it means every ring is saturated, so no further push
+        // could succeed regardless of buffer growth.
+        orch->report_fatal(
+            PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__,
+            "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity
+        );
+        return;
+    }
+    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
+}
+
+void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
+    auto *orch = this;
+    if (orch->fatal) {
+        return;
+    }
+    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
+    if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
+        return;
+    }
+
+    bool already_in_manual_scope = orch->in_manual_scope();
+    ++orch->scope_stack_top;
+    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
+    if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
+        orch->manual_begin_depth = orch->scope_stack_top;
+    }
+#if PTO2_PROFILING
+    // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the
+    // collector call: when disabled we pay nothing. Sample the current ring's
+    // task/heap start-end and tensormap usage at the scope boundary.
+    if (is_scope_stats_enabled()) {
+        uint8_t ring_id = orch->current_ring_id();
+        auto &alloc = orch->rings[ring_id].task_allocator;
+        int32_t dep_pool_tail = 0;
+        int32_t dep_pool_top = 0;
+        if (orch->scheduler) {
+            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
+        }
+        scope_stats_begin(
+            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
+            dep_pool_top, orch->tensor_map.current_used()
+        );
+    }
+#endif
+}
+
+void PTO2OrchestratorState::end_scope() {
+    auto *orch = this;
+    if (orch->fatal) {
+        return;
+    }
+    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
+
+    // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks
+    // via scheduler->on_scope_end, so the end record reflects the scope's
+    // occupancy at close, not the residual after teardown.
+#if PTO2_PROFILING
+    // Gate via is_scope_stats_enabled() (see begin_scope). One collector call
+    // emits the end-boundary record and tears down bookkeeping.
+    if (is_scope_stats_enabled()) {
+        uint8_t ring_id = orch->current_ring_id();
+        auto &alloc = orch->rings[ring_id].task_allocator;
+        int32_t dep_pool_tail = 0;
+        int32_t dep_pool_top = 0;
+        if (orch->scheduler) {
+            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
+        }
+        scope_stats_end(
+            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
+            dep_pool_top, orch->tensor_map.current_used()
+        );
+    }
+#endif
+
+#if PTO2_ORCH_PROFILING
+    uint64_t _se0 = get_sys_cnt_aicpu();
+#endif
+
+    bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
+    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
+    int32_t count = orch->scope_tasks_size - begin;
+    if (ending_manual_scope) {
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    }
+
+    if (orch->scheduler && count > 0) {
+        orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
+    }
+
+    // Rewind the task buffer — these entries are no longer needed
+    orch->scope_tasks_size = begin;
+
+#if PTO2_ORCH_PROFILING
+    uint64_t _se1 = get_sys_cnt_aicpu();
+    g_orch_scope_end_cycle += (_se1 - _se0);
+#endif
+}
+
+// =============================================================================
+// Task Submission
+// =============================================================================
+
+// Shared body for submit_task / submit_dummy_task. Caller has already validated
+// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot
+// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin
+// computation (explicit_deps + auto), output registration, slot init, and pushes
+// to the scheduler wiring queue.
+static TaskOutputTensors submit_task_common(
+    PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id,
+    int32_t aiv0_kernel_id, int32_t aiv1_kernel_id
+) {
+    CYCLE_COUNT_START();
+    TaskOutputTensors result;
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) {
+        return result;
+    }
+    uint8_t ring_id = prepared.task_id.ring();
+    PTO2SchedulerState *sched = orch->scheduler;
+    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
+    PTO2TaskId task_id = prepared.task_id;
+    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+    result.set_task_id(task_id);
+
+    // dep_gen capture point: snapshot the orch submit_task inputs while the
+    // tensormap is still in its pre-lookup state for this task. Replay reads
+    // these records offline to reconstruct the complete dep graph — the sole
+    // source of truth for fanout now that the swimlane hot path no longer
+    // records it.
+    if (is_dep_gen_enabled()) {
+        const void *tensor_ptrs[MAX_TENSOR_ARGS];
+        // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record
+        // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow
+        // each tag here rather than letting the AICPU writer reinterpret a
+        // 4×-wider array as bytes — that path silently lost two of every three
+        // tags on little-endian and synthesized phantom self-edges in replay.
+        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
+        // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at
+        // MAX_TENSOR_ARGS: defensive against any future builder bypass /
+        // shared-memory bit-flip that could otherwise overrun the two
+        // MAX_TENSOR_ARGS-sized stack buffers above.
+        const int tc_raw = args.tensor_count();
+        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
+        for (int i = 0; i < tc; i++) {
+            // OUTPUT slots carry create_info (not yet a Tensor); skip them —
+            // they have no producer to look up and replay's per-tensor loop
+            // also skips OUTPUT.
+            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref();
+            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
+        }
+        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
+        dep_gen_aicpu_record_submit(
+            task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8,
+            static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()),
+            kernel_ids_capture
+        );
+    }
+
+    PTO2FaninBuilder fanin_builder(orch, orch->rings[ring_id].fanin_pool, next_fanin_seen_epoch(orch));
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    // === STEP 2: Sync TensorMap validity and optional cleanup ===
+    // Read current last_task_alive from shared memory for this ring
+    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
+
+    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
+
+    CYCLE_COUNT_LAP(g_orch_sync_cycle);
+
+    for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
+        PTO2TaskId dep_task_id = args.explicit_dep(i);
+        if (!dep_task_id.is_valid()) {
+            orch->report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids"
+            );
+            return result;
+        }
+        uint8_t dep_ring_id = dep_task_id.ring();
+        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_ring_id];
+        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
+        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (dep_local_task_id < dep_last_task_alive) {
+            continue;
+        }
+        int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id);
+        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot);
+        if (!append_fanin_or_fail(orch, dep_ring_id, dep_slot, producer_slot_state, &fanin_builder, ring_id)) {
+            return result;
+        }
+    }
+
+    // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) ===
+    DepInputs dep_inputs{
+        args.tensor_count(),       args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()),
+        args.explicit_deps_data(),
+    };
+
+    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
+        uint8_t prod_ring = producer_task_id.ring();
+        PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->rings[prod_ring];
+        int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast<int32_t>(producer_task_id.local()));
+        PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot);
+        return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, &fanin_builder, ring_id);
+    };
+
+    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) {
+        return result;
+    }
+
+    CYCLE_COUNT_LAP(g_orch_lookup_cycle);
+
+    // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
+    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
+
+    CYCLE_COUNT_LAP(g_orch_insert_cycle);
+
+    // === STEP 5: Batch-write to GM (single cache line burst) ===
+    // Deferred from allocation phase to avoid scattered GM writes that get
+    // evicted by TensorMap lookup/insert cache pressure.
+    __builtin_prefetch(&task, 1, 1);
+    task.task_id = task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    // Increment fanout_count on each producer (no lock — only orch writes this field).
+    // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count.
+    for_each_fanin_storage(
+        fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool,
+        [](PTO2TaskSlotState *producer) {
+            producer->fanout_count++;
+        }
+    );
+
+    int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP);
+    // Store fanin metadata in payload for scheduler to iterate
+    payload.fanin_actual_count = fanin_builder.count;
+    payload.fanin_spill_start = fanin_builder.spill_start;
+    payload.fanin_spill_pool = &fanin_builder.spill_pool;
+    for (int i = 0; i < inline_count; i++) {
+        payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i];
+    }
+
+    payload.init(args, result, prepared.alloc_result, layout);
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        if (args.scalar_count() > 0) {
+            set_dump_args_task_scalar_dtypes(
+                task_id.raw, static_cast<uint32_t>(args.scalar_count()), args.scalar_dtypes()
+            );
+        }
+        // Selective vs full dump is latched at dump_args_init from DumpDataHeader
+        // (host-decided before any dispatch), so it is race-free regardless of
+        // submission order. Here we only record each marked task's arg mask and
+        // metadata flags, which selective collection consults.
+        if (args.dump_arg_mask() != 0) {
+            set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask());
+        }
+    }
+#endif
+
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+#if PTO2_ORCH_PROFILING
+    g_orch_args_atomic_count += 2;  // fanout_lock.store + fanout_count.store
+#endif
+
+    // === STEP 6: push to wiring queue ===
+    // Deferred wiring: orchestrator only stores dependency metadata and increments
+    // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished)
+    // is handled asynchronously by scheduler thread 0 via the wiring queue.
+    // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness
+    while (!sched->wiring.queue.push(&cur_slot_state)) {
+        SPIN_WAIT_HINT();
+    }
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+    return result;
+}
+
+TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    auto *orch = this;
+
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg Detected!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("This is a bug in the orchestration code.");
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+    // === Validate submit inputs ===
+    ActiveMask active_mask = mixed_kernels.to_active_mask();
+    always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
+
+    int16_t block_num = args.launch_spec.block_num();
+    always_assert(block_num >= 1 && "block_num must be >= 1");
+
+    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
+    // it to the aiv0 slot.  This guarantees the dispatch path can always use
+    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
+    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
+    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
+    MixedKernels normalized = mixed_kernels;
+    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+    if (!has_aic && has_aiv1 && !has_aiv0) {
+        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+        active_mask = normalized.to_active_mask();
+    }
+
+    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+    if (block_num > 1 && args.launch_spec.require_sync_start()) {
+        // Deadlock check: block_num >= total available slots of the required type.
+        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
+        // For AIV:     limit is total_aiv_count.
+        PTO2ResourceShape shape = active_mask.to_shape();
+        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+        if (limit > 0 && block_num > limit) {
+            report_fatal(
+                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
+                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
+            );
+            return TaskOutputTensors{};
+        }
+        active_mask.set_sync_start();
+    }
+
+    return submit_task_common(
+        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
+    );
+}
+
+// Submit a dependency-only task: full dependency graph participation
+// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
+// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
+// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
+// shape as submit_task; scalars are permitted but never consumed.
+TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const L0TaskArgs &args) {
+    auto *orch = this;
+
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+
+    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+}
+
+TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const L0TaskArgs &args) {
+    auto *orch = this;
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    if (args.tensor_count() <= 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+        return TaskOutputTensors{};
+    }
+    if (args.scalar_count() != 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+        return TaskOutputTensors{};
+    }
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {
+            report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
+            );
+            return TaskOutputTensors{};
+        }
+    }
+
+    CYCLE_COUNT_START();
+
+    if (args.has_error) {
+        report_fatal(
+            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
+        return TaskOutputTensors{};
+    }
+
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    task.task_id = prepared.task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    TaskOutputTensors outputs;
+    outputs.set_task_id(prepared.task_id);
+    payload.init(args, outputs, prepared.alloc_result, layout);
+    payload.fanin_actual_count = 0;
+    payload.fanin_spill_start = 0;
+    payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+
+    if (prepared.slot_state != nullptr) {
+        // Hidden alloc tasks complete inline in the orchestrator before any
+        // consumer can exist, so they have no fanout to notify and no worker
+        // subtasks to retire. Running the full on_task_complete path
+        // would only pay unnecessary fanout_lock / traversal overhead here.
+        // The generic slot initialization done in prepare_task() is still
+        // required so scope_end can release the producer-side reference and
+        // drive the slot to CONSUMED, but worker dispatch fields are never
+        // observed for hidden alloc tasks.
+        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+    }
+    orch->inline_completed_tasks++;
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+
+    return outputs;
+}
+
+// =============================================================================
+// Flow Control
+// =============================================================================
+
+void PTO2OrchestratorState::mark_done() {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t total_tasks = orch->rings[r].task_allocator.active_count();
+        if (total_tasks > 0) {
+            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
+        }
+        auto &fanin_pool = orch->rings[r].fanin_pool;
+        if (fanin_pool.top > 1) {
+            LOG_INFO_V0(
+                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
+                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
+            );
+        }
+    }
+    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
+    orch->scope_tasks_size = 0;
+    orch->scope_stack_top = -1;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
+    g_orch_submit_idx = 0;
+#endif
+}
+
+#if PTO2_ORCH_PROFILING
+PTO2OrchProfilingData orchestrator_get_profiling() {
+    PTO2OrchProfilingData d;
+    d.sync_cycle = g_orch_sync_cycle;
+    d.alloc_cycle = g_orch_alloc_cycle;
+    d.args_cycle = g_orch_args_cycle;
+    d.lookup_cycle = g_orch_lookup_cycle;
+    d.insert_cycle = g_orch_insert_cycle;
+    d.fanin_cycle = g_orch_fanin_cycle;
+    d.scope_end_cycle = g_orch_scope_end_cycle;
+    d.submit_count = g_orch_submit_count;
+    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
+    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
+    d.alloc_atomic_count = g_orch_alloc_atomic_count;
+    d.args_atomic_count = g_orch_args_atomic_count;
+    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
+
+    // Reset
+    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
+    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
+    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
+    g_orch_submit_count = 0;
+    g_orch_submit_idx = 0;
+    g_orch_alloc_wait_cycle = 0;
+    g_orch_fanin_wait_cycle = 0;
+    g_orch_alloc_atomic_count = 0;
+    g_orch_args_atomic_count = 0;
+    g_orch_scope_end_atomic_count = 0;
+    return d;
+}
+#endif
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h
new file mode 100644
index 000000000..8ffe39b31
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Orchestrator Interface
+ *
+ * The Orchestrator is responsible for:
+ * 1. Executing the orchestration function (Turing-complete control flow)
+ * 2. Allocating intermediate buffers from the heap
+ * 3. Submitting tasks via async InCore function calls
+ * 4. Building the dependency graph using TensorMap
+ * 5. Managing buffer scopes for lifecycle control
+ *
+ * The Orchestrator can run on either:
+ * - Host CPU (lower latency for complex control, easier debugging)
+ * - Device AI_CPU (lower latency for task submission)
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef PTO_ORCHESTRATOR_H
+#define PTO_ORCHESTRATOR_H
+
+#include "common/l2_swimlane_profiling.h"
+#include "utils/device_arena.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_submit_types.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"
+
+/**
+ * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds
+ * arena offsets for every sub-region the orchestrator owns (per-ring fanin
+ * pools, scope arrays, plus the nested PTO2TensorMap layout).
+ */
+struct PTO2OrchestratorLayout {
+    size_t off_fanin_pool[PTO2_MAX_RING_DEPTH];
+    size_t off_fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+    size_t off_scope_tasks;
+    size_t off_scope_begins;
+    PTO2TensorMapLayout tensor_map;
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    int32_t scope_tasks_cap;
+    uint64_t scope_stack_capacity;
+};
+
+// =============================================================================
+// Orchestrator State
+// =============================================================================
+
+/**
+ * Orchestrator state structure (private to Orchestrator)
+ *
+ * Contains all state needed for task graph construction and buffer management.
+ */
+struct PTO2OrchestratorState {
+    // === SHARED MEMORY ACCESS ===
+    PTO2SharedMemoryHeader *sm_header;
+
+    // === PER-RING RESOURCES ===
+    PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
+    uint32_t *fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+    uint32_t fanin_seen_current_epoch{1};
+
+    // === TENSOR MAP (Private) ===
+    PTO2TensorMap tensor_map;  // Producer lookup
+
+    // === SCOPE STACK (Private) ===
+    // Single contiguous buffer of task IDs, partitioned by scope level.
+    // scope_begins[i] is the index into scope_tasks where scope i starts.
+    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
+    PTO2TaskSlotState **scope_tasks;  // Flat buffer of taskSlotState (all scopes concatenated)
+    int32_t scope_tasks_size;         // Number of task IDs currently in the buffer
+    int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
+    int32_t *scope_begins;            // scope_begins[i] = start index of scope i in scope_tasks
+    int32_t scope_stack_top;          // Current top of stack (-1 = no scope open)
+    uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
+    int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
+
+    // === SCHEDULER REFERENCE ===
+    // Note: In simulated mode, orchestrator and scheduler share address space
+    // In real mode, they communicate via shared memory only
+    PTO2SchedulerState *scheduler;  // For simulated mode only
+
+    // Total core counts set once at executor init; used for submit-time deadlock detection.
+    int32_t total_cluster_count{0};  // AIC cores = MIX clusters
+    int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
+#if PTO2_PROFILING
+    // L2 swimlane_level copied from get_l2_swimlane_level().
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // === GM HEAP (for output buffers) ===
+    void *gm_heap_base;     // Base address of GM heap
+    uint64_t gm_heap_size;  // Total size of GM heap (all rings)
+
+    // === FATAL ERROR ===
+    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
+    // Cross-thread notification uses shared memory orch_error_code (atomic)
+    bool fatal;
+
+    // Hidden alloc tasks complete synchronously inside the orchestrator and
+    // therefore bypass the executor's normal worker-completion counter path.
+    // The executor adds this count into its completed_tasks_ progress counter
+    // after orchestration finishes so shutdown/profiling totals remain closed.
+    int64_t inline_completed_tasks{0};
+
+    // === STATISTICS ===
+#if PTO2_PROFILING
+    int64_t tasks_submitted;
+    int64_t buffers_allocated;
+    int64_t bytes_allocated;
+#endif
+
+    /**
+     * Get current ring index from scope depth.
+     * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+     */
+    uint8_t current_ring_id() const {
+        int32_t depth = scope_stack_top;
+        if (depth < 0) depth = 0;
+        return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1;
+    }
+
+    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
+
+    // === Cold-path API (defined in pto_orchestrator.cpp) ===
+
+    // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
+    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
+    // the nested tensor_map layout. Returned layout is consumed by
+    // init_from_layout.
+    static PTO2OrchestratorLayout reserve_layout(
+        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+    );
+    static PTO2OrchestratorLayout reserve_layout(
+        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
+    );
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
+        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
+    // Forget pointers; arena owns the backing buffers.
+    void destroy();
+    void set_scheduler(PTO2SchedulerState *scheduler);
+    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
+    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
+    void end_scope();
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    TaskOutputTensors submit_dummy_task(const L0TaskArgs &args);
+    TaskOutputTensors alloc_tensors(const L0TaskArgs &args);
+    void mark_done();
+};
+
+// =============================================================================
+// Orchestrator Profiling Data
+// =============================================================================
+
+#if PTO2_ORCH_PROFILING
+struct PTO2OrchProfilingData {
+    uint64_t sync_cycle;
+    uint64_t alloc_cycle;  // Combined task slot + heap allocation
+    uint64_t args_cycle;
+    uint64_t lookup_cycle;
+    uint64_t insert_cycle;
+    uint64_t fanin_cycle;
+    uint64_t scope_end_cycle;
+    int64_t submit_count;
+    // Wait time tracking for blocking phases
+    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
+    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
+    // Atomic operation counts per phase
+    uint64_t alloc_atomic_count;
+    uint64_t args_atomic_count;
+    uint64_t scope_end_atomic_count;
+};
+
+PTO2OrchProfilingData orchestrator_get_profiling();
+#endif
+
+#endif  // PTO_ORCHESTRATOR_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp
new file mode 100644
index 000000000..f6009dc57
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Implementation
+ *
+ * Implements DepListPool ring buffer for zero-overhead dependency management.
+ * TaskAllocator methods are defined inline in pto_ring_buffer.h.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_ring_buffer.h"
+#include <inttypes.h>
+#include <string.h>
+#include "common/unified_log.h"
+#include "scheduler/pto_scheduler.h"
+
+static void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code) {
+    if (error_code_ptr == nullptr) {
+        return;
+    }
+    int32_t expected = PTO2_ERROR_NONE;
+    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
+}
+
+// =============================================================================
+// Fanin Spill Pool Implementation
+// =============================================================================
+void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    if (sm_last_task_alive <= reclaim_task_cursor) return;
+
+    int32_t scan_end = sm_last_task_alive;
+    for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) {
+        PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id);
+        if (payload.fanin_spill_pool != this) {
+            continue;
+        }
+
+        int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
+        int32_t spill_edge_count = payload.fanin_actual_count - inline_count;
+        if (spill_edge_count > 0) {
+            advance_tail(payload.fanin_spill_start + spill_edge_count);
+        }
+    }
+    reclaim_task_cursor = scan_end;
+}
+
+bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;
+
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+
+        spin_count++;
+
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("  - Needed:        %d entries", needed);
+            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR("  - current_task:    %d", current);
+            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR("  last_task_alive is not advancing, so fanin spill pool tail");
+            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
+
+// =============================================================================
+// Dependency List Pool Implementation
+// =============================================================================
+void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
+        int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
+        if (mark > 0) {
+            advance_tail(mark);
+        }
+        last_reclaimed = sm_last_task_alive;
+    }
+}
+
+bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;
+
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+
+        spin_count++;
+
+        // Progress detection: reset spin counter if last_task_alive advances
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("  - Needed:        %d entries", needed);
+            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR("  - current_task:    %d", current);
+            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR("  last_task_alive is not advancing, so dep pool tail");
+            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h
new file mode 100644
index 000000000..ea39c8b4c
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h
@@ -0,0 +1,693 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Data Structures
+ *
+ * Implements ring buffer designs for zero-overhead memory management:
+ *
+ * 1. TaskAllocator - Unified task slot + output buffer allocation
+ *    - Combines task ring (slot allocation) and heap ring (output buffer allocation)
+ *    - Single spin-wait loop with unified back-pressure and deadlock detection
+ *    - O(1) bump allocation for both task slots and heap buffers
+ *
+ * 2. FaninPool - Fanin spill entry allocation
+ *    - Ring buffer for spilled fanin entries
+ *    - O(1) append allocation
+ *    - Implicit reclamation with task ring
+ *
+ * 3. DepListPool - Dependency list entry allocation
+ *    - Ring buffer for linked list entries
+ *    - O(1) prepend operation
+ *    - Implicit reclamation with task ring
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef PTO_RING_BUFFER_H
+#define PTO_RING_BUFFER_H
+
+#include <algorithm>
+#include <inttypes.h>
+#include <type_traits>
+
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "common/unified_log.h"
+
+#if PTO2_PROFILING
+// Heap-ring wrap reporting — the allocator is the only place each individual
+// wrap is observable, so it notifies the scope_stats collector here. Gated:
+// pays nothing (no include, no call) when profiling is compiled out.
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
+// Block notification interval (in spin counts)
+#define PTO2_BLOCK_NOTIFY_INTERVAL 10000
+// Alloc spin limit - after this, report deadlock and exit
+#define PTO2_ALLOC_SPIN_LIMIT 100000
+
+// Dep pool spin limit - if exceeded, dep pool capacity too small for workload
+#define PTO2_DEP_POOL_SPIN_LIMIT 100000
+
+// =============================================================================
+// Task Allocator (unified task slot + heap buffer allocation)
+// =============================================================================
+
+/**
+ * Unified task slot + heap buffer allocator.
+ *
+ * Since task and heap are always allocated together and the orchestrator is
+ * single-threaded, both pointers (task index, heap top) are tracked locally
+ * and published to shared memory via plain store — no fetch_add or CAS needed.
+ *
+ * The alloc() method checks both resources BEFORE committing to either,
+ * eliminating the need for rollback on partial failure.
+ */
+class PTO2TaskAllocator {
+public:
+    /**
+     * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
+     */
+    void init(
+        PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        int32_t initial_local_task_id = 0
+    ) {
+        descriptors_ = descriptors;
+        window_size_ = window_size;
+        window_mask_ = window_size - 1;
+        current_index_ptr_ = current_index_ptr;
+        last_alive_ptr_ = last_alive_ptr;
+        heap_base_ = heap_base;
+        heap_size_ = heap_size;
+        error_code_ptr_ = error_code_ptr;
+        local_task_id_ = initial_local_task_id;
+        heap_top_ = 0;
+        heap_tail_ = 0;
+        last_alive_seen_ = 0;
+    }
+
+    /**
+     * Allocate a task slot and its associated output buffer in one call.
+     *
+     * Both task index and heap top are maintained as local counters and
+     * published to shared memory only on success. Since the orchestrator is
+     * single-threaded, no CAS or fetch_add is needed — just check-then-commit.
+     *
+     * @param output_size  Total packed output size in bytes (0 = no heap needed)
+     * @return Allocation result; check failed() for errors
+     */
+    PTO2TaskAllocResult alloc(int32_t output_size) {
+        uint64_t aligned_size =
+            output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
+
+        int spin_count = 0;
+        int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        int32_t last_alive = prev_last_alive;
+        update_heap_tail(last_alive);
+        bool blocked_on_heap = false;
+#if PTO2_ORCH_PROFILING
+        uint64_t wait_start = 0;
+        bool waiting = false;
+#endif
+
+        while (true) {
+            // Check both resources; commit only if both available
+            if (local_task_id_ - last_alive + 1 < window_size_) {
+                void *heap_ptr = try_bump_heap(aligned_size);
+                if (heap_ptr) {
+                    int32_t task_id = commit_task();
+#if PTO2_ORCH_PROFILING
+                    record_wait(spin_count, wait_start, waiting);
+#endif
+                    return {task_id, task_id & window_mask_, heap_ptr, static_cast<char *>(heap_ptr) + aligned_size};
+                }
+                blocked_on_heap = true;
+            } else {
+                blocked_on_heap = false;
+            }
+
+            // Spin: wait for scheduler to advance last_task_alive
+            spin_count++;
+#if PTO2_ORCH_PROFILING
+            if (!waiting) {
+                wait_start = get_sys_cnt_aicpu();
+                waiting = true;
+            }
+#endif
+            last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+            update_heap_tail(last_alive);
+            if (last_alive > prev_last_alive) {
+                spin_count = 0;
+                prev_last_alive = last_alive;
+            } else {
+                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) {
+                    LOG_WARN(
+                        "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d",
+                        local_task_id_ - last_alive, window_size_, heap_top_, heap_size_,
+                        blocked_on_heap ? "heap" : "task", spin_count
+                    );
+                }
+                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT) {
+                    report_deadlock(output_size, blocked_on_heap);
+                    return {-1, -1, nullptr, nullptr};
+                }
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+
+    // =========================================================================
+    // State queries
+    // =========================================================================
+
+    int32_t active_count() const {
+        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        return local_task_id_ - last_alive;
+    }
+
+    // Task ring start/end: tail = oldest live task (last_task_alive), head =
+    // next task id to allocate. head - tail == active_count().
+    int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); }
+    int32_t task_head() const { return local_task_id_; }
+
+    int32_t window_size() const { return window_size_; }
+
+    uint64_t heap_available() const {
+        uint64_t tail = heap_tail_;
+        if (heap_top_ >= tail) {
+            uint64_t at_end = heap_size_ - heap_top_;
+            uint64_t at_begin = tail;
+            return at_end > at_begin ? at_end : at_begin;
+        }
+        return tail - heap_top_;
+    }
+
+    uint64_t heap_top() const { return heap_top_; }
+    // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is
+    // the end (next allocation). heap_top - heap_tail == heap_used_bytes().
+    uint64_t heap_tail() const { return heap_tail_; }
+    uint64_t heap_capacity() const { return heap_size_; }
+    uint64_t heap_used_bytes() const {
+        if (heap_size_ == 0) return 0;
+        return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
+    }
+
+private:
+    // --- Task Ring ---
+    PTO2TaskDescriptor *descriptors_ = nullptr;
+    int32_t window_size_ = 0;
+    int32_t window_mask_ = 0;
+    std::atomic<int32_t> *current_index_ptr_ = nullptr;
+    std::atomic<int32_t> *last_alive_ptr_ = nullptr;
+
+    // --- Heap ---
+    void *heap_base_ = nullptr;
+    uint64_t heap_size_ = 0;
+
+    // --- Local state (single-writer, no atomics needed) ---
+    int32_t local_task_id_ = 0;    // Next task ID to allocate
+    uint64_t heap_top_ = 0;        // Current heap allocation pointer
+    uint64_t heap_tail_ = 0;       // Heap reclamation pointer (derived from consumed tasks)
+    int32_t last_alive_seen_ = 0;  // last_task_alive at last heap_tail derivation
+
+    // --- Shared ---
+    std::atomic<int32_t> *error_code_ptr_ = nullptr;
+
+    // =========================================================================
+    // Internal helpers
+    // =========================================================================
+
+    /**
+     * Commit a task slot: bump local counter and publish to shared memory.
+     * Must only be called after space check has passed.
+     */
+    int32_t commit_task() {
+        int32_t task_id = local_task_id_++;
+        current_index_ptr_->store(local_task_id_, std::memory_order_release);
+        return task_id;
+    }
+
+    /**
+     * Derive heap_tail_ from the last consumed task's packed_buffer_end.
+     *
+     * Every task has a valid packed_buffer_end (equal to packed_buffer_base
+     * for zero-size allocations), so the last consumed task always determines
+     * the correct heap_tail — no backward scan needed.
+     */
+    void update_heap_tail(int32_t last_alive) {
+        if (last_alive <= last_alive_seen_) return;
+        last_alive_seen_ = last_alive;
+
+        PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_];
+        uint64_t old_tail = heap_tail_;
+        heap_tail_ =
+            static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
+#if PTO2_PROFILING
+        // Reclaim pointer moves forward monotonically in ring order; a decrease
+        // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at
+        // most one wrap per call). Report it so scope_stats can unroll.
+        if (is_scope_stats_enabled() && heap_tail_ < old_tail) {
+            scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM);
+        }
+#else
+        (void)old_tail;
+#endif
+    }
+
+    /**
+     * Bump the heap pointer for the given allocation size.
+     * Returns the allocated pointer, or nullptr if insufficient space.
+     * When alloc_size == 0, returns current position without advancing.
+     */
+    void *try_bump_heap(uint64_t alloc_size) {
+        uint64_t top = heap_top_;
+        if (alloc_size == 0) {
+            return static_cast<char *>(heap_base_) + top;
+        }
+        uint64_t tail = heap_tail_;
+        void *result;
+
+        if (top >= tail) {
+            uint64_t space_at_end = heap_size_ - top;
+            if (space_at_end >= alloc_size) {
+                result = static_cast<char *>(heap_base_) + top;
+                heap_top_ = top + alloc_size;
+            } else if (tail > alloc_size) {
+                LOG_DEBUG(
+                    "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail,
+                    alloc_size
+                );
+                result = heap_base_;
+                heap_top_ = alloc_size;
+#if PTO2_PROFILING
+                // Allocation pointer just wrapped past heap_size_; report it so
+                // scope_stats can unroll the wrapping offset into a monotonic value.
+                // The collector attributes the wrap to the current scope's ring.
+                if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC);
+#endif
+            } else {
+                LOG_DEBUG(
+                    "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
+                    ", heap_size=%" PRIu64,
+                    top, tail, alloc_size, heap_size_
+                );
+                return nullptr;
+            }
+        } else {
+            if (tail - top > alloc_size) {
+                result = static_cast<char *>(heap_base_) + top;
+                heap_top_ = top + alloc_size;
+            } else {
+                LOG_DEBUG(
+                    "try_bump_heap failed (top<tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
+                    ", free_gap=%" PRIu64,
+                    top, tail, alloc_size, tail - top
+                );
+                return nullptr;
+            }
+        }
+
+        return result;
+    }
+
+#if PTO2_ORCH_PROFILING
+    void record_wait(int spin_count, uint64_t wait_start, bool waiting) {
+        if (waiting) {
+            extern uint64_t g_orch_alloc_wait_cycle;
+            g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start);
+        }
+        {
+            extern uint64_t g_orch_alloc_atomic_count;
+            g_orch_alloc_atomic_count += spin_count + 1;
+        }
+    }
+#endif
+
+    /**
+     * Report deadlock with targeted diagnostics.
+     */
+    void report_deadlock(int32_t requested_output_size, bool heap_blocked) {
+        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        int32_t active_tasks = local_task_id_ - last_alive;
+        uint64_t htail = heap_tail_;
+
+        LOG_ERROR("========================================");
+        if (heap_blocked) {
+            LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!");
+        } else {
+            LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!");
+        }
+        LOG_ERROR("========================================");
+        LOG_ERROR("No progress after %d spins.", PTO2_ALLOC_SPIN_LIMIT);
+        LOG_ERROR(
+            "  Task ring:  current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks,
+            window_size_, 100.0 * active_tasks / window_size_
+        );
+        LOG_ERROR(
+            "  Heap ring:  top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail,
+            heap_size_, heap_available()
+        );
+        if (heap_blocked) {
+            LOG_ERROR("  Requested:  %d bytes", requested_output_size);
+        }
+        LOG_ERROR("Diagnosis:");
+        LOG_ERROR("  last_task_alive is stuck at %d, meaning task %d", last_alive, last_alive);
+        LOG_ERROR("  cannot transition to CONSUMED. Possible causes:");
+        LOG_ERROR("  1. Task %d still executing (subtasks not complete)", last_alive);
+        LOG_ERROR("  2. Task %d fanout not fully released (downstream not done)", last_alive);
+        LOG_ERROR("  3. Scope reference not released (scope_end not called)");
+        LOG_ERROR("  4. Orchestrator blocked here -> can't call scope_end -> circular wait");
+        LOG_ERROR("Solution:");
+        if (heap_blocked) {
+            LOG_ERROR(
+                "  Increase heap size (current: %" PRIu64 ", recommended: %" PRIu64 ")", heap_size_, heap_size_ * 2
+            );
+            LOG_ERROR("  Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_HEAP=<bytes> (e.g. %" PRIu64 ")", heap_size_ * 2);
+        } else {
+            LOG_ERROR("  Increase task window size (current: %d, recommended: %d)", window_size_, active_tasks * 2);
+            LOG_ERROR("  Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2> (e.g. %d)", active_tasks * 2);
+        }
+        LOG_ERROR("========================================");
+        if (error_code_ptr_) {
+            int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
+            error_code_ptr_->store(code, std::memory_order_release);
+        }
+    }
+};
+
+// =============================================================================
+// Fanin Spill Pool
+// =============================================================================
+
+/**
+ * Fanin spill pool structure
+ *
+ * True ring buffer for allocating spilled fanin entries.
+ * Entries are reclaimed when their consumer tasks become CONSUMED.
+ *
+ * Linear counters (top, tail) grow monotonically; the physical index
+ * is obtained via modulo: base[linear_index % capacity].
+ */
+struct PTO2FaninPool {
+    PTO2FaninSpillEntry *base;       // Pool base address
+    int32_t capacity;                // Total number of entries
+    int32_t top;                     // Linear next-allocation counter (starts from 1)
+    int32_t tail;                    // Linear first-alive counter (entries before this are dead)
+    int32_t high_water;              // Peak concurrent usage (top - tail)
+    int32_t reclaim_task_cursor{0};  // Last task id scanned for reclaim on this pool
+
+    std::atomic<int32_t> *error_code_ptr = nullptr;
+
+    void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
+        base = in_base;
+        capacity = in_capacity;
+        top = 1;
+        tail = 1;
+        high_water = 0;
+        reclaim_task_cursor = 0;
+        base[0].slot_state = nullptr;
+        error_code_ptr = in_error_code_ptr;
+    }
+
+    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
+
+    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
+
+    PTO2FaninSpillEntry *alloc() {
+        int32_t used = top - tail;
+        if (used >= capacity) {
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Fanin Spill Pool Overflow!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity);
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
+            LOG_ERROR("========================================");
+            if (error_code_ptr) {
+                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
+            }
+            return nullptr;
+        }
+        int32_t idx = top % capacity;
+        top++;
+        used++;
+        if (used > high_water) high_water = used;
+        return &base[idx];
+    }
+
+    void advance_tail(int32_t new_tail) {
+        if (new_tail > tail) {
+            tail = new_tail;
+        }
+    }
+
+    int32_t used() const { return top - tail; }
+
+    int32_t available() const { return capacity - used(); }
+};
+
+template <typename Fn>
+using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
+
+template <typename Fn>
+using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
+
+template <typename InlineSlots, typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_storage(
+    InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn
+) {
+    using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
+    static_assert(
+        std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>,
+        "fanin callback must return void or bool"
+    );
+
+    if constexpr (std::is_void_v<FaninCallbackResult>) {
+        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
+        for (int32_t i = 0; i < inline_count; i++) {
+            fn(inline_slot_states[i]);
+        }
+
+        int32_t spill_count = fanin_count - inline_count;
+        if (spill_count <= 0) {
+            return;
+        }
+
+        int32_t start_idx = spill_start % spill_pool.capacity;
+        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
+        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
+        for (int32_t i = 0; i < first_count; i++) {
+            fn(first[i].slot_state);
+        }
+
+        int32_t second_count = spill_count - first_count;
+        for (int32_t i = 0; i < second_count; i++) {
+            fn(spill_pool.base[i].slot_state);
+        }
+        return;
+    } else {
+        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
+        for (int32_t i = 0; i < inline_count; i++) {
+            if (!fn(inline_slot_states[i])) {
+                return false;
+            }
+        }
+
+        int32_t spill_count = fanin_count - inline_count;
+        if (spill_count <= 0) {
+            return true;
+        }
+
+        int32_t start_idx = spill_start % spill_pool.capacity;
+        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
+        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
+        for (int32_t i = 0; i < first_count; i++) {
+            if (!fn(first[i].slot_state)) {
+                return false;
+            }
+        }
+
+        int32_t second_count = spill_count - first_count;
+        for (int32_t i = 0; i < second_count; i++) {
+            if (!fn(spill_pool.base[i].slot_state)) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
+template <typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) {
+    return for_each_fanin_storage(
+        payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start,
+        *payload.fanin_spill_pool, static_cast<Fn &&>(fn)
+    );
+}
+
+// =============================================================================
+// Dependency List Pool
+// =============================================================================
+
+/**
+ * Dependency list pool structure
+ *
+ * True ring buffer for allocating linked list entries.
+ * Entries are reclaimed when their producer tasks become CONSUMED,
+ * as tracked by the orchestrator via dep_pool_mark per task.
+ *
+ * Linear counters (top, tail) grow monotonically; the physical index
+ * is obtained via modulo: base[linear_index % capacity].
+ */
+struct PTO2DepListPool {
+    PTO2DepListEntry *base;     // Pool base address
+    int32_t capacity;           // Total number of entries
+    int32_t top;                // Linear next-allocation counter (starts from 1)
+    int32_t tail;               // Linear first-alive counter (entries before this are dead)
+    int32_t high_water;         // Peak concurrent usage (top - tail)
+    int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation
+
+    // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
+    std::atomic<int32_t> *error_code_ptr = nullptr;
+
+    /**
+     *
+     * Initialize dependency list pool
+     * @param base      Pool base address from shared memory
+     * @param capacity  Total number of entries
+     */
+    void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
+        base = in_base;
+        capacity = in_capacity;
+        top = 1;   // Start from 1, 0 means NULL/empty
+        tail = 1;  // Match initial top (no reclaimable entries yet)
+        high_water = 0;
+        last_reclaimed = 0;
+
+        // Initialize entry 0 as NULL marker
+        base[0].slot_state = nullptr;
+        base[0].next = nullptr;
+
+        error_code_ptr = in_error_code_ptr;
+    }
+
+    /**
+     * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
+     * Safe to call multiple times — only advances tail forward.
+     *
+     * @param ring             Ring header (for reading slot dep_pool_mark)
+     * @param sm_last_task_alive Current last_task_alive from shared memory
+     */
+    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
+
+    /**
+     * Ensure dep pool for a specific ring has at least `needed` entries available.
+     * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
+     */
+    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
+
+    /**
+     * Allocate a single entry from the pool (single-thread per pool instance)
+     *
+     * @return Pointer to allocated entry, or nullptr on fatal error
+     */
+    PTO2DepListEntry *alloc() {
+        int32_t used = top - tail;
+        if (used >= capacity) {
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Overflow!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity);
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
+            LOG_ERROR("========================================");
+            if (error_code_ptr) {
+                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
+            }
+            return nullptr;
+        }
+        int32_t idx = top % capacity;
+        top++;
+        used++;
+        if (used > high_water) high_water = used;
+        return &base[idx];
+    }
+
+    /**
+     * Advance the tail pointer, reclaiming dead entries.
+     * Called by the orchestrator based on last_task_alive advancement.
+     */
+    void advance_tail(int32_t new_tail) {
+        if (new_tail > tail) {
+            tail = new_tail;
+        }
+    }
+
+    /**
+     * Prepend a task ID to a dependency list
+     *
+     * O(1) operation: allocates new entry and links to current head.
+     *
+     * @param current_head  Current list head offset (0 = empty list)
+     * @param task_slot     Task slot to prepend
+     * @return New head offset
+     */
+    PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) {
+        PTO2DepListEntry *new_entry = alloc();
+        if (!new_entry) return nullptr;
+        new_entry->slot_state = slot_state;
+        new_entry->next = cur;
+        return new_entry;
+    }
+
+    int32_t used() const { return top - tail; }
+
+    int32_t available() const { return capacity - used(); }
+};
+
+// =============================================================================
+// Ring Set (per-depth aggregate)
+// =============================================================================
+
+/**
+ * Groups a TaskAllocator and DepPool into one per-depth unit.
+ * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth.
+ */
+struct PTO2RingSet {
+    PTO2TaskAllocator task_allocator;
+    PTO2FaninPool fanin_pool;
+};
+
+#endif  // PTO_RING_BUFFER_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp
new file mode 100644
index 000000000..263adec8d
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Main Implementation
+ *
+ * Implements the unified runtime API that combines orchestrator and scheduler.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_runtime2.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "aicpu/device_time.h"
+#include "common/unified_log.h"
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
+// Weak fallback for HOST .so builds (never called, but satisfies linker).
+// The AICPU build links the strong symbol from platform/.../device_time.cpp.
+// Hidden visibility prevents HOST .so from polluting global symbol table.
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+
+// =============================================================================
+// Orchestration Ops Table (function-pointer dispatch for orchestration .so)
+// =============================================================================
+
+static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    return rt->orchestrator.submit_task(mixed_kernels, args);
+}
+
+static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
+    return rt->orchestrator.alloc_tensors(args);
+}
+
+static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
+    return rt->orchestrator.submit_dummy_task(args);
+}
+
+void rt_scope_begin(PTO2Runtime *rt) {
+    PTO2ScopeMode mode = rt->pending_scope_mode;
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
+    rt->orchestrator.begin_scope(mode);
+}
+
+void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); }
+
+void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); }
+
+static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
+
+void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    if (fmt == nullptr || fmt[0] == '\0') {
+        rt->orchestrator.report_fatal(error_code, func, nullptr);
+    } else {
+        char message[1024];
+        vsnprintf(message, sizeof(message), fmt, args);
+        rt->orchestrator.report_fatal(error_code, func, "%s", message);
+    }
+    va_end(args);
+}
+
+// Wait for all producers of this tensor to be safe for data access.
+// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers).
+// For reads: wait until each producer COMPLETED (done writing).
+// For writes: also wait until all consumers done reading
+//   (fanout_refcount >= fanout_count - 1, excluding scope reference).
+// Uses cycle-based timeout (checked every 1024 spins).
+// Returns false on timeout (sets orch.fatal).
+MAYBE_UNINITIALIZED_BEGIN
+static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) {
+    PTO2TaskId owner = tensor.owner_task_id;
+    PTO2OrchestratorState &orch = rt->orchestrator;
+
+    // Segmented wait: collect up to kSegmentCap producer slots, then flush by
+    // spinning on each. When the segment fills, we wait for the accumulated
+    // batch before continuing to gather more. Dedup is per-segment only; a
+    // producer that appears in two segments is waited on twice, which is
+    // idempotent (task_state is monotonic) and only adds one atomic load on
+    // the second encounter.
+    constexpr int kSegmentCap = 64;
+    const PTO2TaskSlotState *seg[kSegmentCap];
+    int seg_count = 0;
+    bool signaled = false;
+    bool failed = false;
+
+    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
+                orch.report_fatal(
+                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
+                    "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed",
+                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
+                );
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = slot.task->task_id.local();
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1) {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
+                orch.report_fatal(
+                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
+                    "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done",
+                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
+                );
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto flush_segment = [&]() {
+        for (int i = 0; i < seg_count; i++) {
+            wait_one_producer(*seg[i]);
+            if (failed) return;
+            if (!wait_for_consumers) continue;
+            wait_one_consumers(*seg[i]);
+            if (failed) return;
+        }
+        seg_count = 0;
+    };
+
+    auto try_push = [&](const PTO2TaskSlotState &s) {
+        for (int j = 0; j < seg_count; j++) {
+            if (seg[j] == &s) return;  // per-segment dedup
+        }
+        if (seg_count == kSegmentCap) {
+            flush_segment();
+            if (failed) return;
+        }
+        seg[seg_count++] = &s;
+        if (!signaled) {
+            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
+            signaled = true;
+        }
+    };
+
+    auto do_wait = [&]() {
+        // Step A: creator retention — read owner directly from tensor metadata
+        if (owner.is_valid()) {
+            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
+            try_push(s);
+            if (failed) return;
+        }
+
+        // Step B: modifier writer lookup (OverlapMap), direct callback
+        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
+            PTO2TaskId pid = entry.producer_task_id;
+            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
+            try_push(s);
+            return !failed;
+        });
+        if (failed) return;
+        flush_segment();
+    };
+
+    do_wait();
+    if (signaled) {
+        orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
+    }
+    return !failed;
+}
+MAYBE_UNINITIALIZED_END
+
+uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+    if (tensor.buffer.addr == 0) {
+        unified_log_error(
+            __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). "
+                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+        );
+        return 0;
+    }
+
+    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) {
+        return 0;
+    }
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
+    uint64_t result = 0;
+    memcpy(&result, ptr, elem_size);
+    return result;
+}
+
+void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) {
+    if (tensor.buffer.addr == 0) {
+        unified_log_error(
+            __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). "
+                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+        );
+        return;
+    }
+
+    // Wait for producer + all consumers before writing (WAW + WAR safety)
+    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) {
+        return;
+    }
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(ptr, &value, elem_size);
+}
+
+// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
+// [ScopeStats] collector. The slot is always present in the struct to keep
+// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
+// .so's null-check skips it.
+#if PTO2_PROFILING
+static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
+#endif
+
+static const PTO2RuntimeOps s_runtime_ops = {
+    .submit_task = submit_task_impl,
+    .scope_begin = rt_scope_begin,
+    .scope_end = rt_scope_end,
+    .orchestration_done = rt_orchestration_done,
+    .is_fatal = is_fatal_impl,
+    .report_fatal = rt_report_fatal,
+    .log_error = unified_log_error,
+    .log_warn = unified_log_warn,
+    .log_debug = unified_log_debug,
+    .log_info_v = unified_log_info_v,
+    .get_tensor_data = get_tensor_data,
+    .set_tensor_data = set_tensor_data,
+    .alloc_tensors = alloc_tensors_impl,
+    .submit_dummy_task = submit_dummy_task_impl,
+#if PTO2_PROFILING
+    .scope_set_site = scope_set_site_impl,
+#else
+    .scope_set_site = nullptr,
+#endif
+};
+
+// =============================================================================
+// Runtime Lifecycle (AICPU-only fixup)
+// =============================================================================
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
+    rt->ops = &s_runtime_ops;
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
+}
+
+void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
+    if (rt) {
+        rt->mode = mode;
+    }
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h
new file mode 100644
index 000000000..85680d8c3
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Main Interface
+ *
+ * This is the main header for the PTO Runtime2 system.
+ * It provides a unified API for task graph construction and execution.
+ *
+ * Key Features:
+ * - Ring buffer based memory management (zero allocation overhead)
+ * - Lazy invalidation TensorMap for dependency discovery
+ * - Scope-based buffer lifecycle management
+ * - Per-task spinlocks for concurrent fanout updates
+ * - Orchestrator-Scheduler decoupling via shared memory
+ *
+ * Usage:
+ *   1. Create runtime: PTO2Runtime create methods
+ *   2. Build task graph in orchestration function:
+ *      - begin_scope() / end_scope()
+ *      - submit_task()
+ *   3. Mark orchestration complete: mark_done()
+ *   4. Destroy runtime
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+#include "pto_submit_types.h"
+#include "pto_shared_memory.h"
+#include "pto_ring_buffer.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_orchestrator.h"
+#include "aicore_completion_mailbox.h"
+
+// =============================================================================
+// Runtime Context
+// =============================================================================
+
+/**
+ * Runtime execution mode
+ */
+enum PTO2RuntimeMode {
+    PTO2_MODE_EXECUTE = 0,    // Execute tasks on workers
+    PTO2_MODE_SIMULATE = 1,   // Simulate task execution with cycle counting
+    PTO2_MODE_GRAPH_ONLY = 2  // Build graph only, no execution
+};
+
+/**
+ * Function-pointer ops table for runtime operations.
+ *
+ * The orchestration .so calls runtime functions through this table
+ * (via pto_orchestration_api.h inline wrappers), so it has zero link
+ * dependencies on runtime .cpp files.
+ */
+typedef struct PTO2Runtime PTO2Runtime;  // forward declare for ops signatures
+
+struct PTO2RuntimeOps {
+    TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    void (*scope_begin)(PTO2Runtime *rt);
+    void (*scope_end)(PTO2Runtime *rt);
+    void (*orchestration_done)(PTO2Runtime *rt);
+    bool (*is_fatal)(PTO2Runtime *rt);
+    void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+    // Logging (populated by runtime, called by orchestration)
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+
+    // Cross-layer data access (orchestration reads/writes tensor values via runtime)
+    // Placed after logging to avoid shifting hot-path field offsets.
+    uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+    void (*set_tensor_data)(
+        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
+    );
+    TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
+    TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
+    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
+    // collector. Always present in the struct to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
+};
+
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
+/**
+ * PTO Runtime2 context
+ *
+ * Contains all state for orchestration and scheduling.
+ * In simulated mode, runs in single process with shared address space.
+ */
+struct PTO2Runtime {
+    // Ops table (first field — used by orchestration .so via function pointers)
+    const PTO2RuntimeOps *ops;
+    PTO2ScopeMode pending_scope_mode;
+
+    // Components
+    PTO2SharedMemoryHandle *sm_handle;
+    PTO2OrchestratorState orchestrator;
+    PTO2SchedulerState scheduler;
+    AICoreCompletionMailbox *aicore_mailbox;
+
+    // GM Heap for output buffers
+    void *gm_heap;
+    uint64_t gm_heap_size;
+    bool gm_heap_owned;  // True if we allocated it
+
+    // Mode
+    PTO2RuntimeMode mode;
+
+    // Statistics
+    int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Populated on host by
+    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
+    // aicpu_executor.cpp.
+    PTO2RuntimeArenaLayout prebuilt_layout;
+};
+
+// =============================================================================
+// Runtime Lifecycle API
+// =============================================================================
+
+/**
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
+ *
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
+ *
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
+);
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+);
+
+/**
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so we never call arena.release()
+ * here — the destructor only forgets sub-structure pointers (idempotent
+ * cleanup).
+ */
+void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
+
+/**
+ * Set execution mode
+ */
+void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode);
+
+// =============================================================================
+// Orchestration API (called by orchestration function)
+// =============================================================================
+
+/**
+ * Begin a new scope
+ *
+ * All tasks submitted within this scope will have their lifetime
+ * bounded by the scope. When scope_end() is called, the scope
+ * releases its reference to all enclosed tasks.
+ */
+void rt_scope_begin(PTO2Runtime *rt);
+
+/**
+ * End current scope
+ *
+ * Releases scope reference for all tasks submitted since scope_begin().
+ * Tasks whose refcount reaches zero will have their buffers released.
+ */
+void rt_scope_end(PTO2Runtime *rt);
+
+/**
+ * Mark orchestration as complete
+ *
+ * Signals that no more tasks will be submitted.
+ */
+void rt_orchestration_done(PTO2Runtime *rt);
+
+/**
+ * Enter fatal state explicitly from orchestration.
+ */
+void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+/**
+ * Cross-layer data access: read a tensor value by waiting for its producer.
+ */
+uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+
+/**
+ * Cross-layer data access: write a value to a tensor at given indices.
+ * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap.
+ * See set_tensor_data in pto_orchestration_api.h for full documentation.
+ */
+void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
+
+/**
+ * Slim config struct exported by orchestration .so via aicpu_orchestration_config().
+ * Shared definition with pto_orchestration_api.h (same layout, guarded).
+ */
+#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
+#define PTO2_ORCHESTRATION_CONFIG_DEFINED
+struct PTO2OrchestrationConfig {
+    int expected_arg_count;
+};
+#endif
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h
new file mode 100644
index 000000000..e4135a366
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Core Type Definitions
+ *
+ * This header defines all fundamental types used by the PTO Runtime2 system:
+ * - Configuration constants
+ * - Worker types and task states
+ * - Tensor regions and task parameters
+ * - Task descriptors with fanin/fanout tracking
+ * - Dependency list entries
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <atomic>
+
+#include "profiling_config.h"
+#include "pto_constants.h"
+#include "pto_runtime_status.h"
+#include "pto2_dispatch_payload.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_submit_types.h"
+#include "pto_task_id.h"
+#include "pto_types.h"
+
+// Spin-wait hint for AICPU threads.  On real hardware the AICPU has dedicated
+// ARM A55 cores — no OS yield is needed, so the hint is a no-op.  In simulation
+// all threads share host CPU cores, so we yield to prevent starvation.
+// This header is also compiled into the Host .so (for struct definitions only),
+// where the hint is never called — the fallback no-op keeps Host builds clean.
+#if __has_include("spin_hint.h")
+#include "spin_hint.h"
+#else
+#define SPIN_WAIT_HINT() ((void)0)
+#endif
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+#include "aicpu/device_time.h"
+#endif
+
+// =============================================================================
+// Configuration Constants
+// =============================================================================
+
+// Task management
+// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
+// Actual window size is passed at runtime to runtime_create_from_sm().
+// Use pto2_task_slot(sched, task_id) for slot calculation.
+#define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+
+// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
+// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+#define PTO2_MAX_RING_DEPTH 4
+
+// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH)
+#define PTO2_HEAP_SIZE (256 * 1024 * 1024)  // 256MB per ring (1GB total)
+#define PTO2_DEP_LIST_POOL_SIZE 16384       // Per-ring dependency list pool entries
+#define PTO2_TENSORMAP_POOL_SIZE (65536)    // TensorMap entry pool
+#define PTO2_TENSORMAP_NUM_BUCKETS 4096     // Power of 2 for fast hash (4096×8B=32KB fits L1)
+
+// Scope management
+#define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
+// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot
+// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot
+// is in flight, no more tasks can ever be pushed regardless of buffer size.
+// scope_tasks_push fatals on overflow rather than growing the arena-owned
+// buffer (which would be UB on the arena's malloc'd backing).
+#define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH)
+
+// Ready queue
+#define PTO2_READY_QUEUE_SIZE 65536  // Per-shape queue size
+
+// Cross-thread early-dispatch work queue (power of two)
+#define PTO2_EARLY_DISPATCH_QUEUE_SIZE 64
+
+// Wiring queue
+#define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size
+
+// Fanin storage
+#define PTO2_FANIN_INLINE_CAP 64
+
+// TensorMap cleanup interval
+#define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
+#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64   // Cleanup every N retired tasks
+
+// get_tensor_data/set_tensor_data spin wait timeout in cycles.
+// ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based).
+constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL;
+
+// =============================================================================
+// Task States
+// =============================================================================
+
+/**
+ * Task state enumeration
+ *
+ * State transitions:
+ *   PENDING -> COMPLETED -> CONSUMED
+ *
+ * The slot stays in PENDING from submit through "ready in queue" and "running
+ * on a worker"; readiness and running-vs-idle are derived from fanin_refcount
+ * and per-core running_slot_state respectively, not from task_state itself.
+ *
+ * Conditions:
+ *   PENDING->COMPLETED:   all subtasks finish (set by scheduler) or task is a
+ *                         hidden alloc completed inline by the orchestrator
+ *   COMPLETED->CONSUMED:  fanout_refcount == fanout_count && state == COMPLETED
+ */
+typedef enum {
+    PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
+    PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
+    PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
+} PTO2TaskState;
+
+/**
+ * Result of a unified task allocation.
+ */
+struct PTO2TaskAllocResult {
+    int32_t task_id;    // Absolute task ID (not wrapped)
+    int32_t slot;       // task_id & (window_size - 1)
+    void *packed_base;  // Heap allocation result (nullptr if failure)
+    void *packed_end;   // packed_base + aligned output_size
+
+    bool failed() const { return task_id < 0; }
+};
+
+struct PTO2OutputLayout {
+    uint64_t offsets[MAX_TENSOR_ARGS] = {};
+    uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {};
+    int32_t total_output_size = 0;
+};
+
+// =============================================================================
+// Dependency List Entry
+// =============================================================================
+
+/**
+ * Fanin spill entry
+ * Stored in the dedicated fanin spill ring buffer.
+ */
+struct PTO2TaskSlotState;  // Forward declaration
+struct PTO2FaninPool;      // Forward declaration
+struct PTO2FaninSpillEntry {
+    PTO2TaskSlotState *slot_state;
+};
+static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(uintptr_t));
+
+/**
+ * Dependency list entry (singly-linked list node)
+ * Stored in DepListPool ring buffer.
+ */
+struct PTO2DepListEntry {
+    PTO2TaskSlotState *slot_state;  // Consumer slot state (direct pointer)
+    PTO2DepListEntry *next;         // next entry
+};
+
+// =============================================================================
+// Task Descriptor
+// =============================================================================
+
+/**
+ * Task descriptor structure (shared memory)
+ *
+ * Stored in the TaskDescriptor ring buffer in shared memory.
+ * Contains static identification and buffer pointers only.
+ * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState.
+ *
+ * Fields set by Orchestrator at submission, read by Scheduler for dispatch.
+ */
+struct PTO2TaskDescriptor {
+    // Mixed-task identification (encodes ring_id in upper 32 bits)
+    PTO2TaskId task_id;  // raw: (ring_id << 32) | local_id
+
+    // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive)
+    int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT];
+
+    // Packed output buffer (all outputs packed into single contiguous buffer)
+    void *packed_buffer_base;  // Start of packed buffer in GM Heap
+    void *packed_buffer_end;   // End of packed buffer (for heap reclamation)
+};
+
+// =============================================================================
+// Per-Slot Scheduling State
+// =============================================================================
+
+/**
+ * Task payload data (cold path - only accessed during orchestration and dispatch)
+ *
+ * Layout: metadata + inline fanin packed in the first 9 cache lines, followed
+ * by bulk tensor and scalar data. Small fanins stay fully inline; larger
+ * fanins spill into a per-ring ring buffer slice.
+ */
+// Speculative early-dispatch claim states for PTO2TaskPayload::spec_state.
+enum PTO2SpecState : uint8_t {
+    PTO2_SPEC_NONE = 0,       // not pre-staged
+    PTO2_SPEC_STAGING = 1,    // Hook 1 claimed it; staging in progress
+    PTO2_SPEC_STAGED = 2,     // staged on a core, gated; staged_* fields valid
+    PTO2_SPEC_DISPATCHED = 3  // routed via the normal dispatch path (no pre-stage)
+};
+
+// A pre-staged consumer occupies one core per gated subtask block. WHICH cores
+// it occupies is recorded as a bitmask (staged_core_mask, 1 bit per global
+// core_id); the completion-path release iterates the set bits and rings each
+// core's doorbell from the scheduler's per-core doorbell table. Bounded by the
+// chip's core count (RUNTIME_MAX_WORKER = 72; no two-level pre-dispatch means
+// gated cores in flight <= core count), NOT by block_num — so a wide SPMD
+// consumer can pre-stage all its idle cores. 2 words = 128 bits >= 72.
+inline constexpr int PTO2_SPEC_CORE_MASK_WORDS = 2;
+
+struct PTO2TaskPayload {
+    // === Cache lines 0-8 (576B) — metadata + inline fanin ===
+    int32_t tensor_count{0};
+    int32_t scalar_count{0};
+    int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
+    int32_t fanin_spill_start{0};   // Linear start index in fanin spill pool (0 = no spill)
+    PTO2FaninPool *fanin_spill_pool{nullptr};
+    PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
+    // Speculative early-dispatch metadata (AICPU-side only). Ordered by descending
+    // alignment (8B mask, 4B fanin, then 1B flags) so the block packs with no
+    // internal padding. Kept here after the fanin array (not moved up front): on
+    // cache line 8 it shares only with the rarely-touched fanin tail, whereas in
+    // line 0 the spec atomics (written during staging) would false-share with
+    // tensor_count/scalar_count (read by build_payload at dispatch). Fits in the 40B
+    // between the fanin array (offset 536) and the 64B-aligned tensors[] (offset
+    // 576), so sizeof and tensors[] are unchanged.
+    //
+    // Bitmask of global core_ids this consumer is pre-staged (gated) on. Set with
+    // atomic fetch_or by concurrent stagers; read by release. (Re)initialized in
+    // PTO2TaskPayload::init before the slot can be staged again.
+    std::atomic<uint64_t> staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]{};
+    // Early-dispatch CANDIDATE detection (event-driven, dual of fanin_refcount):
+    // seeded at wiring with producers already complete, then a flagged producer's
+    // DISPATCH bumps each consumer's dispatch_fanin. dispatch_fanin ==
+    // fanin_actual_count  <=>  every producer is flagged-and-dispatched or was
+    // pre-completed  =>  this task is an early-dispatch candidate (push early_dispatch_queue).
+    std::atomic<int32_t> dispatch_fanin{0};  // CONSUMER side: flagged-dispatched + pre-completed producers
+    bool allow_early_resolve{false};         // codegen hint copied from Arg in PTO2TaskPayload::init
+    // Lock-free claim state shared by the stagers (Hook 1, possibly several AICPU
+    // threads concurrently) and the completion-path release: 0=NONE, 1=STAGING,
+    // 3=DISPATCHED (2=STAGED is unused now). STAGING is the STABLE gated state —
+    // many threads stage blocks concurrently while it holds, each claiming a block
+    // via the atomic next_block_idx and OR-ing its cores into staged_core_mask.
+    // Release does STAGING->DISPATCHED then rings the mask; a thread that stages a
+    // block AFTER release flipped DISPATCHED rings that block's doorbell itself
+    // (self-ring), so no doorbell is ever missed.
+    std::atomic<uint8_t> spec_state{0};
+    std::atomic<uint8_t> dispatch_propagated{0};  // PRODUCER side: once-guard for fanout propagation
+    std::atomic<uint8_t> spec_chain_active{0};    // inherited early-dispatch flag (auto-chain past codegen flag)
+    uint8_t spec_chain_depth{0};                  // auto-chain depth; inherited = parent+1, capped
+    // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) ===
+    Tensor tensors[MAX_TENSOR_ARGS];
+    // === Cache lines 73-74 (128B) — scalars ===
+    uint64_t scalars[MAX_SCALAR_ARGS];
+
+    // Layout verification (size checks that don't need offsetof).
+    static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines");
+    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)");
+
+    /**
+     * Prefetch (for write) the regions init() is about to fill so the stores land
+     * in warm cache. tensor_count/scalar_count come from the Arg — the payload's
+     * own counts are not set until init(). Warms the early-dispatch spec block at
+     * offset 536 (cache line 8) too. A member fn lowers to the same prefetch
+     * instructions as a free function (`this` is just a register), no cache impact.
+     */
+    void prefetch(int32_t tensor_count, int32_t scalar_count) const {
+        for (int32_t i = 0; i < tensor_count; i++) {
+            __builtin_prefetch(&tensors[i], 1, 3);
+            __builtin_prefetch(reinterpret_cast<const char *>(&tensors[i]) + 64, 1, 3);
+        }
+        for (int32_t i = 0; i < scalar_count; i += 8) {
+            __builtin_prefetch(&scalars[i], 1, 3);
+        }
+        __builtin_prefetch(this, 1, 3);
+        __builtin_prefetch(reinterpret_cast<const char *>(this) + 64, 1, 3);
+        __builtin_prefetch(reinterpret_cast<const char *>(this) + 128, 1, 3);
+        __builtin_prefetch(reinterpret_cast<const char *>(this) + 512, 1, 3);  // spec fields (cache line 8)
+    }
+
+    /**
+     * Initialize payload: copy tensors, store scalars.
+     *
+     * For each param slot, the tensor source is determined by TensorArgType:
+     * - OUTPUT -> use materialized_outputs.output_ptr(out_idx++)
+     * - INPUT / INOUT -> use refs[i].tensor
+     *
+     * @param args                Task arguments (tensors + scalars)
+     * @param result  Materialized output tensors (from TensorCreateInfo path)
+     */
+    void init(
+        const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout
+    ) {
+        tensor_count = args.tensor_count();
+        scalar_count = args.scalar_count();
+
+        // int32_t out_idx = 0;
+        for (int32_t i = 0; i < args.tensor_count(); i++) {
+            if (args.tag(i) != TensorArgType::OUTPUT) {
+                tensors[i].copy(args.tensor(i).ref());
+            } else {
+                init_tensor_from_create_info(
+                    tensors[i], args.tensor(i).create_info(),
+                    reinterpret_cast<void *>(reinterpret_cast<char *>(alloc_result.packed_base) + layout.offsets[i]),
+                    layout.buffer_sizes[i]
+                );
+                tensors[i].owner_task_id = result.task_id();
+                result.materialize_output(tensors[i]);
+            }
+        }
+        // Round up to cache line boundary. Both arrays are 128B so no overrun.
+        // Eliminates branches; extra bytes within the same CL have zero additional cost.
+        memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64));
+
+        // Speculative early-dispatch metadata — the single init point for these
+        // fields. reset_for_reuse MUST NOT touch the payload (it runs on the
+        // scheduler's advance-ring path and would pull this cold cache line across
+        // structures); prepare_task only allocates/binds. prefetch() warms this
+        // line (offset 512) so these writes land in warm cache.
+        //
+        // spec_state / staged_core_mask / dispatch_fanin / spec_chain_* are all
+        // CONSUMER-side: a task with allow_early_resolve == false still has them
+        // touched when one of ITS producers is flagged (propagate_dispatch_fanin
+        // bumps dispatch_fanin and may CAS spec_state / set the auto-chain flag on
+        // any consumer, independent of the consumer's own hint). So they MUST be
+        // zeroed here unconditionally — no per-task allow_early_resolve gating.
+        allow_early_resolve = args.allow_early_resolve();
+        spec_state.store(PTO2_SPEC_NONE, std::memory_order_relaxed);
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
+            staged_core_mask[w].store(0, std::memory_order_relaxed);
+        dispatch_fanin.store(0, std::memory_order_relaxed);
+        dispatch_propagated.store(0, std::memory_order_relaxed);
+        spec_chain_active.store(0, std::memory_order_relaxed);
+        spec_chain_depth = 0;
+    }
+};
+
+// PTO2TaskPayload layout verification (offsetof requires complete type).
+static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift");
+static_assert(
+    offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata"
+);
+static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)");
+static_assert(
+    offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor),
+    "scalars must immediately follow tensors"
+);
+static_assert(
+    sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t),
+    "PTO2TaskPayload size must stay on the baseline cache-line footprint"
+);
+
+/**
+ * Per-task slot scheduling state (scheduler-private, NOT in shared memory)
+ *
+ * Consolidates all hot-path scheduling fields into a single cache-friendly
+ * structure (32 bytes = half a cache line). Accessing any field of a task's
+ * slot state brings all related fields into the same cache line.
+ *
+ * Concurrency notes:
+ * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
+ * - fanin_count set once at submission, read-only after (hot path for ready check)
+ * - task_state, fanin_refcount, fanout_refcount updated atomically
+ */
+struct alignas(64) PTO2TaskSlotState {
+    // Fanout lock + list (accessed together under lock in on_task_complete)
+    std::atomic<int32_t> fanout_lock;  // Per-task spinlock (0=unlocked, 1=locked)
+    int32_t fanout_count;              // 1 (owning scope) + number of consumers
+
+    PTO2DepListEntry *fanout_head;  // Pointer to first fanout entry (nullptr = empty)
+
+    // Task state (completion, consumed check, ready check)
+    std::atomic<PTO2TaskState> task_state;  // PENDING/COMPLETED/CONSUMED
+
+    // Fanin (accessed together in release_fanin_and_check_ready)
+    std::atomic<int32_t> fanin_refcount;  // Dynamic: counts completed producers
+    int32_t fanin_count;                  // Number of producer dependencies (set once by wiring)
+
+    // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
+    std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
+
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
+    PTO2TaskPayload *payload;
+    PTO2TaskDescriptor *task;
+
+    // --- Set per-submit (depend on task inputs) ---
+    ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
+    uint8_t ring_id;         // Ring layer (immutable after init)
+    // Set by any subtask FIN that pushed deferred-completion CONDITIONs to
+    // the runtime mailbox; read by the last subtask FIN to decide whether
+    // the task needs MPSC-deferred completion or can complete inline on this
+    // thread. Carved out of the otherwise-padding byte between ring_id and
+    // dep_pool_mark to keep PTO2TaskSlotState at 64 bytes. The write is
+    // sequenced before on_subtask_complete's acq_rel fetch_add and the read
+    // after, so all earlier subtasks' writes are visible to the last subtask.
+    std::atomic<bool> any_subtask_deferred{false};
+    uint8_t _async_pad{0};
+    int32_t dep_pool_mark{0};  // Dep pool top after wiring (thread-0-only)
+
+    std::atomic<int16_t> completed_subtasks{0};  // Each core completion increments by 1
+    int16_t total_required_subtasks{0};          // = logical_block_num * popcount(active_mask)
+    int16_t logical_block_num{1};                // Total logical blocks (set by orchestrator)
+    // Next block to dispatch. Atomic so concurrent speculative stagers can each
+    // claim a distinct block via CAS; normal dispatch (ready-queue serialized)
+    // uses plain relaxed load/store. The two phases never overlap in time (staging
+    // happens before release; normal dispatch of the remainder happens after).
+    std::atomic<int16_t> next_block_idx{0};
+
+    /**
+     * Bind the slot-invariant ring id. Called once per slot during
+     * RingSchedState::init(); ring_id never changes across reuses.
+     */
+    void bind_ring(uint8_t rid) { ring_id = rid; }
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
+        payload = p;
+        task = t;
+    }
+
+    /**
+     * Reset dynamic scheduling fields for slot reuse.
+     * Called by advance_ring_pointers() after a slot transitions to CONSUMED
+     * and last_task_alive advances past it, but before sync_to_sm() publishes
+     * the new last_task_alive to the orchestrator.
+     *
+     * Skips payload, task, ring_id (immutable, bound once at init).
+     * Skips task_state: left as CONSUMED so that wait_for_tensor_ready()
+     * callers holding stale owner_task_id still observe a completed state.
+     * task_state is set to PENDING by the orchestrator when it reuses the slot.
+     */
+    void reset_for_reuse() {
+        fanout_lock.store(0, std::memory_order_relaxed);
+        fanout_count = 1;
+        fanout_head = nullptr;
+        fanin_refcount.store(0, std::memory_order_relaxed);
+        fanout_refcount.store(0, std::memory_order_relaxed);
+        completed_subtasks.store(0, std::memory_order_relaxed);
+        next_block_idx.store(0, std::memory_order_relaxed);
+        any_subtask_deferred.store(false, std::memory_order_relaxed);
+        // Note: payload spec fields (spec_state / staged_core_mask / dispatch_fanin /
+        // spec_chain_*) are NOT reset here — this method skips the payload by
+        // contract. They are (re)initialized in PTO2TaskPayload::init on every
+        // submit, before the slot becomes visible to the scheduler.
+    }
+
+    // === Per-task fanout spinlock ===
+    //
+    // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST
+    // be held whenever reading or writing fanout_head / fanout_count, because
+    // the orchestrator adds consumers concurrently with the scheduler
+    // traversing the list after task completion.
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+
+        for (;;) {
+            while (fanout_lock.load(std::memory_order_acquire) != 0) {
+                contended = true;
+                atomic_ops++;
+                SPIN_WAIT_HINT();
+            }
+            int32_t expected = 0;
+            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                atomic_ops++;
+                atomic_count += atomic_ops;
+                if (contended) {
+                    wait_cycle += (get_sys_cnt_aicpu() - t0);
+                }
+                return;
+            }
+            contended = true;
+            atomic_ops++;
+        }
+    }
+#endif
+
+    void lock_fanout() {
+        for (;;) {
+            while (fanout_lock.load(std::memory_order_acquire) != 0) {
+                SPIN_WAIT_HINT();
+            }
+            int32_t expected = 0;
+            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                return;
+            }
+        }
+    }
+
+    void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); }
+};
+
+static_assert(sizeof(PTO2TaskSlotState) == 64);
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h
new file mode 100644
index 000000000..cad5cec36
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Layout
+ *
+ * Defines the shared memory structure for Orchestrator-Scheduler communication.
+ *
+ * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1):
+ *   +---------------------------+
+ *   | SharedMemoryHeader        |  (per-ring flow control + sync)
+ *   +---------------------------+
+ *   | Ring 0: TaskDescriptor[]  |
+ *   | Ring 0: TaskPayload[]     |
+ *   | Ring 0: TaskSlotState[]   |
+ *   +---------------------------+
+ *   | Ring 1: TaskDescriptor[]  |
+ *   | Ring 1: TaskPayload[]     |
+ *   | Ring 1: TaskSlotState[]   |
+ *   +---------------------------+
+ *   | ...                       |
+ *   +---------------------------+
+ *
+ * Design principles:
+ * - Only data needed for Orchestrator<->Scheduler communication is here
+ * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory
+ * - Flow control via atomic counters/flags (no locks needed for single-word R/W)
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+
+// =============================================================================
+// Shared Memory Header
+// =============================================================================
+
+struct PTO2SharedMemoryHandle;
+
+/**
+ * Per-ring flow control state in shared memory.
+ * Written/read by Orchestrator and Scheduler for synchronization.
+ */
+struct alignas(64) PTO2RingFlowControl {
+    // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
+    alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
+
+    // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
+    alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
+
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
+    void init() {
+        current_task_index.store(0, std::memory_order_relaxed);
+        last_task_alive.store(0, std::memory_order_relaxed);
+    }
+
+    bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const;
+};
+
+static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)");
+
+/**
+ * Per-ring shared memory header section.
+ *
+ * Groups flow-control, layout info, and per-ring data pointers for a single ring.
+ * Pointers are host-side only (set by setup_pointers, invalid on device).
+ */
+struct alignas(64) PTO2SharedMemoryRingHeader {
+    PTO2RingFlowControl fc;
+
+    // Layout metadata (set once at init)
+    uint64_t task_window_size;
+    int32_t task_window_mask;
+    uint64_t heap_size;
+    uint64_t task_descriptors_offset;  // Offset from SM base, in bytes
+
+    // Per-ring data pointers (host-side, set by setup_pointers)
+    PTO2TaskDescriptor *task_descriptors;
+    PTO2TaskPayload *task_payloads;
+    PTO2TaskSlotState *slot_states;
+
+    int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; }
+
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; }
+
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) {
+        return task_descriptors[get_slot_by_task_id(local_id)];
+    }
+
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; }
+
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; }
+
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; }
+
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) {
+        return slot_states[get_slot_by_task_id(local_id)];
+    }
+};
+
+/**
+ * Shared memory header structure
+ *
+ * Contains per-ring flow control and global layout information.
+ */
+struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
+    // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) ===
+    PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
+
+    // === GLOBAL FIELDS ===
+    std::atomic<int32_t> orchestrator_done;  // Flag: orchestration complete
+
+    // Total shared memory size (for validation)
+    uint64_t total_size;
+
+    // Graph output for copy-back (set by orchestrator when using packed buffer)
+    // Host finalize copies from this address instead of dev_ptr when non-zero
+    std::atomic<uint64_t> graph_output_ptr;   // Address where final output was written (packed buffer)
+    std::atomic<uint64_t> graph_output_size;  // Size in bytes
+
+    // === ERROR REPORTING ===
+
+    // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host)
+    // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host.
+    std::atomic<int32_t> orch_error_code;
+
+    // Scheduler error state (Scheduler → Host, independent of orchestrator)
+    // Written by scheduler threads on timeout; read by orchestrator and host.
+    std::atomic<uint32_t> sched_error_bitmap;  // Bit X set = thread X had error
+    std::atomic<int32_t> sched_error_code;     // Last scheduler error code (last-writer-wins)
+    std::atomic<int32_t> sched_error_thread;   // Thread index of last error writer
+};
+
+static_assert(
+    (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096),
+    "PTO2SharedMemoryHeader should be reasonably sized"
+);
+
+// =============================================================================
+// Shared Memory Handle
+// =============================================================================
+
+/**
+ * Handle for shared memory lifecycle management (create/destroy).
+ * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly.
+ */
+struct PTO2SharedMemoryHandle {
+    void *sm_base;     // Base address of shared memory
+    uint64_t sm_size;  // Total size of shared memory
+
+    PTO2SharedMemoryHeader *header;
+
+    // Ownership flag
+    bool is_owner;  // True if this handle allocated the memory
+
+    // === Static helpers ===
+
+    static uint64_t calculate_size(uint64_t task_window_size);
+    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+
+    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
+    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
+    // arena is otherwise empty (the call performs the single commit). All
+    // memory is owned by the arena — caller must not call destroy().
+    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);
+
+    // === Instance methods ===
+
+    // In-place init for caller-provided wrapper storage (e.g. a region carved
+    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
+    // init_header. Returns false when `sm_size` is too small for the requested
+    // `task_window_size`.
+    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);
+    bool init_per_ring(
+        void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    void destroy();
+    void print_layout();
+    bool validate();
+
+private:
+    void init_header(uint64_t task_window_size, uint64_t heap_size);
+    void init_header_per_ring(
+        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+    );
+    void setup_pointers(uint64_t task_window_size);
+    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+};
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
+// own setup_pointers), so values are guaranteed consistent across sides.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
+        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
+// to compute ring `ring_id`'s task_descriptors device address. Accepts a
+// per-ring window-size array so the helper's signature mirrors
+// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
+// disagree with the SM layout when (hypothetically) ring sizes diverge.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(
+    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
+) noexcept {
+    assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
+    char *p = static_cast<char *>(sm_dev_base);
+    p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < ring_id; r++) {
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h
new file mode 100644
index 000000000..21c77fce2
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Submit Types - Shared submit-contract definitions
+ *
+ * Header-only definitions shared by orchestration-facing and runtime-facing
+ * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h).
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+inline constexpr int32_t INVALID_KERNEL_ID = -1;
+
+/**
+ * Subtask slot count: AIC, AIV0, AIV1
+ */
+inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
+
+/**
+ * Subtask slot indices
+ */
+enum class PTO2SubtaskSlot : uint8_t {
+    AIC = 0,
+    AIV0 = 1,
+    AIV1 = 2,
+};
+
+/**
+ * Subtask mask bits (for ActiveMask)
+ */
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);         // 0x1
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);        // 0x2
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);        // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3);  // 0x8: all blocks must launch atomically
+
+/**
+ * Resource shape — classifies a MixedKernels into one of 3 scheduling buckets.
+ *
+ * Multi-subtask tasks (2+ active slots) are all scheduled as MIX. Dispatch
+ * chooses one cluster, then uses active_mask to decide which cores in that
+ * cluster must be placed together: all used cores idle -> running placement;
+ * all used cores already running with free pending slots -> pending placement;
+ * mixed used-core state is rejected and retried later.
+ *
+ * DUMMY is a synthetic shape for dep-only tasks (no AICore dispatch). Tasks
+ * with an empty core_mask route to a dedicated DUMMY ready queue and are
+ * completed inline by the scheduler dispatch loop, bypassing core allocation.
+ */
+enum class PTO2ResourceShape : uint8_t {
+    AIC = 0,    // Single AIC
+    AIV = 1,    // Single AIV
+    MIX = 2,    // Full cluster (dispatch uses active_mask)
+    DUMMY = 3,  // Dependency-only (no AICore dispatch)
+};
+
+// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not
+// allocate a per-shape ready_queue entry / local buffer — it lives in a
+// dedicated queue inside PTO2SchedulerState.
+inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
+
+/**
+ * Bitmask of active subtask slots + flags, sizeof == 1.
+ */
+class ActiveMask {
+public:
+    constexpr ActiveMask() = default;
+    constexpr explicit ActiveMask(uint8_t raw) :
+        raw_(raw) {}
+
+    uint8_t raw() const { return raw_; }
+
+    bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0; }
+
+    uint8_t core_mask() const { return raw_ & 0x07u; }
+
+    bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; }
+
+    PTO2ResourceShape to_shape() const {
+        uint8_t cmask = core_mask();
+        if (cmask == 0) return PTO2ResourceShape::DUMMY;
+        int bit_count = __builtin_popcount(cmask);
+        if (bit_count >= 2) return PTO2ResourceShape::MIX;
+        if (cmask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
+        return PTO2ResourceShape::AIV;
+    }
+
+    void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; }
+
+    bool operator==(ActiveMask other) const { return raw_ == other.raw_; }
+    bool operator!=(ActiveMask other) const { return raw_ != other.raw_; }
+
+    ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); }
+    ActiveMask &operator|=(ActiveMask other) {
+        raw_ |= other.raw_;
+        return *this;
+    }
+
+    ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); }
+
+    bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; }
+
+    explicit operator bool() const { return raw_ != 0; }
+
+private:
+    uint8_t raw_{0};
+};
+
+static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte");
+
+/**
+ * Mixed-task submit contract.
+ *
+ * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive).
+ * At least one slot must be valid.
+ */
+struct MixedKernels {
+    int32_t aic_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
+
+    ActiveMask to_active_mask() const {
+        uint8_t mask = 0;
+        if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
+        if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
+        if (aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1;
+        return ActiveMask(mask);
+    }
+};
+
+/**
+ * SPMD launch parameters carried inside Arg.
+ *
+ * Controls how many logical blocks (SPMD dimension) a single task
+ * is expanded into at dispatch time.  Each block receives a unique
+ * block_idx in [0, block_num) via the per-dispatch LocalContext.
+ */
+class PTO2LaunchSpec {
+public:
+    constexpr PTO2LaunchSpec() = default;
+
+    int16_t block_num() const { return block_num_; }
+    void set_block_num(int16_t n) { block_num_ = n; }
+
+    bool require_sync_start() const { return require_sync_start_; }
+    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+
+private:
+    int16_t block_num_{1};
+    bool require_sync_start_{false};
+};
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h
new file mode 100644
index 000000000..30017fadd
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - TensorMap Interface
+ *
+ * TensorMap provides producer lookup for dependency discovery:
+ * - Maps Tensor -> producer task ID
+ * - Used by pto_submit_task() to find dependencies
+ *
+ * Key design features:
+ * 1. Ring buffer pool for entries (no malloc/free)
+ * 2. Lazy invalidation (entries become stale when producer retires)
+ * 3. Per-task per-ring entry tracking for efficient cleanup
+ * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions
+ *
+ * Hash table with chaining:
+ * - buckets[] array of head offsets
+ * - Entries linked via next_in_bucket
+ * - Insert at head (newest first) for sorted chains
+ *
+ * CRITICAL: Hash only by base_ptr
+ * ==============================
+ * For overlap detection to work, ALL sub-regions of the same base tensor
+ * MUST be in the SAME hash bucket. This allows lookup to compare all
+ * potentially overlapping regions.
+ *
+ * Overlap detection: Two regions create a dependency if:
+ *   1. Same base_ptr (raw tensor pointer)
+ *   2. Byte ranges [offset, offset+size) intersect
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "common.h"
+#include "profiling_config.h"
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+#include "tensor.h"
+
+// Overlap geometry types. Relocated here from tensor.h: they are used only by
+// the runtime's overlap-detection / dependency machinery, not by the
+// wire/host-facing Tensor definition.
+enum class OverlapStatus {
+    NO_OVERLAP,
+    COVERED,
+    OTHER,
+};
+
+struct Segment {
+    uint64_t begin;
+    uint64_t end;
+
+    bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; }
+    bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; }
+};
+
+/**
+ * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
+ * region offsets returned by DeviceArena::reserve() so init_from_layout()
+ * can fetch the matching pointers after the arena is committed.
+ *
+ * All offsets are relative to the arena's base.
+ */
+struct PTO2TensorMapLayout {
+    size_t off_buckets;
+    size_t off_entry_pool;
+    size_t off_free_entry_list;
+    size_t off_task_entry_heads[PTO2_MAX_RING_DEPTH];
+    int32_t num_buckets;
+    int32_t pool_size;
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+};
+
+// TensorMap Lookup Profiling (must precede inline lookup/insert methods).
+#if PTO2_TENSORMAP_PROFILING
+extern uint64_t g_lookup_chain_total;
+extern uint64_t g_lookup_count;
+extern int32_t g_lookup_chain_max;
+extern uint64_t g_lookup_overlap_checks;
+extern uint64_t g_lookup_overlap_hits;
+extern uint64_t g_insert_count;
+#endif
+
+// =============================================================================
+// TensorMap Structure
+// =============================================================================
+
+/**
+ * TensorMap entry structure — cache-line optimized for lookup
+ *
+ * Cache line 1 (64B, lookup hot path) mirrors Tensor cache line 1 byte-for-byte
+ * from byte 16 onward, so that `memcpy(this, &tensor, 64)` populates everything
+ * we need for overlap checks. Bytes [0, 16) carry entry-only fields (hash
+ * bucket head + chain pointer) that overlap Tensor::buffer (addr in [0, 8) is
+ * the hash key, size in [8, 16) is unused by the entry — we repurpose it for
+ * `next_in_bucket`).
+ *
+ *   buffer_addr / next_in_bucket / producer_task_id   — chain traversal + match
+ *   start_offset                                       — overlap byte range begin
+ *   version, ndims, dtype, manual_dep, is_contiguous   — overlap fast path
+ *   shapes[5]                                          — overlap comparison (line 1)
+ *
+ * Cache line 2 (64B, slow-path / non-contiguous overlap):
+ *   prev_in_bucket / next_in_task / prev_in_task       — chain manipulation
+ *   bucket_index                                       — bookkeeping
+ *   extent_elem_cache                                  — overlap byte range end
+ *   strides[5]                                          — reserved for L2 overlap (PR-2)
+ *
+ * When both entry & probe are `is_contiguous && start_offset == 0`, the overlap
+ * check derives `extent_elem = prod(shapes)` from cache line 1 alone.
+ *
+ * Entry size: 128B (2 cache lines), matches Tensor.
+ */
+struct alignas(64) PTO2TensorMapEntry {
+    // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 ===
+    uint64_t buffer_addr;                // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
+    PTO2TensorMapEntry *next_in_bucket;  // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
+    PTO2TaskId producer_task_id;         // 8B [16,24):  mirrors Tensor::owner_task_id slot
+    uint64_t start_offset;               // 8B [24,32):  mirrors Tensor::start_offset (element offset)
+    int32_t version;                     // 4B [32,36):  mirrors Tensor::version
+    uint32_t ndims;                      // 4B [36,40):  mirrors Tensor::ndims
+    DataType dtype;                      // 1B [40,41):  mirrors Tensor::dtype
+    bool manual_dep;                     // 1B [41,42):  mirrors Tensor::manual_dep
+    bool is_contiguous;                  // 1B [42,43):  mirrors Tensor::is_contiguous
+    uint8_t __padding1__;                // 1B [43,44):  mirrors Tensor padding
+    uint32_t shapes[MAX_TENSOR_DIMS];    // 20B [44,64): mirrors Tensor::shapes
+
+    // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data ===
+    PTO2TensorMapEntry *prev_in_bucket;  // 8B [64, 72)
+    PTO2TensorMapEntry *next_in_task;    // 8B [72, 80)
+    PTO2TensorMapEntry *prev_in_task;    // 8B [80, 88)
+    int32_t bucket_index;                // 4B [88, 92): -1 when unlinked
+    uint32_t __padding2__;               // 4B [92, 96)
+    uint64_t extent_elem_cache;          // 8B [96,104): non-contiguous extent (mirrors Tensor)
+    uint32_t strides[MAX_TENSOR_DIMS];   // 20B [104,124): element strides, mirrors Tensor::strides
+    uint8_t __padding3__[4];             // 4B [124,128)
+
+    /**
+     * Copy overlap-relevant fields from a Tensor into this entry.
+     *
+     * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)),
+     * producer_task_id, start_offset, version, ndims, dtype, manual_dep,
+     * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in
+     * the source and gets written into next_in_bucket; that's harmless
+     * because link_entry() overwrites next_in_bucket immediately after.
+     *
+     * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when
+     * the source is canonically contiguous (is_contiguous && start_offset==0),
+     * so the producer Tensor's cache line 2 stays cold during insert. Only
+     * non-contiguous producers pay one extra line 2 read.
+     */
+    void copy_from_tensor(const Tensor &tensor) {
+        memcpy(this, &tensor, 64);
+        if (tensor.is_contiguous && tensor.start_offset == 0) {
+            uint64_t numel = 1;
+            for (uint32_t i = 0; i < tensor.ndims; i++)
+                numel *= tensor.shapes[i];
+            extent_elem_cache = numel;
+            uint32_t s = 1;
+            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--) {
+                strides[i] = s;
+                s *= tensor.shapes[i];
+            }
+        } else {
+            extent_elem_cache = tensor.extent_elem_cache;
+            for (uint32_t i = 0; i < tensor.ndims; i++) {
+                strides[i] = tensor.strides[i];
+            }
+        }
+    }
+
+    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) {
+        memcpy(this, &tensor_create_info, 64);
+        buffer_addr = addr;
+        // Create-info outputs are always contiguous with start_offset = 0;
+        // extent_elem = prod(shapes); stride is row-major.
+        uint64_t numel = 1;
+        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) {
+            numel *= tensor_create_info.shapes[i];
+        }
+        extent_elem_cache = numel;
+        uint32_t s = 1;
+        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--) {
+            strides[i] = s;
+            s *= tensor_create_info.shapes[i];
+        }
+    }
+
+    /**
+     * Effective element extent of this entry.
+     * Contiguous-aligned views compute it from shapes alone (line 1 hit only);
+     * non-contiguous views read the cached value from line 2.
+     */
+    uint64_t effective_extent_elem() const {
+        if (is_contiguous) {
+            uint64_t n = 1;
+            for (uint32_t i = 0; i < ndims; i++)
+                n *= shapes[i];
+            return n;
+        }
+        return extent_elem_cache;
+    }
+
+    /**
+     * Check overlap between input tensor and this entry (the producer output).
+     *
+     * Three-level cascade:
+     *   L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP.
+     *   L2 — O(ndims) hyper-rectangle precise check, eligible only when both
+     *        sides share the same canonical row-major axis layout (same
+     *        dtype/ndims/strides[], stride descends as integer multiples,
+     *        start_offset decomposes cleanly under the reference shape).
+     *        Yields NO_OVERLAP / COVERED / OTHER per-dim.
+     *   L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice
+     *        with step, etc): conservative OTHER. Exact enumeration via
+     *        contiguous-segment merge is scheduled for a follow-up.
+     *
+     * COVERED is returned when `input` completely contains `entry` per-dim
+     * — dep_compute uses this to retire the now-redundant entry.
+     */
+    OverlapStatus check_overlap(const Tensor &input) const {
+        debug_assert(input.buffer.addr == buffer_addr);
+        debug_assert(input.version >= version);
+        if (input.version > version) {
+            return OverlapStatus::OTHER;
+        }
+
+        // -------- L1: byte-range intersection (O(1) fast reject) --------
+        const uint64_t in_begin = input.start_offset;
+        const uint64_t in_end = input.start_offset + input.extent_elem();
+        const uint64_t ent_begin = start_offset;
+        const uint64_t ent_end = start_offset + effective_extent_elem();
+        Segment in_range_bytes{in_begin, in_end};
+        Segment ent_range_bytes{ent_begin, ent_end};
+        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) {
+            return OverlapStatus::NO_OVERLAP;
+        }
+
+        // -------- L2 prereqs: same axis layout? --------
+        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) {
+            return OverlapStatus::OTHER;
+        }
+        for (uint32_t i = 0; i < ndims; i++) {
+            if (input.strides[i] != strides[i]) return OverlapStatus::OTHER;
+        }
+        // strides[ndims-1] must be 1 and strides[i-1] must be an integer
+        // multiple of strides[i] for the row-major reference-shape derivation
+        // below to hold. This rejects slice-with-step (strides[d] != prev factor)
+        // and any view chain that scrambles the axis order. (strides is
+        // uint32_t with the > 0 invariant enforced at construction, so no
+        // sign check needed.)
+        if (strides[ndims - 1] != 1) return OverlapStatus::OTHER;
+        for (uint32_t i = 1; i < ndims; i++) {
+            if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER;
+        }
+
+        // Derive reference shape A from stride. By construction stride is
+        // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So
+        //   A[i] = strides[i-1] / strides[i]   for i >= 1
+        //   A[0] = (buffer.size / dtype_bytes) / strides[0]
+        // input.buffer.size is the storage size; entry shares the same buffer
+        // (debug-asserted by buffer.addr equality at the top), so we read it
+        // from input rather than mirroring buffer.size into the entry.
+        //
+        // Note on buffer padding: runtime allocators may over-allocate
+        // `buffer.size` (cache-line / 1024B alignment, ring-buffer slot
+        // rounding, etc). When that happens, `numel_storage` is larger than
+        // the true logical extent and `ref_shapes[0]` ends up generously over-
+        // sized. This is intentional: ref_shapes is only used as an *upper
+        // bound* in the in-bounds checks below; the actual overlap test (the
+        // per-dim line-segment intersection on the real start_offset /
+        // shapes / stride further down) is unaffected. A larger-than-truth
+        // ref_shapes[0] simply makes the bounds check more permissive — it
+        // can never cause a false NO_OVERLAP nor a false COVERED.
+        uint32_t ref_shapes[MAX_TENSOR_DIMS] = {};
+        for (uint32_t i = 1; i < ndims; i++) {
+            ref_shapes[i] = strides[i - 1] / strides[i];
+        }
+        const uint64_t elem_size = get_element_size(dtype);
+        if (elem_size == 0) return OverlapStatus::OTHER;
+        const uint64_t numel_storage = input.buffer.size / elem_size;
+        const uint32_t stride0 = strides[0];  // > 0 by Tensor invariant
+        if (numel_storage % stride0 != 0) return OverlapStatus::OTHER;
+        ref_shapes[0] = static_cast<uint32_t>(numel_storage / stride0);
+
+        // Decompose start_offset into row-major multi-dim offsets. By the same
+        // relation strides[i] = prod(ref_shapes[i+1..]) so dividing by strides[i]
+        // (no inner loop) yields each axis offset directly.
+        uint32_t in_offsets[MAX_TENSOR_DIMS] = {};
+        uint32_t ent_offsets[MAX_TENSOR_DIMS] = {};
+        uint64_t in_remain = input.start_offset;
+        uint64_t ent_remain = start_offset;
+        for (uint32_t i = 0; i < ndims; i++) {
+            const uint32_t s = strides[i];
+            in_offsets[i] = static_cast<uint32_t>(in_remain / s);
+            ent_offsets[i] = static_cast<uint32_t>(ent_remain / s);
+            in_remain %= s;
+            ent_remain %= s;
+        }
+        if (in_remain != 0 || ent_remain != 0) return OverlapStatus::OTHER;
+
+        // Validate that each side fits within ref_shapes (defense in depth —
+        // a well-formed view always satisfies this).
+        for (uint32_t i = 0; i < ndims; i++) {
+            if (static_cast<uint64_t>(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
+            if (static_cast<uint64_t>(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
+        }
+
+        // -------- L2 core: per-dim line-segment intersection --------
+        bool input_contains_entry = true;
+        for (uint32_t i = 0; i < ndims; i++) {
+            Segment in_seg{in_offsets[i], static_cast<uint64_t>(in_offsets[i]) + input.shapes[i]};
+            Segment ent_seg{ent_offsets[i], static_cast<uint64_t>(ent_offsets[i]) + shapes[i]};
+            if (!in_seg.line_segment_intersection(ent_seg)) {
+                return OverlapStatus::NO_OVERLAP;
+            }
+            if (!in_seg.contains(ent_seg)) {
+                input_contains_entry = false;
+            }
+        }
+        return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER;
+    }
+};
+
+static_assert(sizeof(PTO2TensorMapEntry) == 128, "TensorMapEntry must be exactly 2 cache lines (128 bytes)");
+static_assert(offsetof(PTO2TensorMapEntry, buffer_addr) == offsetof(Tensor, buffer.addr));
+static_assert(offsetof(PTO2TensorMapEntry, producer_task_id) == offsetof(Tensor, owner_task_id));
+static_assert(offsetof(PTO2TensorMapEntry, start_offset) == offsetof(Tensor, start_offset));
+static_assert(offsetof(PTO2TensorMapEntry, version) == offsetof(Tensor, version));
+static_assert(offsetof(PTO2TensorMapEntry, ndims) == offsetof(Tensor, ndims));
+static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype));
+static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes));
+static_assert(
+    offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"
+);
+
+// =============================================================================
+// TensorMap Lookup Chain Length Statistics (compile-time toggle)
+// =============================================================================
+
+/**
+ * TensorMap structure
+ *
+ * Hash table with ring buffer entry pool and lazy invalidation.
+ */
+struct PTO2TensorMap {
+    // Hash table buckets (fixed size, power of 2)
+    PTO2TensorMapEntry **buckets;  // Array of offsets into entry_pool (-1 = empty)
+    int32_t num_buckets;           // Must be power of 2 for fast modulo
+
+    // Entry pool as ring buffer
+    PTO2TensorMapEntry *entry_pool;        // Ring buffer of entries
+    PTO2TensorMapEntry **free_entry_list;  // free entry ids
+    int32_t pool_size;                     // Total pool capacity
+    int32_t next_entry_idx;                // id when next entry insert
+    int32_t free_num;                      // free entry number in entry pool
+
+    // Per-ring per-task entry tracking (for efficient bucket cleanup)
+    // Indexed by [ring_id][local_id & (task_window_sizes[ring_id] - 1)]
+    PTO2TensorMapEntry **task_entry_heads[PTO2_MAX_RING_DEPTH];
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];  // Per-ring task window size (for slot masking)
+
+    // Per-ring validity threshold (for lazy invalidation)
+    int32_t last_task_alives[PTO2_MAX_RING_DEPTH];  // Cached from shared memory per ring
+
+    // Per-ring cleanup progress (for periodic cleanup_retired)
+    int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
+
+    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
+        return task_local_id & (task_window_sizes[ring_id] - 1);
+    }
+
+    // Accessors read by scope_stats_collector. Declared unconditionally so the
+    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
+    // setter symbols must export for host dlsym; the probe call sites that use
+    // these accessors stay gated by PTO2_PROFILING).
+    int32_t current_used() const { return next_entry_idx - free_num; }
+    int32_t pool_capacity() const { return pool_size; }
+
+    // new_entry only allocates memory, does not assign attributes
+    PTO2TensorMapEntry *new_entry() {
+        if (free_num > 0) {
+            PTO2TensorMapEntry *res = free_entry_list[--free_num];
+            debug_assert(res->bucket_index == -1);
+            return res;
+        }
+        always_assert(next_entry_idx < pool_size);
+        PTO2TensorMapEntry *res = &entry_pool[next_entry_idx++];
+        debug_assert(res->bucket_index == -1);
+        return res;
+    }
+
+    void free_entry(PTO2TensorMapEntry &entry) {
+        always_assert(entry.bucket_index != -1);  // must still be in a bucket
+
+        // Update predecessor's next pointer (O(1) via prev_in_bucket)
+        if (entry.prev_in_bucket == nullptr) {
+            // Entry is the head of its bucket chain, update bucket head
+            // Must compute hash BEFORE clearing tensor
+            buckets[entry.bucket_index] = entry.next_in_bucket;
+        } else {
+            entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket;
+        }
+
+        // Update successor's prev pointer
+        if (entry.next_in_bucket != nullptr) {
+            entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
+        }
+
+        free_entry_list[free_num++] = &entry;
+        entry.bucket_index = -1;
+        entry.next_in_bucket = nullptr;
+        entry.prev_in_bucket = nullptr;
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+    }
+
+    // =============================================================================
+    // TensorMap API
+    // =============================================================================
+
+    /**
+     * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring
+     * task_entry_heads) on the supplied arena. Records the resulting offsets in
+     * the returned layout descriptor. Must be called before the arena is
+     * committed.
+     */
+    static PTO2TensorMapLayout reserve_layout(
+        DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    /**
+     * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS,
+     * PTO2_TENSORMAP_POOL_SIZE).
+     */
+    static PTO2TensorMapLayout
+    reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+
+    /**
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
+     */
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
+     */
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Tear down state. Does not free memory — the arena owns the backing
+     * buffer. Pointers are set to nullptr so accidental reuse traps.
+     */
+    void destroy();
+
+    /**
+     * Update validity threshold from shared memory
+     * Called periodically to refresh the lazy invalidation threshold.
+     *
+     * @param last_task_alive  Current value from shared memory
+     */
+    void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; }
+
+    /**
+     * Lookup producer for a tensor region
+     *
+     * Searches the hash table for matching regions and invokes the callback
+     * for each overlapping valid entry.
+     * Stale entries from different rings are skipped (not truncated).
+     *
+     * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should
+     * return true to continue iteration, false to stop early. It is safe for
+     * the callback to call remove_entry() on the current entry: next_in_bucket
+     * is latched before invocation.
+     *
+     * @param tensor    Tensor to look up
+     * @param on_match  Callback invoked for each overlapping entry
+     */
+    template <typename Fn>
+    void lookup(const Tensor &tensor, Fn &&on_match) {
+        uint32_t bucket_index = hash(tensor.buffer.addr);
+        PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
+
+#if PTO2_TENSORMAP_PROFILING
+        g_lookup_count++;
+        int32_t chain_len = 0;
+#endif
+
+        while (cur_entry != nullptr) {
+            PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
+
+#if PTO2_TENSORMAP_PROFILING
+            chain_len++;
+#endif
+            // Skip stale entries (no chain truncation — entries from different
+            // rings can be interleaved, so a stale entry from one ring does NOT
+            // imply subsequent entries from other rings are also stale)
+            if (!entry_valid(*cur_entry)) {
+                cur_entry = next_entry;
+                continue;
+            }
+
+            // Entry is valid - check if regions OVERLAP (not just exact match)
+            // Since we hash only by base_ptr, all entries in this bucket have
+            // potential to overlap. We must check actual byte-range overlap.
+            if (tensor.buffer.addr == cur_entry->buffer_addr) {
+#if PTO2_TENSORMAP_PROFILING
+                g_lookup_overlap_checks++;
+#endif
+                auto overlap_status = cur_entry->check_overlap(tensor);
+                if (overlap_status != OverlapStatus::NO_OVERLAP) {
+#if PTO2_TENSORMAP_PROFILING
+                    g_lookup_overlap_hits++;
+#endif
+                    if (!on_match(*cur_entry, overlap_status)) {
+#if PTO2_TENSORMAP_PROFILING
+                        g_lookup_chain_total += chain_len;
+                        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
+#endif
+                        return;
+                    }
+                }
+            }
+
+            // Move to next entry
+            cur_entry = next_entry;
+        }
+#if PTO2_TENSORMAP_PROFILING
+        g_lookup_chain_total += chain_len;
+        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
+#endif
+    }
+
+    /**
+     * Insert a new entry (called when task produces output)
+     *
+     * Allocates from ring buffer pool, may overwrite stale entries.
+     * Inserts at head of hash bucket chain (maintains task_id ordering).
+     *
+     * @param tensor            Tensor produced
+     * @param producer_task_id  Task ID of producer
+     */
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id) {
+        PTO2TensorMapEntry *entry = new_entry();
+        entry->copy_from_tensor(tensor);
+        link_entry(entry, tensor.buffer.addr, producer_task_id);
+    }
+
+    /**
+     * Cleanup stale entries for retired tasks
+     *
+     * Called periodically by Orchestrator when last_task_alive advances.
+     * Removes entries from bucket chains for tasks in [old, new) range.
+     *
+     * @param old_last_task_alive  Previous threshold
+     * @param new_last_task_alive  New threshold
+     */
+    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) {
+        // Iterate through retired tasks on this ring and remove their entries
+        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) {
+            int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
+            PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot];
+
+            while (cur_entry != nullptr) {
+                PTO2TensorMapEntry *next_entry = cur_entry->next_in_task;  // Save before clearing
+                // Only remove if this entry belongs to the retiring task
+                // (slot may have been reused by a newer task)
+                debug_assert(
+                    cur_entry->producer_task_id ==
+                    PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id))
+                );
+                free_entry(*cur_entry);
+                cur_entry = next_entry;
+            }
+
+            // Clear task's entry head (slot will be reused by local_id + task_window_sizes[ring_id])
+            task_entry_heads[ring_id][task_slot] = nullptr;
+        }
+    }
+
+    // =============================================================================
+    // Internal Helpers (exposed for testing)
+    // =============================================================================
+
+    /**
+     * Compute hash for tensor addr
+     *
+     * Multiplicative hash using the golden-ratio constant.  Multiplication
+     * mixes ALL input bits into the high bits of the product, so aligned
+     * addresses (low bits all-zero) still distribute evenly.  We extract
+     * the top log2(num_buckets) bits which carry the most entropy.
+     */
+    uint32_t hash(uint64_t key) {
+        key *= 0x9E3779B97F4A7C15ULL;
+        return static_cast<uint32_t>(key >> (64 - __builtin_ctz(num_buckets)));
+    }
+
+    /**
+     * Link an initialized entry into bucket and task chains.
+     */
+    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
+#if PTO2_TENSORMAP_PROFILING
+        g_insert_count++;
+#endif
+        uint32_t bucket_index = hash(addr);
+        auto ring_id = producer_task_id.ring();
+        auto local_id = producer_task_id.local();
+        int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
+
+        entry->producer_task_id = producer_task_id;
+
+        // Insert at head of hash bucket
+        entry->bucket_index = bucket_index;
+        entry->next_in_bucket = buckets[bucket_index];
+        if (entry->next_in_bucket != nullptr) {
+            entry->next_in_bucket->prev_in_bucket = entry;
+        }
+        buckets[bucket_index] = entry;
+        entry->prev_in_bucket = nullptr;
+
+        // Link to task's entry list
+        entry->next_in_task = task_entry_heads[ring_id][task_slot];
+        entry->prev_in_task = nullptr;
+        if (entry->next_in_task != nullptr) {
+            entry->next_in_task->prev_in_task = entry;
+        }
+        task_entry_heads[ring_id][task_slot] = entry;
+    }
+
+    /**
+     * Check if entry is valid (producer has not retired)
+     */
+    bool entry_valid(const PTO2TensorMapEntry &entry) const {
+        return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
+    }
+
+    void remove_entry(PTO2TensorMapEntry &entry) {
+        remove_from_task(entry);
+        free_entry(entry);
+    }
+
+    /**
+     * Remove entry from its task chain (O(1) with prev pointer)
+     * Called during pool wrap-around to unlink reused entries.
+     */
+    void remove_from_task(PTO2TensorMapEntry &entry) {
+        always_assert(entry.bucket_index != -1);  // must still be in a bucket
+        // Update predecessor's next pointer (O(1) via prev_in_task)
+        if (entry.prev_in_task == nullptr) {
+            // Entry is the head of its task chain, update task_entry_heads
+            int32_t ring_id = entry.producer_task_id.ring();
+            int32_t local_id = static_cast<int32_t>(entry.producer_task_id.local());
+            int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
+            task_entry_heads[ring_id][task_slot] = entry.next_in_task;
+        } else {
+            entry.prev_in_task->next_in_task = entry.next_in_task;
+        }
+
+        // Update successor's prev pointer
+        if (entry.next_in_task != nullptr) {
+            entry.next_in_task->prev_in_task = entry.prev_in_task;
+        }
+
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+    }
+
+    // =============================================================================
+    // Debug Utilities
+    // =============================================================================
+
+    /**
+     * Print TensorMap statistics
+     */
+    void print_stats();
+
+    /**
+     * Get count of valid entries
+     */
+    int32_t valid_count();
+
+    // =============================================================================
+    // TensorMap Synchronization
+    // =============================================================================
+
+    /**
+     * Sync TensorMap validity threshold from shared memory
+     *
+     * Called periodically to refresh the lazy invalidation threshold.
+     * Also triggers cleanup if threshold has advanced significantly.
+     */
+    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive);
+};
+
+#if PTO2_TENSORMAP_PROFILING
+struct PTO2TensorMapProfilingData {
+    uint64_t lookup_chain_total;
+    uint64_t lookup_count;
+    int32_t lookup_chain_max;
+    uint64_t overlap_checks;
+    uint64_t overlap_hits;
+    uint64_t insert_count;
+};
+
+PTO2TensorMapProfilingData pto2_tensormap_get_profiling();
+#endif
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h
new file mode 100644
index 000000000..65d593a49
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Orchestration Build Graph Types - Data structures for orchestration runtime extensions
+ *
+ * Standalone header defining orchestration-specific types for:
+ * - TaskOutputTensors: Return value from submit containing materialized output Tensors
+ * - Arg: Aggregated argument container for pto_submit_task API
+ *
+ * Tensor descriptor types (Tensor, PTOBufferHandle, TensorCreateInfo) are
+ * defined in tensor.h.
+ *
+ * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h
+ * without type conflicts (Handshake, TensorPair, HostApi).
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+#include "aicpu/dump_arg_selection.h"
+#include "data_type.h"
+#include "profiling_config.h"
+#include "pto_submit_types.h"
+#include "task_args.h"
+#include "tensor.h"
+#include "tensor_create_info.h"  // runtime-only TensorCreateInfo + materialization helpers
+
+typedef enum {
+    ASYNC_ENGINE_SDMA = 0,
+    ASYNC_ENGINE_ROCE = 1,
+    ASYNC_ENGINE_URMA = 2,
+    ASYNC_ENGINE_CCU = 3,
+    NUM_ASYNC_ENGINES = 4,
+} AsyncEngine;
+
+enum class CompletionType : int32_t {
+    COUNTER = 0,
+};
+
+// =============================================================================
+// Task Output Tensors (return value from submit)
+// =============================================================================
+
+enum class PTO2ScopeMode : uint8_t {
+    AUTO = 0,
+    MANUAL = 1,
+};
+
+/**
+ * TaskOutputTensors — returned by submit, holds materialized output Tensors.
+ *
+ * Only runtime-created outputs are stored here, indexed in add_output order.
+ *
+ * The underlying storage is uninitialized; only output_count elements are
+ * valid after submit returns.  This avoids default-constructing Tensor[]
+ * on the hot path (2 KB of unnecessary zeroing per submit).
+ *
+ * Users must hold a named TaskOutputTensors variable and borrow via get_ref();
+ * binding get_ref() on an rvalue is compile-time rejected to prevent dangling.
+ *
+ * LIFETIME — single-scope only:
+ *   Internally this class stores pointers into the submitting task's payload
+ *   (PTO2TaskPayload::tensors[]), which lives in a ring-buffer slot. After
+ *   scope_end the slot becomes eligible for reuse, and a later submit will
+ *   overwrite the same Tensor storage in place. Therefore the
+ *   TaskOutputTensors instance, the const Tensor& returned by get_ref(), and
+ *   any pointer derived from either MUST NOT outlive the PTO2_SCOPE in which
+ *   submit was called — do not move/copy them to outer-scope variables, do
+ *   not capture references by std::reference_wrapper or raw pointers across
+ *   scope boundaries.
+ *
+ *   This invariant is intentionally not enforced at runtime: a reused slot
+ *   simply carries a different but valid owner_task_id, so checking
+ *   owner_task_id cannot distinguish "still mine" from "silently aliased to
+ *   an unrelated task". Misuse manifests as a wrong-tensor read with no
+ *   diagnostic.
+ */
+class TaskOutputTensors {
+public:
+    TaskOutputTensors() :
+        task_id_(PTO2TaskId::invalid()),
+        output_count_(0) {}
+
+    bool empty() const { return output_count_ == 0; }
+    uint32_t size() const { return output_count_; }
+
+    /// Borrow a materialized output tensor by index (lvalue only).
+    const Tensor &get_ref(uint32_t index) const & {
+        always_assert(index < output_count_);
+        return *tensors_[index];
+    }
+    const Tensor &get_ref(uint32_t index) const && = delete;
+
+    /// Runtime-internal: append one materialized output Tensor.
+    void materialize_output(const Tensor &tensor) {
+        always_assert(output_count_ < MAX_TENSOR_ARGS);
+        tensors_[output_count_++] = &tensor;
+    }
+
+    void set_task_id(PTO2TaskId id) { task_id_ = id; }
+
+    PTO2TaskId task_id() const { return task_id_; }
+
+private:
+    PTO2TaskId task_id_;
+    uint32_t output_count_;
+    // Upper bound: a task cannot have more outputs than total tensor args
+    // (every OUTPUT/OUTPUT_EXISTING slot is one of the Arg's tensor slots).
+    const Tensor *tensors_[MAX_TENSOR_ARGS];
+};
+
+using TaskSubmitResult = TaskOutputTensors;
+
+// =============================================================================
+// Argument Types (for pto_submit_task API)
+// =============================================================================
+
+// TensorArgType is defined in tensor.h (included via task_args.h above)
+
+/**
+ * Tagged reference to a single Arg slot — either a Tensor* or a
+ * TensorCreateInfo*. The active member is determined by the slot's
+ * TensorArgType tag (OUTPUT → create_info, else → tensor pointer).
+ *
+ * Minimal-permission: the union members are private; content is set only via
+ * operator=(ptr) and read via ref()/create_info(). Copy/move are deleted — a
+ * TensorRef is written in place inside an Arg's slot array, never passed by
+ * value.
+ */
+class TensorRef {
+    union {
+        const Tensor *ptr_;
+        const TensorCreateInfo *create_info_;
+    };
+
+public:
+    TensorRef() :
+        ptr_(nullptr) {}
+    TensorRef(const TensorRef &) = delete;
+    TensorRef(TensorRef &&) = delete;
+    TensorRef &operator=(const TensorRef &) = delete;
+    TensorRef &operator=(TensorRef &&) = delete;
+
+    TensorRef &operator=(const Tensor *p) {
+        ptr_ = p;
+        return *this;
+    }
+    TensorRef &operator=(const TensorCreateInfo *ci) {
+        create_info_ = ci;
+        return *this;
+    }
+
+    const Tensor &ref() const { return *ptr_; }
+    const TensorCreateInfo &create_info() const { return *create_info_; }
+    bool refers_to(const Tensor *t) const { return ptr_ == t; }
+    bool refers_to(const TensorCreateInfo *ci) const { return create_info_ == ci; }
+};
+
+/**
+ * Aggregated argument container for pto_submit_task
+ *
+ * Inherits storage from TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>.
+ * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo)
+ * discriminated by the corresponding tag().
+ * Tensors are dispatched first in kernel args, followed by scalars.
+ *
+ * Output arguments follow two distinct ownership models:
+ * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer
+ *   and materializes a new Tensor, returned via TaskOutputTensors.
+ * - add_inout(const Tensor&): INOUT — reuses an existing Tensor as the write target.
+ *
+ * Example:
+ *   Tensor x = make_tensor_external(dev_a, shapes, 2);
+ *   TensorCreateInfo ci(shapes, 2);  // must outlive submit
+ *   Arg args;
+ *   args.add_input(x);
+ *   args.add_output(ci);
+ *   args.add_scalar(some_value);
+ *   TaskOutputTensors outs = rt_submit_aic_task(kernel_id, args);
+ *   const Tensor& y = outs.get_ref(0);
+ */
+template <size_t MaxT, size_t MaxS>
+struct Arg : TaskArgsTpl<TensorRef, uint64_t, MaxT, MaxS, TensorArgType> {
+    using Base = TaskArgsTpl<TensorRef, uint64_t, MaxT, MaxS, TensorArgType>;
+    // Make dependent-base members visible for unqualified use (two-phase lookup
+    // does not search a dependent base in a class template).
+    using Base::scalar_count_;
+    using Base::scalars_;
+    using Base::tags_;
+    using Base::tensor_count_;
+    using Base::tensors_;
+
+    // Minimal-permission: an Arg is built in place and consumed by reference;
+    // it is never copied/moved (it is a large object, and its TensorRef slots
+    // are non-copyable by design).
+    Arg() = default;
+    Arg(const Arg &) = delete;
+    Arg(Arg &&) = delete;
+    Arg &operator=(const Arg &) = delete;
+    Arg &operator=(Arg &&) = delete;
+
+    bool has_error{false};
+    const char *error_msg{nullptr};
+    PTO2LaunchSpec launch_spec;  // SPMD launch parameters (block_num, etc.)
+
+    // Speculative early-dispatch hint (codegen-author set, off by default). When
+    // true, the scheduler may stage this task on an idle core before its producer
+    // finishes, gating execution on the DATA_MAIN_BASE doorbell — only safe when
+    // the author knows the task's data dependencies allow it. Read in-process by
+    // the runtime; never crosses the wire format.
+    bool allow_early_resolve_{false};
+    void set_allow_early_resolve(bool v = true) { allow_early_resolve_ = v; }
+    bool allow_early_resolve() const { return allow_early_resolve_; }
+
+    void clear() {
+        Base::clear();
+#if PTO2_PROFILING
+        dump_arg_selection_.clear();
+#endif
+        explicit_deps_ = nullptr;
+        explicit_dep_count_ = 0;
+        allow_early_resolve_ = false;
+    }
+
+    void reset() {
+        clear();
+        has_error = false;
+        error_msg = nullptr;
+    }
+
+    void set_error(const char *msg) {
+        if (!has_error) {
+            has_error = true;
+            error_msg = msg;
+        }
+    }
+
+    template <typename... Args>
+    void dump(Args &&...args) {
+#if PTO2_PROFILING
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "dump: temporaries are not allowed — pass tensors/scalars already added to this Arg"
+        );
+        static_assert(
+            (is_supported_dump_arg_v<Args> && ...),
+            "dump: all arguments must be Tensor, TensorCreateInfo, or scalar lvalues"
+        );
+        if constexpr (sizeof...(Args) == 0) {
+            mark_all_dump_args();
+        } else {
+            (mark_dump_arg(args), ...);
+        }
+#else
+        ((void)args, ...);
+#endif
+    }
+
+#if PTO2_PROFILING
+    uint64_t dump_arg_mask() const { return dump_arg_selection_.dump_arg_mask(); }
+    uint64_t dump_arg_index_ambiguous_mask() const { return dump_arg_selection_.dump_arg_index_ambiguous_mask(); }
+#else
+    uint64_t dump_arg_mask() const { return 0; }
+    uint64_t dump_arg_index_ambiguous_mask() const { return 0; }
+#endif
+
+    template <typename... Args>
+    void add_input(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) {
+            return;
+        }
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INPUT, tensor_count_++), ...);
+    }
+
+    /// Batch add outputs — all Tensor or all TensorCreateInfo:
+    ///   add_output(ci1, ci2)         — runtime allocates buffers (OUTPUT)
+    ///   add_output(t1, t2)           — write-only existing tensors (OUTPUT_EXISTING)
+    template <typename... Args>
+    void add_output(Args &&...args) {
+        assert_add_tensor_args<true, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) return;
+        if constexpr ((std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...)) {
+            ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT, tensor_count_++), ...);
+        } else {
+            ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING, tensor_count_++),
+             ...);
+        }
+    }
+
+    template <typename... Args>
+    void add_inout(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) {
+            return;
+        }
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INOUT, tensor_count_++), ...);
+    }
+
+    /// No-dependency existing tensor: skips OverlapMap lookup, depends on creator only.
+    template <typename... Args>
+    void add_no_dep(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) return;
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::NO_DEP, tensor_count_++), ...);
+    }
+
+    /**
+     * Attach an explicit dependency array. The Arg stores (ptr, count) without
+     * copying — the caller's array must outlive the submit (same lifetime rule
+     * as add_input/add_output, which also store pointers).
+     *
+     * count == 0 is a valid "set empty" — it clears any previously stored deps
+     * and returns. This lets callers that build the dep set conditionally pass
+     * the result through unguarded, including in the no-dep branch:
+     *   PTO2TaskId deps[3];
+     *   uint32_t n = 0;
+     *   if (have_prev) deps[n++] = prev;
+     *   if (is_last)   deps[n++] = alloc;
+     *   args.set_dependencies(deps, n);    // safe even if n == 0
+     *
+     * For count > 0, the call is single-shot: a second non-empty call after
+     * deps are already set will fail with set_error(). Use count == 0 first
+     * if you need to re-set.
+     */
+    void set_dependencies(const PTO2TaskId *deps, uint32_t count) {
+        if (count == 0) {
+            explicit_deps_ = nullptr;
+            explicit_dep_count_ = 0;
+            return;
+        }
+        if (deps == nullptr) {
+            set_error("set_dependencies: deps must not be null when count > 0");
+            return;
+        }
+        if (explicit_deps_ != nullptr) {
+            set_error("set_dependencies: may be called at most once per Arg");
+            return;
+        }
+        explicit_deps_ = deps;
+        explicit_dep_count_ = count;
+    }
+
+    uint32_t explicit_dep_count() const { return explicit_dep_count_; }
+
+    PTO2TaskId explicit_dep(uint32_t index) const {
+        always_assert(index < explicit_dep_count_);
+        return explicit_deps_[index];
+    }
+
+    const PTO2TaskId *explicit_deps_data() const { return explicit_deps_; }
+
+    /**
+     * Add scalar values. Types are deduced per argument; each value is
+     * bit-cast to uint64_t for storage. Mixed types are allowed:
+     *
+     *   args.add_scalar(uint64_val);                  // single
+     *   args.add_scalar(3.14f, int32_t(42), 7u);     // mixed batch
+     */
+    template <typename... Args>
+    void add_scalar(Args &&...args) {
+        static_assert(sizeof...(Args) >= 1, "add_scalar: at least one argument required");
+        static_assert((is_supported_scalar_arg_v<Args> && ...), "add_scalar: all types must be arithmetic or enum");
+        if (scalar_count_ + sizeof...(Args) > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        (add_scalar_one(std::forward<Args>(args)), ...);
+    }
+
+    void add_scalars(const uint64_t *values, int count) {
+        if (count < 0 || scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t));
+#if PTO2_PROFILING
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
+#endif
+        scalar_count_ += count;
+    }
+
+    /**
+     * Zero-extend int32 bit patterns into uint64 scalar slots.
+     * Negative values are treated as their unsigned 32-bit representation
+     * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF).
+     * Uses NEON to process 4 elements per iteration on aarch64.
+     */
+    void add_scalars_i32(const int32_t *values, int count) {
+        if (count < 0 || scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        uint64_t *dst = &scalars_[scalar_count_];
+#if defined(__aarch64__)
+        int i = 0;
+        for (; i + 4 <= count; i += 4) {
+            uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t *>(values + i));
+            uint64x2_t lo = vmovl_u32(vget_low_u32(v));
+            uint64x2_t hi = vmovl_u32(vget_high_u32(v));
+            vst1q_u64(dst + i, lo);
+            vst1q_u64(dst + i + 2, hi);
+        }
+        for (; i < count; i++) {
+            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
+        }
+#else
+        for (int i = 0; i < count; i++) {
+            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
+        }
+#endif
+#if PTO2_PROFILING
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
+#endif
+        scalar_count_ += count;
+    }
+
+    /**
+     * Copy scalars from another Arg's scalar array.
+     * Useful when multiple tasks share the same scalar data (e.g., block indices).
+     */
+    void copy_scalars_from(const Arg &src, int src_offset, int count) {
+        if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) {
+            set_error("Source scalar range out of bounds in copy_scalars_from");
+            return;
+        }
+        if (scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t));
+#if PTO2_PROFILING
+        dump_arg_selection_.copy_scalar_dtypes_from(src.dump_arg_selection_, scalar_count_, src_offset, count);
+#endif
+        scalar_count_ += count;
+    }
+
+#if PTO2_PROFILING
+    const uint8_t *scalar_dtypes() const { return dump_arg_selection_.scalar_dtypes(); }
+#else
+    const uint8_t *scalar_dtypes() const { return nullptr; }
+#endif
+
+private:
+    // Caller-owned dependency array; lifetime must extend through submit.
+#if PTO2_PROFILING
+    DumpArgSelection dump_arg_selection_;
+#endif
+    const PTO2TaskId *explicit_deps_{nullptr};
+    uint32_t explicit_dep_count_{0};
+#if PTO2_PROFILING
+    template <typename T>
+    static constexpr bool is_supported_dump_arg_v =
+        std::is_same_v<std::decay_t<T>, Tensor> || std::is_same_v<std::decay_t<T>, TensorCreateInfo> ||
+        is_supported_scalar_arg_v<T>;
+#endif
+
+    // Capacity-overflow messages — spell the actual limit (MaxS/MaxT, whatever
+    // the instantiation is) into the text via std::to_string. Built once into a
+    // function-local static so set_error() can hold the const char* safely.
+    static const char *scalar_cap_msg() {
+        static const std::string msg = "Too many scalar args (max " + std::to_string(MaxS) + ")";
+        return msg.c_str();
+    }
+    static const char *tensor_cap_msg() {
+        static const std::string msg = "Too many tensor args (max " + std::to_string(MaxT) + ")";
+        return msg.c_str();
+    }
+
+    template <typename T>
+    void add_scalar_one(T &&value) {
+        scalars_[scalar_count_] = to_u64(value);
+#if PTO2_PROFILING
+        uintptr_t scalar_source_ptr = 0;
+        if constexpr (std::is_lvalue_reference_v<T>) {
+            scalar_source_ptr = reinterpret_cast<uintptr_t>(&value);
+        }
+        dump_arg_selection_.record_scalar_source(
+            scalar_count_, scalar_source_ptr, dtype_of<std::remove_cv_t<std::remove_reference_t<T>>>()
+        );
+#endif
+        scalar_count_++;
+    }
+
+#if PTO2_PROFILING
+    // No-arg dump(): mark every arg already added to this Arg.
+    void mark_all_dump_args() {
+        if (tensor_count_ == 0 && scalar_count_ == 0) {
+            set_error("dump: no arguments added to this Arg");
+            return;
+        }
+        dump_arg_selection_.mark_all(tensor_count_, scalar_count_);
+    }
+
+    void mark_dump_arg(const Tensor &tensor) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].refers_to(&tensor)) {
+                dump_arg_selection_.mark_index(i);
+                return;
+            }
+        }
+        set_error("dump: tensor is not part of this Arg");
+    }
+
+    void mark_dump_arg(const TensorCreateInfo &create_info) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].refers_to(&create_info)) {
+                dump_arg_selection_.mark_index(i);
+                return;
+            }
+        }
+        set_error("dump: TensorCreateInfo is not part of this Arg");
+    }
+
+    template <typename T>
+    std::enable_if_t<is_supported_scalar_arg_v<T>, void> mark_dump_arg(const T &scalar) {
+        uintptr_t ptr = reinterpret_cast<uintptr_t>(&scalar);
+        if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) {
+            return;
+        }
+        set_error("dump: scalar is not part of this Arg");
+    }
+#endif
+
+    // Compile-time validation: arg count, value category (reject temporaries —
+    // a stored &arg would dangle after the call), and element type. Driven
+    // purely by Args, with no runtime state.
+    template <bool is_output, typename... Args>
+    static void assert_add_tensor_args() {
+        static_assert(sizeof...(Args) >= 1, "at least one argument required");
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "temporaries are not allowed — stored pointers would dangle after the call"
+        );
+        if constexpr (is_output) {
+            static_assert(
+                (std::is_same_v<std::decay_t<Args>, Tensor> && ...) ||
+                    (std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...),
+                "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)"
+            );
+        } else {
+            static_assert((std::is_same_v<std::decay_t<Args>, Tensor> && ...), "all arguments must be Tensor");
+        }
+    }
+
+    // Runtime validation: tensor-before-scalar ordering + slot capacity. Records
+    // an error and returns false on violation.
+    bool check_add_tensor_capacity(int32_t count) {
+        if (scalar_count_ != 0) {
+            set_error(
+                "add_input/add_output/add_inout called after add_scalar: "
+                "all tensors must be added before any scalars"
+            );
+            return false;
+        }
+        if (tensor_count_ + count > static_cast<int32_t>(MaxT)) {
+            set_error(tensor_cap_msg());
+            return false;
+        }
+        return true;
+    }
+};
+
+// =============================================================================
+// Task-args layer aliases
+// =============================================================================
+//
+// L0TaskArgs — core-level container used to build and submit tasks inside
+//   orchestration (small, stack-friendly).
+using L0TaskArgs = Arg<MAX_TENSOR_ARGS, MAX_SCALAR_ARGS>;
+
+// L2TaskArgs — chip-level entry-arg holding the orchestration entry's
+// already-allocated inputs (capacity matches ChipStorageTaskArgs).
+// aicpu_orchestration_entry/config receive a const L2TaskArgs&.
+struct L2TaskArgs : Arg<CHIP_MAX_TENSOR_ARGS, CHIP_MAX_SCALAR_ARGS> {
+    // Build from the executor's ChipStorageTaskArgs: each input becomes a
+    // TensorRef pointing at src's Tensor, so `src` must outlive this (on the
+    // executor path src is runtime->orch_args_storage_, alive for the whole run).
+    void create_from_chip_args(const ChipStorageTaskArgs &src) {
+        reset();
+        for (int32_t i = 0; i < src.tensor_count(); ++i) {
+            // Entry inputs are external submit-time tensors; the entry binds them
+            // by const Tensor& (replacing from_tensor_arg's old version/manual_dep
+            // reset), so this invariant is what keeps that binding behavior-preserving.
+            const Tensor &t = src.tensor(i);
+            debug_assert(!t.manual_dep && t.version == 0);
+            add_input(t);
+        }
+        for (int32_t i = 0; i < src.scalar_count(); ++i) {
+            add_scalar(src.scalar(i));
+        }
+    }
+};
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h
new file mode 100644
index 000000000..00f1cd852
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Class - Device Execution and Handshake Control
+ *
+ * This class manages device-side execution through AICPU-AICore handshake
+ * protocol. Task graph construction is handled by PTO2Runtime; this class
+ * only handles:
+ * - Handshake buffers for AICPU-AICore communication
+ * - Execution parameters (block_dim, aicpu_thread_num)
+ * - Tensor pair management for host-device memory tracking
+ * - Device orchestration state (gm_sm_ptr_, orch_args_)
+ * - Function address mapping (func_id_to_addr_)
+ *
+ * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler.
+ * At dispatch time, build_payload() copies tensor pointers and scalars from
+ * the task payload into the per-core args[], populates SPMD context, then
+ * signals AICore via DATA_MAIN_BASE.
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>   // for fprintf, printf
+#include <string.h>  // for memset
+
+#include <vector>
+
+#include "common/core_type.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/platform_config.h"
+#include "pto2_dispatch_payload.h"
+#include "task_args.h"
+
+// =============================================================================
+// Configuration Macros
+// =============================================================================
+
+#define RUNTIME_MAX_ARGS 128
+#define RUNTIME_MAX_WORKER 72  // 24 AIC + 48 AIV cores
+#define RUNTIME_MAX_FUNC_ID 1024
+#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 4MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
+
+// Default ready queue shards: one shard per worker thread (total minus orchestrator)
+constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
+
+// =============================================================================
+// Data Structures
+// =============================================================================
+
+/**
+ * Handshake Structure - Shared between Host, AICPU, and AICore
+ *
+ * This structure facilitates communication and synchronization between
+ * AICPU and AICore during task execution.
+ *
+ * Protocol State Machine:
+ * 1. Initialization: AICPU sets aicpu_ready=1
+ * 2. Acknowledgment: AICore sets aicore_done=core_id+1
+ * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload
+ * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes
+ * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion
+ * 6. Shutdown: AICPU sets control=1, AICore exits
+ *
+ * Each AICore instance has its own handshake buffer to enable concurrent
+ * task execution across multiple cores.
+ */
+
+/**
+ * Handshake buffer for AICPU-AICore communication
+ *
+ * Each AICore has its own handshake buffer for synchronization with AICPU.
+ * The structure is cache-line aligned (64 bytes) to prevent false sharing
+ * between cores and optimize cache coherency operations.
+ *
+ * Field Access Patterns:
+ * - aicpu_ready: Written by AICPU, read by AICore
+ * - aicore_done: Written by AICore, read by AICPU
+ * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchPayload*)
+ * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV)
+ */
+struct Handshake {
+    volatile uint32_t aicpu_ready;        // AICPU ready signal: 0=not ready, 1=ready
+    volatile uint32_t aicore_done;        // AICore ready signal: 0=not ready, core_id+1=ready
+    volatile uint64_t task;               // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused
+    volatile CoreType core_type;          // Core type: CoreType::AIC or CoreType::AIV
+    volatile uint32_t physical_core_id;   // Physical core ID
+    volatile uint32_t aicpu_regs_ready;   // AICPU register init done: 0=pending, 1=done
+    volatile uint32_t aicore_regs_ready;  // AICore ID reported: 0=pending, 1=done
+} __attribute__((aligned(64)));
+
+/**
+ * Tensor pair for tracking host-device memory mappings.
+ * Used for copy-back during finalize.
+ */
+struct TensorPair {
+    void *host_ptr;
+    void *dev_ptr;
+    size_t size;
+    // false for read-only INPUT tensors: they are never written by the kernel,
+    // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown
+    // keep the safe default of copying back.
+    bool needs_copy_back = true;
+};
+
+/**
+ * Host API function pointers for device memory operations.
+ * Allows runtime to use pluggable device memory backends.
+ */
+struct HostApi {
+    void *(*device_malloc)(size_t size);
+    void (*device_free)(void *dev_ptr);
+    int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
+    int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
+    // Set a device buffer to a byte value (device-side, no PCIe). Used to
+    // zero-init pure OUTPUT buffers in lieu of an H2D copy-in. May be
+    // null on backends that don't wire it; callers must fall back to
+    // copy_to_device.
+    int (*device_memset)(void *dev_ptr, int value, size_t size);
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
+    // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
+    void *(*acquire_pooled_gm_heap)();
+    void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
+    // Single-shot upload of the entire ChipCallable buffer. `callable` is a
+    // `const ChipCallable *` (declared void* to avoid pulling task_interface
+    // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
+    // total byte size, allocates device GM once, fixes up each child's
+    // resolved_addr_ in an internal host scratch (onboard: device addr; sim:
+    // dlopen function pointer), H2D's once, and returns the device-side
+    // address of the ChipCallable header. Pool-managed: identical buffer
+    // contents (FNV-1a 64-bit) hit the dedup cache; all chip buffers are
+    // bulk-freed in DeviceRunner::finalize(). Returns 0 on error or when
+    // child_count() == 0. Caller computes child addrs as
+    //     chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)
+    // and stores them via runtime->set_function_bin_addr(fid, child_dev).
+    uint64_t (*upload_chip_callable_buffer)(const void *callable);
+};
+
+/**
+ * Task structure - Compatibility stub for platform layer
+ *
+ * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
+ * This stub exists only for API compatibility with device_runner.cpp.
+ * Since get_task_count() returns 0, this struct is never actually used.
+ */
+struct Task {
+    int func_id;
+    uint64_t function_bin_addr;
+};
+
+// Per-core entry point of the fully_distributed_within_core engine. Implemented
+// in runtime/dist_engine.cpp (compiled into the AICPU .so), invoked by each
+// AICore worker thread via Runtime::dist.core_main_fn. `runtime` is Runtime*,
+// `core_type` is CoreType (cast to int to keep this typedef header-light).
+// See docs/fully_distributed_within_core.md.
+typedef void (*DistCoreMainFn)(void *runtime, int core_idx, int core_type);
+
+// =============================================================================
+// Runtime Class
+// =============================================================================
+
+/**
+ * Runtime class for device execution and handshake control
+ *
+ * This class manages AICPU-AICore communication through handshake buffers.
+ * Task graph construction is handled by PTO2Runtime; this class only handles
+ * execution control and device orchestration state.
+ */
+class Runtime {
+public:
+    // Handshake buffers for AICPU-AICore communication
+    Handshake workers[RUNTIME_MAX_WORKER];  // Worker (AICore) handshake buffers
+    int worker_count;                       // Number of active workers
+
+    // Execution parameters for AICPU scheduling.
+    //
+    // aicpu_thread_num is the *total* AICPU thread count launched on this run
+    // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
+    // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
+    // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
+    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
+    int aicpu_thread_num;
+    int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
+
+    // PTO2 integration: kernel_id -> GM function_bin_addr mapping
+    // NOTE: Made public for direct access from aicore code
+    uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
+
+    // Sim-only trace-driven replay (CallConfig::use_example_exec_time). Filled by
+    // the host from CallConfig at bind time; read by execute_slot in dist_engine:
+    // when use_example_exec_time_ is set, a func whose example_exec_time_ns_[fid]
+    // is > 0 is "executed" by busy-waiting that many nanoseconds instead of
+    // calling the real kernel (funcs left at 0 still run for real). Public for
+    // direct AICore-side access, mirroring func_id_to_addr_.
+    bool use_example_exec_time_;
+    int32_t example_exec_time_ns_[RUNTIME_MAX_FUNC_ID];
+
+    // Orchestrator-to-scheduler transition control
+    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
+    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
+    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
+    bool orch_to_sched;
+
+    // ---- fully_distributed_within_core handoff (SPMD-on-core) ----
+    // The AICPU orchestrator thread does dlopen/arena setup, then hands the
+    // resolved orchestration entry + per-core engine off to the AICore worker
+    // threads through these fields instead of running orchestration/scheduling
+    // itself. Each AICore worker invokes core_main_fn(runtime, idx, core_type)
+    // once `go` is set, then increments `done_count` when finished. See
+    // runtime/dist_engine.* and docs/fully_distributed_within_core.md.
+    struct DistHandoff {
+        volatile uint64_t core_main_fn;  // DistCoreMainFn (in AICPU .so)
+        volatile uint32_t go;            // 1 once engine wired and cores may start
+        volatile int32_t num_workers;    // number of AICore workers participating
+        volatile int32_t done_count;     // workers atomically increment when done
+    } dist;
+
+private:
+    // Kernel binary tracking for cleanup
+    int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID];
+    int registered_kernel_count_;
+
+    void *gm_sm_ptr_;                        // GM pointer to PTO2 shared memory (device)
+    void *gm_heap_ptr_;                      // GM heap for orchestrator output buffers (device)
+    void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
+    ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
+
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; AICPU reads them in the boot path to skip
+    // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer
+    // (already populated by runtime_init_data_from_layout + wire on host).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
+    // Device orchestration SO (for dlopen on AICPU thread 3).
+    // The SO bytes themselves live in a separately-allocated device buffer
+    // owned by DeviceRunner; only the metadata below travels inside Runtime.
+    uint64_t dev_orch_so_addr_;
+    uint64_t dev_orch_so_size_;
+    // Per-callable_id dispatch. AICPU dispatches via
+    // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
+    // signals whether the host is delivering a freshly-registered
+    // callable_id (write+dlopen) or reusing an already-loaded one.
+    int32_t active_callable_id_;
+    bool register_new_callable_id_;
+    char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+    char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+
+public:
+    /**
+     * Constructor - zero-initialize all arrays
+     */
+    Runtime();
+
+    // =========================================================================
+    // Performance Profiling
+    // =========================================================================
+
+    // =========================================================================
+    // Device orchestration (for AICPU thread 3)
+    // =========================================================================
+
+    void *get_gm_sm_ptr() const;
+    void *get_gm_heap_ptr() const;
+    const ChipStorageTaskArgs &get_orch_args() const;
+    void set_gm_sm_ptr(void *p);
+    void set_gm_heap(void *p);
+    void set_slot_states_ptr(void *p);
+    void set_orch_args(const ChipStorageTaskArgs &args);
+
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
+    // Device orchestration SO binary (for dlopen on AICPU thread 3)
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
+    uint64_t get_dev_orch_so_addr() const;
+    uint64_t get_dev_orch_so_size() const;
+    // Per-callable_id dispatch. callable_id must be in
+    // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU
+    // whether to (re)load the orch SO into orch_so_table_[callable_id] or
+    // reuse the cached entry.
+    void set_active_callable_id(int32_t callable_id, bool is_new);
+    int32_t get_active_callable_id() const;
+    bool register_new_callable_id() const;
+    void set_device_orch_func_name(const char *name);
+    const char *get_device_orch_func_name() const;
+    void set_device_orch_config_name(const char *name);
+    const char *get_device_orch_config_name() const;
+
+    uint64_t get_function_bin_addr(int func_id) const;
+    void set_function_bin_addr(int func_id, uint64_t addr);
+    /**
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_callable_to_runtime so prepared kernel
+     * binaries are not freed by validate_runtime_impl across runs.
+     */
+    void replay_function_bin_addr(int func_id, uint64_t addr);
+
+    int get_registered_kernel_count() const;
+    int get_registered_kernel_func_id(int index) const;
+    void clear_registered_kernels();
+
+    // =========================================================================
+    // Deprecated API (for platform compatibility, always returns 0/nullptr)
+    // Task graph is now managed by PTO2Runtime, not Runtime
+    // =========================================================================
+
+    /** @deprecated Task count is now in PTO2 shared memory */
+    int get_task_count() const { return 0; }
+
+    /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */
+    Task *get_task(int) { return nullptr; }
+
+    // =========================================================================
+    // Host API (host-only, not copied to device)
+    // =========================================================================
+
+    // Host API function pointers for device memory operations
+    // NOTE: Placed at end of class to avoid affecting device memory layout
+    HostApi host_api;
+
+    // Host-side tensor ledger for D2H copy-back at finalize. Populated by
+    // runtime_maker.cpp from orch_args at bind time, then iterated in
+    // validate_runtime_impl. Not read by AICPU/AICore — the device-side
+    // Runtime image carries the std::vector control block as harmless
+    // garbage, identical to host_api above. No fixed cap — grows with the
+    // chip-level entry-tensor count.
+    std::vector<TensorPair> tensor_pairs_;
+};
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp
new file mode 100644
index 000000000..4b7484bc9
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Scheduler Implementation
+ *
+ * Implements scheduler state management, ready queues, and task lifecycle.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_scheduler.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include "common/unified_log.h"
+
+#if PTO2_PROFILING
+// Weak fallbacks for host/UT builds that don't link the scope_stats collector.
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
+extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
+#endif
+
+// =============================================================================
+// Scheduler Profiling Counters
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+#include "common/platform_config.h"
+
+uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {};
+
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
+    PTO2SchedProfilingData d;
+    d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0);
+    d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0);
+    d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0);
+    d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0);
+    d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0);
+    d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0);
+    d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0);
+    d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0);
+    d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0);
+    d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0);
+    d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0);
+    d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0);
+    d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0);
+    return d;
+}
+#endif
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2SchedulerState::print_stats() {
+    PTO2SchedulerState *sched = this;
+    LOG_INFO_V0("=== Scheduler Statistics ===");
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (sched->ring_sched_states[r].last_task_alive > 0) {
+            LOG_INFO_V0("Ring %d:", r);
+            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
+            auto &dp = sched->ring_sched_states[r].dep_pool;
+            if (dp.top > 0) {
+                LOG_INFO_V0(
+                    "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
+                    dp.high_water, dp.capacity
+                );
+            }
+        }
+    }
+#if PTO2_SCHED_PROFILING
+    LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
+    LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
+#endif
+    LOG_INFO_V0("============================");
+}
+
+void PTO2SchedulerState::print_queues() {
+    PTO2SchedulerState *sched = this;
+    LOG_INFO_V0("=== Ready Queues ===");
+
+    const char *shape_names[] = {"AIC", "AIV", "MIX"};
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
+    }
+    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
+
+    LOG_INFO_V0("====================");
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h
new file mode 100644
index 000000000..ca88d7a87
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h
@@ -0,0 +1,1485 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Scheduler Interface
+ *
+ * The Scheduler is responsible for:
+ * 1. Maintaining per-resource-shape ready queues
+ * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED)
+ * 3. Managing fanin/fanout refcounts for dependency resolution
+ * 4. Advancing last_task_alive for heap reclamation
+ * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete)
+ *
+ * The Scheduler runs on Device AI_CPU and processes:
+ * - Task state transitions based on fanin_refcount
+ * - Buffer lifecycle based on fanout_refcount
+ * - Ring pointer advancement for flow control
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "common/core_type.h"
+#include "utils/device_arena.h"
+#include "aicpu/platform_regs.h"  // get_reg_ptr / RegId for the speculative doorbell
+#include "pto_async_wait.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+#include "aicpu/device_time.h"  // get_sys_cnt_aicpu (weak; used by spec doorbell timing too)
+#if PTO2_SCHED_PROFILING
+#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1
+#define PTO2_SCHED_CYCLE_LAP(acc)   \
+    do {                            \
+        _st1 = get_sys_cnt_aicpu(); \
+        acc += (_st1 - _st0);       \
+        _st0 = _st1;                \
+    } while (0)
+#endif
+
+// =============================================================================
+// Ready Queue (Lock-free bounded MPMC — Vyukov design)
+// =============================================================================
+
+/**
+ * Per-slot entry: sequence counter for ABA safety + task payload
+ */
+struct PTO2ReadyQueueSlot {
+    std::atomic<int64_t> sequence;
+    PTO2TaskSlotState *slot_state;
+};
+
+/**
+ * Thread-local ready buffer for local-first dispatch optimization.
+ *
+ * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1).
+ * Initialized once before the scheduling loop; must be empty at
+ * the start of each iteration (verified by always_assert).
+ *
+ * Phase 1 fills per-CoreType buffers via on_task_complete().
+ * The dispatch stage drains them local-first via get_ready_tasks_batch,
+ * with any remaining tasks pushed to the global ready queue.
+ */
+// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
+static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
+
+struct PTO2LocalReadyBuffer {
+    PTO2TaskSlotState **slot_states = nullptr;
+    int count = 0;
+    int capacity = 0;
+
+    void reset(PTO2TaskSlotState **buf, int cap) {
+        slot_states = buf;
+        count = 0;
+        capacity = cap;
+    }
+
+    bool try_push(PTO2TaskSlotState *s) {
+        if (slot_states && count < capacity) {
+            slot_states[count++] = s;
+            return true;
+        }
+        return false;
+    }
+
+    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
+};
+
+/**
+ * Lock-free bounded MPMC queue (Dmitry Vyukov design)
+ *
+ * Key properties:
+ * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
+ * - Per-slot sequence counter prevents ABA problem
+ * - Empty queue pop returns immediately (single atomic load, no lock)
+ * - CAS contention is split: producers only touch enqueue_pos,
+ *   consumers only touch dequeue_pos
+ */
+struct alignas(64) PTO2ReadyQueue {
+    PTO2ReadyQueueSlot *slots;
+    uint64_t capacity;
+    uint64_t mask;        // capacity - 1
+    char _pad0[64 - 24];  // Pad to own cache line
+
+    std::atomic<uint64_t> enqueue_pos;
+    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    std::atomic<uint64_t> dequeue_pos;
+    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    uint64_t size() {
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        return (e >= d) ? (e - d) : 0;
+    }
+
+    bool push(PTO2TaskSlotState *slot_state) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    break;
+                }
+            } else if (diff < 0) {
+                return false;  // Queue full
+            }
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+
+    // Batch push: reserve count slots with a single CAS after confirming
+    // every target slot is available under the usual Vyukov sequence check.
+    void push_batch(PTO2TaskSlotState **items, int count) {
+        if (count == 0) return;
+
+        uint64_t pos;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            bool ready = true;
+            for (int i = 0; i < count; i++) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + i);
+                if (diff != 0) {
+                    ready = false;
+                    break;
+                }
+            }
+            if (!ready) {
+                continue;
+            }
+            if (enqueue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            slot->slot_state = items[i];
+            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            atomic_ops += 2;  // enqueue_pos.load + sequence.load
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                return false;  // Queue full
+            } else {
+                contended = true;  // diff > 0: slot not yet released, spin
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+#endif
+
+    PTO2TaskSlotState *pop() {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    ))
+                    break;
+            } else if (diff < 0) {
+                return nullptr;  // Queue empty
+            }
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+
+#if PTO2_SCHED_PROFILING
+    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            atomic_ops += 2;  // dequeue_pos.load + sequence.load
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                atomic_count += atomic_ops;
+                return nullptr;  // Queue empty
+            } else {
+                contended = true;
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+#endif
+
+    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
+    // Returns actual number of items popped (may be less than max_count).
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t pos;
+        int count;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                count = -1;
+                break;
+            }
+            if (count == 0) return 0;
+            if (count < 0) continue;
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        int count;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            atomic_ops++;  // dequeue_pos.load
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                atomic_ops++;  // sequence.load
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                contended = true;
+                count = -1;
+                break;
+            }
+            if (count == 0) {
+                atomic_count += atomic_ops;
+                return 0;
+            }
+            if (count < 0) {
+                continue;
+            }
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                atomic_ops++;  // successful CAS
+                break;
+            }
+            contended = true;
+            atomic_ops++;  // failed CAS
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+            atomic_ops++;  // sequence.store
+        }
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+        return count;
+    }
+#endif
+};
+
+// Cold-path ready queue operations (defined in pto_scheduler.cpp). Declared
+// as non-member so PTO2ReadyQueue stays a POD-like struct with cache-line
+// alignment. Storage is owned by the caller-supplied arena.
+//   reserve_layout: declare the slots[] region on the arena (must precede commit)
+//   init_from_layout: bind slots pointer from arena.region_ptr(off) and
+//                     initialize sequence counters
+//   destroy: forget the slots pointer (arena owns the buffer)
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
+void ready_queue_destroy(PTO2ReadyQueue *queue);
+
+// =============================================================================
+// SPSC Queue (Single-Producer Single-Consumer, wait-free)
+// =============================================================================
+//
+// Bounded ring buffer optimized for the wiring queue use case:
+//   - Producer: orchestrator thread (push)
+//   - Consumer: scheduler thread 0 (pop_batch)
+//
+// Design based on Rigtorp's cached-index technique: each side caches
+// the other's index locally, avoiding cross-core cache line bouncing
+// on the hot path. Only when the local cache says "full" or "empty"
+// does the thread issue an acquire load on the remote index.
+//
+// Memory layout: 5 cache-line-aligned fields ensure zero false sharing.
+
+struct alignas(64) PTO2SpscQueue {
+    // --- Producer cache lines (orchestrator thread) ---
+    alignas(64) std::atomic<uint64_t> head_{0};
+    alignas(64) uint64_t tail_cached_{0};
+
+    // --- Consumer cache lines (scheduler thread 0) ---
+    alignas(64) std::atomic<uint64_t> tail_{0};
+    alignas(64) uint64_t head_cached_{0};
+
+    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
+    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
+    uint64_t mask_{0};
+
+    // Padding to exactly 5 cache lines
+    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
+
+    // Reserve the backing buffer region on the supplied arena. Returns the
+    // region offset, to be passed to init_from_layout() after the arena is
+    // committed. Cache-line aligned: the buffer is shared between the
+    // orchestrator (push) and scheduler thread 0 (pop_batch), so its base
+    // must not false-share with neighboring regions.
+    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) {
+        return arena.reserve(capacity * sizeof(uintptr_t), PTO2_ALIGN_SIZE);
+    }
+
+    // Writes everything except the arena-internal `buffer_` pointer field
+    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
+    // image without storing a host address in buffer_; the AICPU wires
+    // buffer_ at boot via wire_arena_pointers().
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
+        for (uint64_t i = 0; i < capacity; i++)
+            buf[i] = nullptr;
+        mask_ = capacity - 1;
+        head_.store(0, std::memory_order_relaxed);
+        tail_.store(0, std::memory_order_relaxed);
+        tail_cached_ = 0;
+        head_cached_ = 0;
+        return true;
+    }
+
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
+    // Arena owns the buffer; here we only forget our pointer.
+    void destroy() { buffer_ = nullptr; }
+
+    // Push one item (producer only). Returns false if queue is full.
+    // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the
+    // effective usable capacity is capacity-1 (one slot is wasted as a
+    // sentinel to distinguish full from empty). uint64_t wrapping is safe
+    // since head and tail are monotonically increasing and subtraction
+    // wraps correctly.
+    bool push(PTO2TaskSlotState *item) {
+        uint64_t h = head_.load(std::memory_order_relaxed);
+        uint64_t next_h = h + 1;
+        if (next_h - tail_cached_ > mask_) {
+            tail_cached_ = tail_.load(std::memory_order_acquire);
+            if (next_h - tail_cached_ > mask_) {
+                return false;
+            }
+        }
+        buffer_[h & mask_] = item;
+        head_.store(next_h, std::memory_order_release);
+        return true;
+    }
+
+    // Pop up to max_count items (consumer only). Returns actual count.
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t t = tail_.load(std::memory_order_relaxed);
+        uint64_t avail = head_cached_ - t;
+        if (avail < static_cast<uint64_t>(max_count)) {
+            head_cached_ = head_.load(std::memory_order_acquire);
+            avail = head_cached_ - t;
+            if (avail == 0) return 0;
+        }
+        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
+        for (int i = 0; i < count; i++) {
+            out[i] = buffer_[(t + i) & mask_];
+        }
+        tail_.store(t + count, std::memory_order_release);
+        return count;
+    }
+
+    // Approximate size (used for backoff decisions, not exact).
+    uint64_t size() const {
+        uint64_t h = head_.load(std::memory_order_acquire);
+        uint64_t t = tail_.load(std::memory_order_acquire);
+        return h - t;
+    }
+};
+
+static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
+// =============================================================================
+
+/**
+ * Statistics returned by mixed-task completion processing
+ */
+struct CompletionStats {
+    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
+    int32_t tasks_enqueued;     // Number of consumers that became READY
+    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
+    bool mixed_task_completed;  // True only when this callback completed a mixed task
+};
+
+/**
+ * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). Holds
+ * the arena offsets of every sub-region the scheduler needs plus the
+ * capacities used at layout time (init_from_layout reuses them).
+ */
+struct PTO2SchedulerLayout {
+    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
+    size_t off_dummy_ready_queue_slots;
+    size_t off_early_dispatch_queue_slots;
+    size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH];
+    size_t off_wiring_spsc_buffer;
+    uint64_t ready_queue_capacity;
+    uint64_t spsc_capacity;
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+};
+
+/**
+ * Scheduler state structure
+ *
+ * Contains dynamic state updated during task execution.
+ * Separated from shared memory for cache efficiency.
+ * Hot-path methods are defined inline (implicitly inline as member functions).
+ */
+struct PTO2SchedulerState {
+    // Shared memory access
+    PTO2SharedMemoryHeader *sm_header;
+
+    // Per-ring state
+    struct alignas(64) RingSchedState {
+        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
+        PTO2SharedMemoryRingHeader *ring;
+        int32_t last_task_alive;
+        std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
+        alignas(64) PTO2DepListPool dep_pool;
+#if PTO2_PROFILING
+        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
+        alignas(64) std::atomic<int32_t> dep_pool_snapshot_tail;
+        std::atomic<int32_t> dep_pool_snapshot_top;
+#endif
+
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
+        void destroy();
+
+        void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
+
+#if PTO2_PROFILING
+        void publish_dep_pool_snapshot() {
+            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
+            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
+        }
+
+        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
+            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
+            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
+            if (tail > top) tail = top;
+        }
+#endif
+
+        void advance_ring_pointers() {
+            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
+            int32_t old_last_task_alive = last_task_alive;
+
+            while (last_task_alive < current_task_index) {
+                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
+                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) {
+                    break;
+                }
+                last_task_alive++;
+            }
+
+            // Eager reset: prepare reclaimed slots for reuse while still hot in cache.
+            // Safe because last_task_alive has advanced past these slots but
+            // sync_to_sm has not yet published — the orchestrator cannot reuse
+            // them until the release store below.
+            // Skips payload, task, ring_id — immutable after RingSchedState::init().
+            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) {
+                ring->get_slot_state_by_task_id(id).reset_for_reuse();
+            }
+
+            sync_to_sm();
+        }
+    } ring_sched_states[PTO2_MAX_RING_DEPTH];
+
+    // Ready queues remain global (scheduling is ring-agnostic)
+    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
+
+    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
+    // the dispatch loop and completed inline -- never goes to AICore.
+    PTO2ReadyQueue dummy_ready_queue;
+
+    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
+    //
+    // Three cache-line regions by writer:
+    //   1. batch_*  / backoff — thread 0 exclusive (local batch buffer)
+    //   2. queue    — SPSC: orchestrator push, thread 0 pop
+    //   3. orch_needs_drain — orchestrator write, thread 0 read
+    struct alignas(64) WiringState {
+        static constexpr uint64_t BATCH_SIZE = 30;
+        static constexpr int BACKOFF_LIMIT = 32;
+
+        // --- Thread 0 exclusive: local batch buffer + backoff ---
+        int batch_count = 0;
+        int batch_index = 0;
+        int backoff_counter = 0;
+        PTO2TaskSlotState *batch[BATCH_SIZE];
+
+        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
+        PTO2SpscQueue queue;
+
+        // --- Orchestrator write, thread 0 read ---
+        alignas(64) std::atomic<bool> orch_needs_drain{false};
+    } wiring;
+
+    static_assert(
+        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
+    );
+    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
+
+    alignas(64) AsyncWaitList async_wait_list;
+
+    // Statistics (cold path, isolated from hot-path fields)
+#if PTO2_SCHED_PROFILING
+    alignas(64) std::atomic<int64_t> tasks_completed;
+    std::atomic<int64_t> tasks_consumed;
+#endif
+    // =========================================================================
+    // Inline hot-path methods
+    // =========================================================================
+
+    /**
+     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
+     * Called by scheduler thread 0 each loop iteration. Sets fanin_count,
+     * acquires fanout_lock per producer, allocates dep_pool entries, and
+     * pushes ready tasks to the appropriate ready queue.
+     *
+     * @return Number of tasks wired this call.
+     */
+
+    int drain_wiring_queue(bool force_drain = false) {
+        int wired = 0;
+
+        // Refill local batch buffer when exhausted.
+        if (wiring.batch_index >= wiring.batch_count) {
+            // Backoff: defer pop when queue holds fewer than a full batch,
+            // unless force_drain, orch_needs_drain, or backoff limit reached.
+            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
+                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
+                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
+                    wiring.backoff_counter++;
+                    return 0;
+                }
+            }
+            wiring.backoff_counter = 0;
+            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
+            wiring.batch_index = 0;
+            if (wiring.batch_count == 0) return 0;
+        }
+
+        // Process tasks from local buffer in strict FIFO order.
+        while (wiring.batch_index < wiring.batch_count) {
+            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
+            int ring_id = ws->ring_id;
+            auto &rss = ring_sched_states[ring_id];
+            int32_t wfanin = ws->payload->fanin_actual_count;
+
+            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
+                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
+                if (rss.dep_pool.available() < wfanin) {
+#if PTO2_PROFILING
+                    if (is_scope_stats_enabled()) {
+                        rss.publish_dep_pool_snapshot();
+                    }
+#endif
+                    break;  // not enough dep_pool space — keep remainder for next call
+                }
+            }
+
+            wiring.batch_index++;
+            wire_task(rss, ws, wfanin);
+            wired++;
+        }
+
+        return wired;
+    }
+
+    // Route a ready slot to the right global queue. Dummy tasks (empty
+    // active_mask) live in dummy_ready_queue; everything else goes to the
+    // per-shape ready_queues[]. Used by paths that do not have a thread-local
+    // ready buffer (e.g. wiring). See push_ready_routed_local for the
+    // dispatch-time fast path.
+    void push_ready_routed(PTO2TaskSlotState *slot_state) {
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        if (shape == PTO2ResourceShape::DUMMY) {
+            dummy_ready_queue.push(slot_state);
+        } else {
+            ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+        }
+    }
+
+    /**
+     * Wire fanout edges for a single task. Sets fanin_count, acquires each
+     * producer's fanout_lock, allocates dep_pool entries for live producers,
+     * pushes the task to the ready queue once its fanin refcount is satisfied.
+     */
+    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
+        PTO2TaskPayload *wp = ws->payload;
+        ws->fanin_count = wfanin + 1;
+
+        if (wfanin != 0) {
+            int32_t early_finished = 0;
+            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
+                producer->lock_fanout();
+                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
+                if (pstate >= PTO2_TASK_COMPLETED) {
+                    early_finished++;
+                } else {
+                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
+                }
+                producer->unlock_fanout();
+            });
+
+            // Seed dispatch_fanin with producers already complete at wiring
+            // time (e.g. buffer-creator tasks that finished before this
+            // consumer entered the graph). Such producers never dispatch at
+            // runtime, so they can never bump dispatch_fanin via the fanout
+            // walk; without this seed the candidate compare
+            // (dispatch_fanin == fanin_actual_count) would be unreachable
+            // whenever any producer is pre-completed. Mirrors the
+            // early_finished seed that ready_fanin gets via init_rc.
+            if (early_finished != 0) {
+                wp->dispatch_fanin.fetch_add(early_finished, std::memory_order_acq_rel);
+            }
+
+            int32_t init_rc = early_finished + 1;
+            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
+            if (new_rc >= ws->fanin_count) {
+                push_ready_routed(ws);
+            }
+        } else {
+            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
+            push_ready_routed(ws);
+        }
+
+        ws->dep_pool_mark = rss.dep_pool.top;
+#if PTO2_PROFILING
+        if (is_scope_stats_enabled()) {
+            rss.publish_dep_pool_snapshot();
+        }
+#endif
+    }
+
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
+        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(
+                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+            )) {
+            return;
+        }
+
+#if PTO2_SCHED_PROFILING
+        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
+#endif
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
+                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        int32_t fc = slot_state.fanout_count;
+        int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);
+
+        atomic_count += 2;  // fanout_count.load + fanout_refcount.load
+
+        if (rc != fc) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(
+                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+            )) {
+            atomic_count += 1;  // failed CAS
+            return;
+        }
+
+        atomic_count += 1;  // successful CAS
+
+#if PTO2_SCHED_PROFILING
+        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
+#endif
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
+                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+            atomic_count += 2;  // try-lock CAS + unlock store
+        } else {
+            atomic_count += 1;  // failed try-lock CAS
+        }
+    }
+#endif
+
+    void release_producer(PTO2TaskSlotState &slot_state) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        check_and_handle_consumed(slot_state);
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        atomic_count += 1;  // fanout_refcount.fetch_add
+        check_and_handle_consumed(slot_state, atomic_count);
+    }
+#endif
+
+    // Speculative early-dispatch release. If the now-ready task was pre-staged
+    // (gated on a core), ring its DATA_MAIN_BASE high-32 doorbell RIGHT HERE in
+    // the completion path — the moment its last producer's FIN satisfies fanin —
+    // instead of routing it through the ready queue and waiting for the dispatch
+    // pass to pop it. Returns true if the task is fully handled (caller must NOT
+    // push to the ready queue). Returns false when the caller must route C
+    // normally: either it was never pre-staged, OR it is a SPMD consumer only
+    // PARTIALLY pre-staged — the gated blocks are released by the doorbells rung
+    // here, and the remaining (next_block_idx .. logical_block_num) blocks
+    // dispatch normally off the ready queue. Lock-free claim shared with Hook 1
+    // (the stager): CAS NONE->DISPATCHED wins => not pre-staged; lose => STAGED
+    // (spin past the brief STAGING window so the mask is visible), then ring.
+
+    // Per-core speculative doorbell table. Hook 1 records each gated core's
+    // (reg_addr, dispatch token) here at stage time; the completion-path release
+    // reads it back for the cores set in the consumer's staged_core_mask. One
+    // global table indexed by core_id (not per-task): gated cores in flight are
+    // bounded by the chip's core count (no two-level pre-dispatch), so this is the
+    // natural capacity and removes the old per-task 3-doorbell cap.
+    struct SpecDoorbell {
+        uint64_t addr{0};
+        uint32_t token{0};
+    };
+    SpecDoorbell spec_doorbell_table[PTO2_SPEC_CORE_MASK_WORDS * 64]{};
+
+    // Cross-thread early-dispatch work queue (a PTO2ReadyQueue MPMC instance,
+    // arena-backed — reserved/wired in pto_runtime2_init alongside the ready queues).
+    // A consumer's SPMD blocks span cores owned by several AICPU threads, but only a
+    // thread RUNNING the consumer's producer discovers it (via the producer's
+    // fanout). When that producer is thread-local (e.g. a 16-block AIV op filling one
+    // thread's cores), the other threads never see the consumer and its blocks on
+    // their cores can't pre-stage. The first claimer pushes the partially-staged
+    // consumer here; every idle thread's early_dispatch pass pops one, stages a range onto
+    // ITS OWN cores (range-claim via next_block_idx), and re-pushes if blocks remain
+    // — exactly mirroring how a partially-dispatched SPMD task is re-pushed to the
+    // ready queue (scheduler_dispatch: pop -> claim -> re-push). A stale/released
+    // entry fails the STAGING check on pop and is dropped; a push that overflows is
+    // logged and the consumer's blocks fall back to normal dispatch.
+    PTO2ReadyQueue early_dispatch_queue;
+
+    static inline void ring_one_doorbell(uint64_t reg_addr, uint32_t token) {
+        volatile uint64_t *dmb = reinterpret_cast<volatile uint64_t *>(get_reg_ptr(reg_addr, RegId::DATA_MAIN_BASE));
+        uint64_t tk = static_cast<uint64_t>(token);
+        *dmb = (tk << 32) | tk;  // 64-bit STR: high=low=token releases the gated AICore
+    }
+
+    // auto-chain depth cap: a candidate inherits the flag only while depth < this.
+    static constexpr uint8_t PTO2_SPEC_CHAIN_MAX = 4;
+
+    // Event-driven candidate detection (the dual of fanin_refcount/ready). Call when a
+    // FLAGGED producer `p` DISPATCHES (starts running): walk its fanout and bump each
+    // consumer's dispatch_fanin. A consumer whose dispatch_fanin reaches
+    // fanin_actual_count (= every producer is either flagged-and-dispatched, or was
+    // already complete when the consumer was wired) is an early-dispatch candidate:
+    // CAS NONE->STAGING (exactly-once) and push to early_dispatch_queue for the idle drain to
+    // pre-stage. Once-guarded per producer so an SPMD producer's block-by-block
+    // dispatch propagates once. Replaces the old per-iteration pass-1 PULL scan.
+    void propagate_dispatch_fanin(PTO2TaskSlotState &p) {
+        if (!(p.payload->allow_early_resolve || p.payload->spec_chain_active.load(std::memory_order_acquire)))
+            return;  // only flagged (codegen or inherited) producers propagate
+        if (p.payload->dispatch_propagated.exchange(1, std::memory_order_acq_rel) != 0)
+            return;  // already propagated once
+        uint8_t child_depth = static_cast<uint8_t>(p.payload->spec_chain_depth + 1);
+        p.lock_fanout();
+        PTO2DepListEntry *edge = p.fanout_head;  // snapshot head, walk lock-free (fanout stable by dispatch)
+        p.unlock_fanout();
+        for (; edge != nullptr; edge = edge->next) {
+            PTO2TaskSlotState *c = edge->slot_state;
+            // Compare to fanin_actual_count (the real producer-edge count), NOT
+            // fanin_count: fanin_count = fanin_actual_count + 1 (a self/wiring +1 that
+            // ready_fanin gets but dispatch_fanin does not). dispatch_fanin starts at
+            // the wiring-time early_finished seed (producers already complete) and is
+            // bumped here by flagged producers; reaching fanin_actual_count means every
+            // producer is flagged-dispatched or was pre-completed.
+            int32_t nf = c->payload->dispatch_fanin.fetch_add(1, std::memory_order_acq_rel) + 1;
+            if (nf != c->payload->fanin_actual_count) continue;
+            if (c->active_mask.requires_sync_start()) continue;  // sync_start can't be block-by-block pre-staged
+            PTO2ResourceShape shape = c->active_mask.to_shape();
+            if (shape != PTO2ResourceShape::AIC && shape != PTO2ResourceShape::AIV && shape != PTO2ResourceShape::MIX)
+                continue;
+            uint8_t expect = PTO2_SPEC_NONE;  // exactly-once: only the CAS winner enqueues
+            if (!c->payload->spec_state.compare_exchange_strong(
+                    expect, PTO2_SPEC_STAGING, std::memory_order_seq_cst, std::memory_order_seq_cst
+                ))
+                continue;
+            if (child_depth < PTO2_SPEC_CHAIN_MAX) {  // auto-chain: C propagates to ITS consumers
+                c->payload->spec_chain_depth = child_depth;
+                c->payload->spec_chain_active.store(1, std::memory_order_release);
+            }
+            early_dispatch_queue.push(c);
+        }
+    }
+
+    // Collects consumers released via the speculative-doorbell path during a
+    // single on_task_complete fanout walk, so their dispatch_fanin
+    // propagation runs AFTER the walk — never between two siblings' doorbells.
+    struct SpecReleaseSink {
+        static constexpr int CAP = 32;
+        PTO2TaskSlotState *items[CAP];
+        int n = 0;
+        inline bool push(PTO2TaskSlotState *s) {
+            if (n >= CAP) return false;
+            items[n++] = s;
+            return true;
+        }
+    };
+
+    inline bool try_speculative_release(PTO2TaskSlotState &slot_state, SpecReleaseSink *sink = nullptr) {
+        // Never staged => CAS NONE->DISPATCHED wins => dispatch normally.
+        uint8_t expect = PTO2_SPEC_NONE;
+        if (slot_state.payload->spec_state.compare_exchange_strong(
+                expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
+            )) {
+            return false;
+        }
+        // Staged (STAGING). Flip STAGING->DISPATCHED, THEN read the mask. seq_cst
+        // gives a total order with the concurrent stagers, each of which OR-s its
+        // core into the mask and THEN loads spec_state: a stager whose bit lands
+        // before this CAS is read here and rung; a stager whose bit lands after
+        // sees DISPATCHED and rings that core itself (self-ring in
+        // stage_consumer_blocks). Either way every gated core's doorbell fires once
+        // (a double-ring is harmless — the AICore already matched). This replaces
+        // the old transient-STAGING spin: STAGING is now the stable gated state.
+        expect = PTO2_SPEC_STAGING;
+        slot_state.payload->spec_state.compare_exchange_strong(
+            expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
+        );
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
+            uint64_t bits = slot_state.payload->staged_core_mask[w].load(std::memory_order_seq_cst);
+            while (bits != 0) {
+                int core_id = w * 64 + __builtin_ctzll(bits);
+                bits &= bits - 1;
+                ring_one_doorbell(spec_doorbell_table[core_id].addr, spec_doorbell_table[core_id].token);
+            }
+        }
+        // This pre-staged consumer was just released by its doorbell — it starts
+        // running NOW, so propagate dispatch_fanin to ITS consumers (auto-chain,
+        // knob A). Defer it via the sink so it runs after the whole fanout walk:
+        // doing it inline here would delay the doorbells of later consumers in the
+        // same producer's fanout. Fallback to inline if no sink / sink full.
+        if (sink == nullptr || !sink->push(&slot_state)) {
+            propagate_dispatch_fanin(slot_state);
+        }
+        // No explicit removal from the cross-thread queue: a still-queued entry for
+        // this consumer is now DISPATCHED and is dropped when a peer pops it.
+        // Fully pre-staged => skip the ready queue. Partially staged SPMD consumer =>
+        // fall through so the caller pushes C; dispatch resumes from next_block_idx.
+        return slot_state.next_block_idx.load(std::memory_order_seq_cst) >= slot_state.logical_block_num;
+    }
+
+    bool release_fanin_and_check_ready(
+        PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
+    ) {
+        // Atomically increment fanin_refcount and check if all producers are done
+        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
+        // init release, making fanin_count visible — plain load suffices.
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Speculative early-dispatch: pre-staged tasks are released by doorbell
+            // here, skipping the ready-queue round-trip entirely.
+            if (try_speculative_release(slot_state, sink)) return true;
+            // Local-first: try per-CoreType thread-local buffer before global queue
+            // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1]
+            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
+            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state);
+            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast<int32_t>(shape)].push(&slot_state);
+            }
+            return true;
+        }
+        return false;
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    bool release_fanin_and_check_ready(
+        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
+        PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
+    ) {
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+        atomic_count += 1;  // fanin_refcount.fetch_add
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Speculative early-dispatch: pre-staged tasks are released by doorbell
+            // here, skipping the ready-queue round-trip entirely.
+            if (try_speculative_release(slot_state, sink)) return true;
+            // Local-first: try per-CoreType thread-local buffer before global queue.
+            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
+            // and go straight to dummy_ready_queue; use the profiling-aware push so
+            // atomic_count / push_wait stay consistent with the non-dummy path.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
+            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast<int32_t>(shape)].push(&slot_state, atomic_count, push_wait);
+            }
+            return true;
+        }
+        return false;
+    }
+#endif
+
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
+        uint64_t &atomic_count, uint64_t &wait_cycle
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count +=
+                ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
+        }
+        return count;
+    }
+#endif
+
+    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
+#if PTO2_ORCH_PROFILING
+        extern uint64_t g_orch_scope_end_atomic_count;
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++) {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count);
+        }
+#else
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++) {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i]);
+        }
+#endif
+    }
+
+    /**
+     * Subtask completion: atomic counter model.
+     * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block.
+     * Atomically increments completed_subtasks and checks whether all subtasks
+     * across all blocks are done.
+     *
+     * @return true if this was the last subtask, completing the entire task.
+     */
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return (prev + 1) == slot_state.total_required_subtasks;
+    }
+
+    /**
+     * Two-stage completion: second stage.
+     * Called exactly once when all subtasks of a task are done (i.e.,
+     * on_subtask_complete returned true). Walks the consumer (fanout) list,
+     * decrements each consumer's fanin, pushes newly-ready ones, and rings
+     * doorbells for speculative hits.
+     *
+     * Non-PROFILING returns the consumer-walk count (= edges traversed). The
+     * Resolve swimlane bar reads it to label the bar with how many successors
+     * actually got resolved. PROFILING returns the richer CompletionStats
+     * whose `fanout_edges` carries the same number.
+     */
+#if PTO2_SCHED_PROFILING
+    CompletionStats
+#else
+    uint32_t
+#endif
+    on_task_complete(
+        PTO2TaskSlotState &slot_state,
+#if PTO2_SCHED_PROFILING
+        int thread_idx,
+#endif
+
+        PTO2LocalReadyBuffer *local_bufs = nullptr
+    ) {
+#if PTO2_SCHED_PROFILING
+        CompletionStats stats = {0, 0, 0, true};
+#else
+        uint32_t consumer_walk_count = 0;
+#endif
+#if PTO2_SCHED_PROFILING
+        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
+        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
+        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
+        uint64_t lock_atomics = 0, lock_wait = 0;
+        PTO2_SCHED_CYCLE_START();
+#endif
+
+#if PTO2_SCHED_PROFILING
+        slot_state.lock_fanout(lock_atomics, lock_wait);
+#else
+        slot_state.lock_fanout();
+#endif
+        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
+        slot_state.unlock_fanout();
+
+#if PTO2_SCHED_PROFILING
+        lock_atomics += 2;  // state.store + unlock.store
+        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
+        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
+#endif
+
+        // Fanout: notify consumers. A pre-staged consumer that becomes ready has
+        // its doorbell rung INLINE (db = nullptr) the moment its node is reached,
+        // not batched to after the whole walk — so a flagged consumer near the
+        // front of the list starts immediately and overlaps the remaining
+        // release_fanin work for the other consumers, instead of waiting for the
+        // full O(fanout-degree) walk (~5us for a 50-consumer producer).
+        //
+        // Safe on silicon: the producer's slot is already COMPLETED here — every
+        // SPMD block has FIN'd AND dcci-flushed its output to HBM before
+        // on_task_complete runs — so a released consumer never reads stale
+        // producer output. (Batching used to align the released wave, but pushed
+        // every doorbell to the end of the walk, defeating the whole point of
+        // speculative early-dispatch: minimal producer-end -> consumer-start.)
+#if PTO2_SCHED_PROFILING
+        uint64_t fanout_atomics = 0, push_wait = 0;
+#endif
+        // Doorbells for released pre-staged consumers fire INLINE in the walk
+        // below; their dispatch_fanin propagation is collected here and replayed
+        // after the walk, so no consumer's doorbell waits on a sibling's propagate.
+        SpecReleaseSink rel_sink;
+        while (current != nullptr) {
+            PTO2TaskSlotState &consumer_slot = *current->slot_state;
+#if PTO2_SCHED_PROFILING
+            stats.fanout_edges++;
+            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs, &rel_sink)) {
+                stats.tasks_enqueued++;
+            }
+#else
+            consumer_walk_count++;
+            release_fanin_and_check_ready(consumer_slot, local_bufs, &rel_sink);
+#endif
+            current = current->next;
+        }
+        for (int i = 0; i < rel_sink.n; i++) {
+            propagate_dispatch_fanin(*rel_sink.items[i]);
+        }
+
+#if PTO2_SCHED_PROFILING
+        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
+        g_sched_push_wait_cycle[thread_idx] += push_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
+        return stats;
+#else
+        return consumer_walk_count;
+#endif
+    }
+
+    /**
+     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
+     * Returns fanin edge count for profiling.
+     */
+
+#if PTO2_SCHED_PROFILING
+    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
+        PTO2_SCHED_CYCLE_START();
+        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
+        extern uint64_t g_sched_self_atomic_count[];
+        extern uint64_t g_sched_self_consumed_cycle[];
+        extern uint64_t g_sched_complete_count[];
+        uint64_t fanin_atomics = 0;
+#else
+    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
+#endif
+        PTO2TaskPayload *payload = slot_state.payload;
+        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
+#if PTO2_SCHED_PROFILING
+            release_producer(*producer_slot_state, fanin_atomics);
+#else
+            release_producer(*producer_slot_state);
+#endif
+        });
+#if PTO2_SCHED_PROFILING
+        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
+#endif
+
+        // Self consumed check
+#if PTO2_SCHED_PROFILING
+        uint64_t self_atomics = 0;
+        check_and_handle_consumed(slot_state, self_atomics);
+        g_sched_self_atomic_count[thread_idx] += self_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]);
+        g_sched_complete_count[thread_idx]++;
+#else
+        check_and_handle_consumed(slot_state);
+#endif
+        return payload->fanin_actual_count;
+    }
+
+    // === Cold-path API (defined in pto_scheduler.cpp) ===
+
+    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
+    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
+    // Capacities are baked into the returned layout; init_data_from_layout uses
+    // the same values.
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
+    static PTO2SchedulerLayout
+    reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]);
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
+
+    // Forget per-region pointers; arena owns the backing memory.
+    void destroy();
+    void print_stats();
+    void print_queues();
+};
+
+// Scheduler cold-path API is declared as PTO2SchedulerState member functions.
+// See init()/destroy()/print_stats()/print_queues() below the struct definition.
+
+// try_inline_complete_locked: short-circuit NotDeferred completions seen during
+// drain so they don't grow entries[]. Defined here (not in pto_async_wait.h)
+// because PTO2SchedulerState's on_task_complete signature is only known
+// after its full definition above.
+//
+// When the deferred_release_slot_states[] buffer is full, drain it via
+// on_task_release before appending — mirrors the same overflow-drain idiom
+// that scheduler_completion.cpp's inline NotDeferred path uses, so high task
+// rates don't surface as ASYNC_WAIT_OVERFLOW errors.
+inline bool
+AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
+    // Return value (CompletionStats / consumer-walk count) discarded:
+    // async-wait drain path has no Resolve swimlane bar attached.
+#if PTO2_SCHED_PROFILING
+    (void)sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
+#else
+    (void)sink.sched->on_task_complete(slot_state, sink.local_bufs);
+#endif
+    if (*sink.deferred_release_count >= sink.deferred_release_capacity) {
+        while (*sink.deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+            (void)sink.sched->on_task_release(
+                *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx
+            );
+#else
+            sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
+#endif
+        }
+    }
+    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
+    sink.inline_completed++;
+    return true;
+}
+
+template <bool Profiling>
+inline AsyncPollResult AsyncWaitList::poll_and_complete(
+    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+    ,
+    int thread_idx
+#endif
+) {
+    AsyncPollResult result;
+    if (!try_lock()) return result;
+
+    AsyncWaitList::DrainCompletionSink sink{};
+    sink.sched = sched;
+    sink.local_bufs = local_bufs;
+    sink.deferred_release_slot_states = deferred_release_slot_states;
+    sink.deferred_release_count = &deferred_release_count;
+    sink.deferred_release_capacity = deferred_release_capacity;
+#if PTO2_SCHED_PROFILING
+    sink.thread_idx = thread_idx;
+#endif
+
+    int32_t drain_err = PTO2_ERROR_NONE;
+    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
+    if (drain_err != PTO2_ERROR_NONE) {
+        result.error_code = drain_err;
+        unlock();
+        return result;
+    }
+    result.completed += sink.inline_completed;
+
+    for (int32_t i = count - 1; i >= 0; --i) {
+        AsyncWaitEntry &entry = entries[i];
+        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
+        for (int32_t c = 0; c < entry.condition_count; c++) {
+            CompletionCondition &cond = entry.conditions[c];
+            if (cond.satisfied) continue;
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
+                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
+                if (counter_line != last_invalidated_counter_line) {
+                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
+                    last_invalidated_counter_line = counter_line;
+                }
+            }
+            CompletionPollResult poll = cond.test();
+            if (poll.state == CompletionPollState::FAILED) {
+                result.error_code = poll.error_code;
+                result.failed_slot_state = entry.slot_state;
+                unlock();
+                return result;
+            }
+            if (poll.state == CompletionPollState::READY) {
+                cond.satisfied = true;
+                cond.retire();
+                entry.waiting_completion_count--;
+            }
+        }
+
+        if (entry.normal_done && entry.waiting_completion_count <= 0) {
+            // Return value (CompletionStats / consumer-walk count) discarded:
+            // deferred-completion drain has no Resolve swimlane bar attached.
+#if PTO2_SCHED_PROFILING
+            (void)sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs);
+#else
+            (void)sched->on_task_complete(*entry.slot_state, local_bufs);
+#endif
+            // Drain deferred_release in place when the buffer fills — same
+            // overflow-drain idiom used by complete_slot_task's inline path
+            // and by try_inline_complete_locked. Without this, large bursts
+            // of completable wait_list entries in a single poll surfaced as
+            // ASYNC_WAIT_OVERFLOW under the MPSC model.
+            if (deferred_release_count >= deferred_release_capacity) {
+                while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                }
+            }
+            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
+            result.completed++;
+
+            int32_t last = count - 1;
+            if (i != last) entries[i] = entries[last];
+            count = last;
+        }
+    }
+
+    unlock();
+    return result;
+}
+
+// =============================================================================
+// Scheduler Profiling Data
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+struct PTO2SchedProfilingData {
+    // Sub-phase cycle breakdown within on_task_complete
+    uint64_t lock_cycle;           // lock_fanout + state store + unlock
+    uint64_t fanout_cycle;         // fanout traversal
+    uint64_t fanin_cycle;          // fanin traversal
+    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
+
+    // Wait times
+    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
+    uint64_t push_wait_cycle;  // CAS contention in push()
+    uint64_t pop_wait_cycle;   // CAS contention in pop()
+
+    // Atomic counts per sub-phase
+    uint64_t lock_atomic_count;
+    uint64_t fanout_atomic_count;
+    uint64_t fanin_atomic_count;
+    uint64_t self_atomic_count;
+    uint64_t pop_atomic_count;
+
+    int64_t complete_count;
+};
+
+/**
+ * Get and reset scheduler profiling data for a specific thread.
+ * Returns accumulated profiling data and resets counters.
+ */
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
+#endif
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp
new file mode 100644
index 000000000..4dd0cb28d
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp
@@ -0,0 +1,1093 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <cinttypes>
+#include <cstdio>
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/platform_regs.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/memory_barrier.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// =============================================================================
+// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache)
+// =============================================================================
+
+static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) {
+    if (header == nullptr || error_code == PTO2_ERROR_NONE) {
+        return;
+    }
+    // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads.
+    int32_t expected = PTO2_ERROR_NONE;
+    if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
+        header->sched_error_thread.store(thread_idx, std::memory_order_release);
+    }
+    if (thread_idx >= 0 && thread_idx < 32) {
+        header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
+    }
+}
+
+LoopAction SchedulerContext::handle_orchestrator_exit(
+    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count
+) {
+    if (completed_.load(std::memory_order_acquire)) {
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+    if (orch_err != PTO2_ERROR_NONE) {
+        LOG_ERROR(
+            "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. "
+            "completed_tasks=%d, total_tasks=%d",
+            thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_
+        );
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+    if (sched_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+
+    bool orch_done = orchestrator_done_;
+    if (!orch_done) return LoopAction::NONE;
+
+    task_count = total_tasks_;
+    // task_count == 0 is the fully_distributed_within_core path: orchestration +
+    // scheduling + execution all ran on the AI cores, so nothing was submitted to
+    // shared memory. Once orchestration is done (checked above) an empty SM graph
+    // means there is no AICPU-side work left — complete immediately rather than
+    // spinning forever. The centralized path (task_count > 0) is unchanged.
+    if (completed_tasks_.load(std::memory_order_relaxed) >= task_count) {
+        completed_.store(true, std::memory_order_release);
+        LOG_INFO_V0(
+            "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed),
+            task_count
+        );
+        return LoopAction::BREAK_LOOP;
+    }
+    return LoopAction::NONE;
+}
+
+LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
+    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
+    if (!reassigned_.load(std::memory_order_acquire)) {
+        wait_reassign_.fetch_add(1, std::memory_order_release);
+        while (!reassigned_.load(std::memory_order_acquire)) {
+            if (completed_.load(std::memory_order_acquire)) {
+                return LoopAction::BREAK_LOOP;
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+    cores_released = true;
+    return LoopAction::NONE;
+}
+
+LoopAction
+SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
+    if (completed_.load(std::memory_order_acquire)) {
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+    if (orch_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+    if (sched_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    return LoopAction::NONE;
+}
+
+// =============================================================================
+// Stall diagnostic log format.
+//
+// Every line is self-contained — when scheduler threads emit concurrently and
+// device_log interleaves their output, each line still carries enough context
+// to identify which thread / iteration / object it belongs to.
+//
+// Prefix on every line:
+//   [STALL thread=N idle_iterations=K] CATEGORY ...
+//
+// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL
+// together, so lines with the same idle_iterations belong to one diagnostic
+// round; grep "idle_iterations=N" groups one round's output.
+//
+// Categories (and which thread emits them):
+//   SUMMARY  — completed / total counts and scan totals               (thread 0 only)
+//   TASK     — one per non-completed task scanned from shared rings   (thread 0 only)
+//              - state=RUNNING: includes running_on=[...] cross-ref
+//              - state=READY:   fanin satisfied but no idle core yet
+//              - state=WAIT:    includes missing_deps=N
+//   CLUSTER  — one per cluster owned by this thread                   (every thread)
+//              - busy slot shows kernel + task_id + cond_reg_state;
+//                ANOMALY suffix when COND register is fin while software
+//                still has the slot marked busy.
+//
+// Reader workflow:
+//   1. grep SUMMARY                          -> overall completion status
+//   2. grep "idle_iterations=N TASK"         -> stuck RUNNING task and which
+//                                               core/thread it is on
+//   3. grep "idle_iterations=N CLUSTER.*task=<id>" -> cross-check via the
+//                                                     cluster line (or just
+//                                                     read running_on in step 2)
+// =============================================================================
+
+namespace {
+
+// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines.
+// Layout (idle):    coreN(idle)
+// Layout (busy):    coreN(busy kernel=K task=T cond_reg_state=ack)
+// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY)
+//
+// Healthy busy: COND register reports ack (AICore still executing). fin means
+// AICore wrote completion but AICPU hasn't recycled the running slot yet —
+// either a completion-poll bug or the diagnostic raced the recycle.
+void format_core_status(
+    char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond
+) {
+    if (idle) {
+        snprintf(buf, buf_size, "core%d(idle)", core_id);
+        return;
+    }
+    int32_t kernel = -1;
+    int64_t task_id_raw = -1;
+    if (core_state && core_state->running_slot_state) {
+        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
+        kernel = core_state->running_slot_state->task->kernel_id[subslot];
+        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
+    }
+    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
+    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
+    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
+    if (hw_state == TASK_ACK_STATE) {
+        snprintf(
+            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw,
+            cond_reg_state_str
+        );
+    } else {
+        snprintf(
+            buf, buf_size,
+            "core%d(busy kernel=%d task=%" PRId64
+            " cond_reg_state=%s ANOMALY cond_tok=%d running_tok=%d pending_tok=%d)",
+            core_id, kernel, task_id_raw, cond_reg_state_str, EXTRACT_TASK_ID(cond_reg),
+            core_state->running_reg_task_id, core_state->pending_reg_task_id
+        );
+    }
+}
+
+}  // namespace
+
+int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        const int32_t *ids = core_trackers_[t].core_ids();
+        int32_t n = core_trackers_[t].core_num();
+        for (int32_t i = 0; i < n; i++) {
+            if (ids[i] == core_id) return t;
+        }
+    }
+    return -1;
+}
+
+bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
+    const int32_t *cores = core_trackers_[thread_idx].core_ids();
+    int32_t core_num = core_trackers_[thread_idx].core_num();
+    for (int32_t i = 0; i < core_num; i++) {
+        if (core_exec_states_[cores[i]].running_slot_state != nullptr) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool SchedulerContext::no_thread_owns_running_task() const {
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        if (self_owns_running_task(t)) return false;
+    }
+    return true;
+}
+
+void SchedulerContext::log_stall_diagnostics(
+    int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+
+    // T0 owns the shared-ring scan; printing it from other threads would
+    // produce identical TASK lines once per scheduler thread.
+    if (thread_idx == 0) {
+        int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
+            int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
+            submitted_in_ring += ring_task_count;
+            for (int32_t si = 0; si < ring_task_count; si++) {
+                PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
+                PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
+                int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
+                int32_t fi = slot_state.fanin_count;
+                int32_t kid_aic = slot_state.task->kernel_id[0];
+                int32_t kid_aiv0 = slot_state.task->kernel_id[1];
+                int32_t kid_aiv1 = slot_state.task->kernel_id[2];
+                int64_t task_id = static_cast<int64_t>(slot_state.task->task_id.raw);
+                if (st >= PTO2_TASK_COMPLETED) continue;
+                // task_state has no intermediate ready/running value — it
+                // stays PENDING until the worker stores COMPLETED. Classify
+                // by the ground truth instead: a slot is RUNNING iff some
+                // core has it as running_slot_state. A task occupies at most
+                // 3 cores (one cluster), all under the same owner thread by
+                // construction of assign_cores_to_threads.
+                char running_on[192] = {0};
+                int32_t owner = -1;
+                int32_t pos = 0;
+                bool is_running = false;
+                for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) {
+                    if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
+                    is_running = true;
+                    if (owner < 0) owner = find_core_owner_thread(cid);
+                    const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
+                    int32_t written = snprintf(
+                        running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname
+                    );
+                    if (written > 0) pos += written;
+                }
+
+                if (is_running) {
+                    cnt_running++;
+                    if (cnt_running > STALL_DUMP_READY_MAX) continue;
+                    LOG_INFO_V9(
+                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                        " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] "
+                        "running_on=[owner_thread=%d cores=[%s]]",
+                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on
+                    );
+                    continue;
+                }
+                if (rc >= fi) {
+                    cnt_ready++;
+                    if (cnt_ready > STALL_DUMP_READY_MAX) continue;
+                    LOG_INFO_V9(
+                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                        " state=READY   fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]",
+                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1
+                    );
+                    continue;
+                }
+                cnt_waiting++;
+                if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
+                LOG_INFO_V9(
+                    "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                    " state=WAIT    fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d",
+                    thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc
+                );
+            }
+        }
+        int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring;
+        int32_t c = completed_tasks_.load(std::memory_order_relaxed);
+        LOG_INFO_V9(
+            "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d "
+            "scan_ready=%d scan_waiting=%d scan_running=%d",
+            thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running
+        );
+    }
+
+    // CLUSTER lines: one per cluster this thread owns.
+    // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
+    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
+    int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+    for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
+        int32_t offset = cli * 3;
+        int32_t aic_id = tracker.get_aic_core_id(offset);
+        int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
+        int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
+        bool aic_idle = tracker.is_aic_core_idle(offset);
+        bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
+        bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
+        int32_t cluster_id = cli * ast + thread_idx;
+        char aic_buf[192], aiv0_buf[192], aiv1_buf[192];
+        format_core_status(
+            aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr
+        );
+        format_core_status(
+            aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id],
+            core_exec_states_[aiv0_id].reg_addr
+        );
+        format_core_status(
+            aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id],
+            core_exec_states_[aiv1_id].reg_addr
+        );
+        LOG_INFO_V9(
+            "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx,
+            idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf
+        );
+    }
+}
+
+void SchedulerContext::log_shutdown_stall_snapshot(
+    int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+) {
+    LOG_WARN(
+        "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] "
+        "dumping all scheduler threads before emergency shutdown",
+        trigger_thread_idx, trigger_idle_iterations
+    );
+    int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+    if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) {
+        LOG_ERROR(
+            "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx,
+            thread_count, MAX_AICPU_THREADS
+        );
+        thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
+    }
+    for (int32_t t = 0; t < thread_count; t++) {
+        log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count);
+    }
+}
+
+int32_t SchedulerContext::handle_timeout_exit(
+    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+    int32_t last_progress_count
+#if PTO2_PROFILING
+    ,
+    uint64_t sched_start_ts
+#endif
+) {
+    LOG_ERROR(
+        "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations,
+        idle_iterations
+    );
+    latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
+    if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+        log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count);
+#if PTO2_PROFILING
+        // Capture the in-flight kernels' partial output before signalling the
+        // cores to exit, so the dump reflects the live stuck state.
+        if (is_dump_args_enabled()) {
+            dump_running_task_outputs<PTO2_SUBTASK_SLOT_COUNT>(
+                thread_idx, cores_total_num_,
+                [this](int32_t cid) {
+                    return core_exec_states_[cid].running_slot_state;
+                },
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+        emergency_shutdown(runtime);
+    }
+#if PTO2_PROFILING
+    uint64_t sched_timeout_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx,
+        static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts),
+        cycles_to_us(sched_timeout_ts - sched_start_ts)
+    );
+#endif
+    return -PTO2_ERROR_SCHEDULER_TIMEOUT;
+}
+
+#if PTO2_PROFILING
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    uint64_t sched_end_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
+        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
+        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
+    );
+
+    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
+                           l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
+    if (sched_total == 0) sched_total = 1;
+
+#if PTO2_SCHED_PROFILING
+    {
+        PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
+        uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
+        uint64_t complete_poll =
+            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
+                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
+                0;
+        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
+                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
+                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
+                                      l2_swimlane.sched_dispatch_setup_cycle) :
+                                     0;
+
+        LOG_INFO_V9(
+            "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
+            cycles_to_us(sched_total), cur_thread_completed
+        );
+
+        // fanout / fanin per-thread aggregates live in
+        // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
+        // × core_to_thread).
+        LOG_INFO_V9(
+            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
+            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
+        );
+
+        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
+        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
+                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
+                                           0;
+        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
+                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
+                                       0.0;
+        LOG_INFO_V9(
+            "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
+            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
+            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
+            complete_hit_rate
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_lock     : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
+            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
+            static_cast<uint64_t>(sp.lock_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_fanout   : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
+            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
+            static_cast<uint64_t>(sp.fanout_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_fanin    : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
+            static_cast<uint64_t>(sp.fanin_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_self     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
+            static_cast<uint64_t>(sp.self_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
+            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
+            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
+        );
+
+        LOG_INFO_V9(
+            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
+            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
+        );
+
+        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
+        LOG_INFO_V9(
+            "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
+            dispatch_poll * 100.0 / d_parent
+        );
+        LOG_INFO_V9(
+            "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
+            static_cast<uint64_t>(sp.pop_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
+            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
+            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
+        );
+
+#if PTO2_SCHED_PROFILING
+        LOG_INFO_V9(
+            "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
+            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
+            l2_swimlane.phase_wiring_count
+        );
+#else
+        LOG_INFO_V9(
+            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
+            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
+        );
+#endif
+
+        LOG_INFO_V9(
+            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
+            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
+        );
+
+        if (cur_thread_completed > 0) {
+            LOG_INFO_V9(
+                "Thread %d:   avg/complete   : %.3fus", thread_idx,
+                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
+            );
+        }
+    }
+#endif
+    LOG_INFO_V9(
+        "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
+        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
+    );
+}
+#endif
+
+// =============================================================================
+// Shutdown: deinit AICore regs for this thread's cores (and PMU finalize if enabled).
+// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op.
+// platform_deinit_aicore_regs is idempotent; safe to call after early completion.
+// =============================================================================
+int32_t SchedulerContext::shutdown(int32_t thread_idx) {
+    const int32_t *cores = core_trackers_[thread_idx].core_ids();
+    int32_t core_num = core_trackers_[thread_idx].core_num();
+    if (core_num == 0) return 0;
+
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) {
+        pmu_aicpu_finalize(cores, core_num);
+    }
+#endif
+
+    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num);
+    int32_t rc = 0;
+    for (int32_t i = 0; i < core_num; i++) {
+        int32_t core_id = cores[i];
+        uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
+        if (reg_addr != 0) {
+            // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
+            if (platform_deinit_aicore_regs(reg_addr) != 0) {
+                LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
+                rc = -1;
+            }
+        } else {
+            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
+        }
+    }
+    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
+    return rc;
+}
+
+// =============================================================================
+// Handshake with all AICore workers; discover core type and reg address.
+// =============================================================================
+int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
+    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+    cores_total_num_ = runtime->worker_count;
+
+    // Validate cores_total_num_ before using as array index
+    if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) {
+        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER);
+        return -1;
+    }
+
+    aic_count_ = 0;
+    aiv_count_ = 0;
+
+    LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
+
+    // Step 1: Write per-core payload addresses and send handshake signal.
+    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
+    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
+        OUT_OF_ORDER_STORE_BARRIER();
+        all_handshakes[i].aicpu_ready = 1;
+    }
+    OUT_OF_ORDER_STORE_BARRIER();
+
+    // Get platform physical cores count for validation
+    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
+
+    // Step 2: Wait for all cores to respond, collect core type and register addresses
+    bool handshake_failed = false;
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        Handshake *hank = &all_handshakes[i];
+
+        while (hank->aicore_regs_ready == 0) {
+            SPIN_WAIT_HINT();
+        }
+
+        uint32_t physical_core_id = hank->physical_core_id;
+
+        if (physical_core_id >= max_physical_cores_count) {
+            LOG_ERROR(
+                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
+                max_physical_cores_count
+            );
+            handshake_failed = true;
+            continue;
+        }
+
+        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+        uint64_t reg_addr = regs[physical_core_id];
+
+        // Initialize AICore registers after discovery (first round)
+        platform_init_aicore_regs(reg_addr);
+        OUT_OF_ORDER_STORE_BARRIER();
+        hank->aicpu_regs_ready = 1;
+
+        OUT_OF_ORDER_STORE_BARRIER();
+
+        while (hank->aicore_done == 0) {
+            SPIN_WAIT_HINT();
+        }
+
+        CoreType type = hank->core_type;
+
+        core_exec_states_[i].reg_addr = reg_addr;
+        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+
+#if PTO2_PROFILING
+        // Record physical_core_id for PMU init later (CoreExecState has no room
+        // for this field under PTO2_PROFILING).
+        physical_core_ids_[i] = physical_core_id;
+#endif
+#if !PTO2_PROFILING
+        core_exec_states_[i].worker_id = i;
+        core_exec_states_[i].physical_core_id = physical_core_id;
+        core_exec_states_[i].core_type = type;
+#endif
+
+        if (type == CoreType::AIC) {
+            aic_worker_ids_[aic_count_++] = i;
+            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
+        } else {
+            aiv_worker_ids_[aiv_count_++] = i;
+            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
+        }
+    }
+
+    if (handshake_failed) {
+        emergency_shutdown(runtime);
+        return -1;
+    }
+
+    LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
+    return 0;
+}
+
+// =============================================================================
+// Assign discovered cores to scheduler threads (cluster-aligned round-robin).
+// =============================================================================
+bool SchedulerContext::assign_cores_to_threads() {
+    // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
+    // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
+    active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+    int32_t cluster_count = aic_count_;
+
+    // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
+    int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
+    int32_t thread_cores_num = max_clusters_per_thread * 3;
+
+    if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) {
+        LOG_ERROR("Can't assign more then 64 cores in per scheduler");
+        return false;
+    }
+
+    LOG_INFO_V0(
+        "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count,
+        active_sched_threads_, aic_count_, aiv_count_
+    );
+
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Count clusters per thread first (round-robin may distribute unevenly)
+    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        clusters_per_thread[ci % active_sched_threads_]++;
+    }
+    for (int32_t i = 0; i < active_sched_threads_; i++) {
+        core_trackers_[i].init(clusters_per_thread[i]);
+    }
+
+    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        int32_t t = ci % active_sched_threads_;
+
+        int32_t aic_wid = aic_worker_ids_[ci];
+        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+        core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
+
+        LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid);
+    }
+
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        LOG_INFO_V0(
+            "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(),
+            core_trackers_[t].get_cluster_count()
+        );
+    }
+
+    LOG_INFO_V0(
+        "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num
+    );
+    return true;
+}
+
+// =============================================================================
+// Reassign all cores across all threads (sched + orchestrator) after orchestration.
+// =============================================================================
+void SchedulerContext::reassign_cores_for_all_threads() {
+    LOG_INFO_V0(
+        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
+    );
+
+    // Collect running worker_ids from all current trackers
+    bool running_cores[RUNTIME_MAX_WORKER] = {};
+    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
+        auto all_running = core_trackers_[i].get_all_running_cores();
+        int32_t bp;
+        while ((bp = all_running.pop_first()) >= 0) {
+            running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
+        }
+    }
+
+    // Count clusters per thread (round-robin across all threads)
+    int32_t cluster_count = aic_count_;
+    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        clusters_per_thread[ci % aicpu_thread_num_]++;
+    }
+
+    // Re-init all trackers and reset core counts
+    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
+        core_trackers_[i].init(clusters_per_thread[i]);
+    }
+
+    // Assign clusters round-robin and restore running state
+    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        int32_t t = ci % aicpu_thread_num_;
+
+        int32_t aic_wid = aic_worker_ids_[ci];
+        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+        int32_t cl_idx = cluster_idx_per_thread[t]++;
+        core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
+
+        // init() marks all idle; toggle cores that were running and restore pending_occupied
+        if (running_cores[aic_wid]) {
+            core_trackers_[t].change_core_state(cl_idx * 3);
+            core_trackers_[t].set_pending_occupied(cl_idx * 3);
+        }
+        if (running_cores[aiv0_wid]) {
+            core_trackers_[t].change_core_state(cl_idx * 3 + 1);
+            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
+        }
+        if (running_cores[aiv1_wid]) {
+            core_trackers_[t].change_core_state(cl_idx * 3 + 2);
+            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
+        }
+    }
+
+    // Log final distribution
+    LOG_INFO_V0("Core reassignment complete:");
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        int32_t aic_running = core_trackers_[t].get_running_count<CoreType::AIC>();
+        int32_t aiv_running = core_trackers_[t].get_running_count<CoreType::AIV>();
+        LOG_INFO_V0(
+            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(),
+            core_trackers_[t].get_cluster_count(), aic_running, aiv_running
+        );
+    }
+    active_sched_threads_ = aicpu_thread_num_;
+}
+
+// =============================================================================
+// Emergency shutdown: broadcast exit signal to every handshake'd core and
+// deinit their AICore register blocks. Idempotent.
+// =============================================================================
+void SchedulerContext::emergency_shutdown(Runtime *runtime) {
+    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
+    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+    int32_t timeout_count = 0;
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        Handshake *hank = &all_handshakes[i];
+        OUT_OF_ORDER_STORE_BARRIER();
+        hank->aicpu_regs_ready = 1;
+        if (core_exec_states_[i].reg_addr != 0) {
+            if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) {
+                timeout_count++;
+            }
+        }
+    }
+    if (timeout_count > 0) {
+        LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count);
+    }
+    LOG_WARN("Emergency shutdown complete");
+}
+
+// =============================================================================
+// Lifecycle: init / deinit
+// =============================================================================
+int32_t SchedulerContext::init(
+    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
+) {
+    always_assert(runtime != nullptr);
+
+    // Zero all per-core execution state before handshake
+    memset(core_exec_states_, 0, sizeof(core_exec_states_));
+
+    // Wire thread/transition configuration that handshake/assign need to read.
+    aicpu_thread_num_ = aicpu_thread_num;
+    sched_thread_num_ = sched_thread_num;
+    orch_to_sched_ = orch_to_sched;
+    regs_ = regs_base;
+
+#if PTO2_PROFILING
+    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
+    // header — must be called BEFORE caching the level, otherwise the cached
+    // value would still be 0 (only the binary enable bit has been seeded by
+    // kernel.cpp at this point). Reset the cached level on disabled runs so a
+    // prior enabled launch's level can't leak into the phase-record gates in
+    // scheduler_dispatch.
+    if (is_l2_swimlane_enabled()) {
+        l2_swimlane_aicpu_init(runtime->worker_count);
+        l2_swimlane_level_ = get_l2_swimlane_level();
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            // Sched-phase pool count: matches the dump_args_init branch in
+            // scheduler_dispatch.cpp. sched_thread_num_ <= 0 means "use all
+            // AICPU threads as scheduler threads" (see assign_cores_to_threads'
+            // active_sched_threads_ normalization at line 689). Without this
+            // normalization here, init_phase would prime zero sched pools and
+            // all sched_phase emits would silently drop.
+            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
+            // Orchestration is always single-threaded, so orch-phase is one pool
+            // (ordinal 0) in both modes — see record_orch_phase.
+            const int orch_phase_threads = 1;
+            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads);
+        }
+    } else {
+        l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
+    }
+#endif
+
+    // Discover cores and assign to scheduler threads.
+    int32_t rc = handshake_all_cores(runtime);
+    if (rc != 0) {
+        LOG_ERROR("handshake_all_cores failed");
+        return rc;
+    }
+    if (!assign_cores_to_threads()) {
+        return -1;
+    }
+
+    // Initialize task counters. Task count comes from PTO2 shared memory.
+    if (runtime->get_gm_sm_ptr()) {
+        auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
+        // Read at one-time boot init, before the SM is reset for the run, so a
+        // ring not yet written holds uninitialized memory (0xbe... under ASAN's
+        // malloc-fill). Sum in int64 and only count rings whose value is a
+        // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold
+        // more than the scope cap. This rejects any garbage pattern (negative
+        // or positive), so uninitialized rings contribute 0 (the correct boot
+        // count) while valid counts still add up, with no signed overflow.
+        int64_t pto2_count = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+            if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
+        }
+        total_tasks_ = static_cast<int32_t>(pto2_count);
+    } else {
+        total_tasks_ = 0;
+    }
+    completed_tasks_.store(0, std::memory_order_release);
+
+    // Device orchestration: the orchestrator thread flips this when the graph is built.
+    orchestrator_done_ = false;
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
+    // This is done once at startup and never modified afterwards.
+    for (int32_t t = 0; t < sched_thread_num_; t++) {
+        CoreTracker &tracker = core_trackers_[t];
+        for (int32_t c = 0; c < tracker.get_cluster_count(); c++) {
+            int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
+            auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
+            auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
+            payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
+            payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
+            payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
+            payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
+        }
+    }
+
+    func_id_to_addr_ = runtime->func_id_to_addr_;
+
+    return 0;
+}
+
+void SchedulerContext::deinit() {
+    // Reset all per-core execution state
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i] = {};
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Reset sync-start drain coordination — a previous run that aborted mid-drain
+    // would otherwise leave dirty pending/elected/ack state for the next reuse.
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+
+    // Reset task counters and orchestrator state
+    completed_tasks_.store(0, std::memory_order_release);
+    total_tasks_ = 0;
+    orchestrator_done_ = false;
+    pto2_init_claimed_.store(false, std::memory_order_release);
+    pto2_init_complete_.store(false, std::memory_order_release);
+
+    // Reset core transition state
+    transition_requested_.store(false, std::memory_order_release);
+    wait_reassign_.store(0, std::memory_order_release);
+    reassigned_.store(false, std::memory_order_release);
+    completed_.store(false, std::memory_order_release);
+
+    // Reset core discovery and assignment state
+    aic_count_ = 0;
+    aiv_count_ = 0;
+    cores_total_num_ = 0;
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+    active_sched_threads_ = 0;
+    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
+        core_trackers_[t] = CoreTracker{};
+    }
+
+    regs_ = 0;
+    sched_ = nullptr;
+    rt_ = nullptr;
+    func_id_to_addr_ = nullptr;
+}
+
+void SchedulerContext::wait_pto2_init_complete() const {
+    while (!pto2_init_complete_.load(std::memory_order_acquire)) {
+        SPIN_WAIT_HINT();
+    }
+}
+
+void SchedulerContext::bind_runtime(PTO2Runtime *rt) {
+    rt_ = rt;
+    sched_ = &rt->scheduler;
+}
+
+// =============================================================================
+// Post-orchestration bookkeeping. Runs on the orchestrator thread once the
+// build phase finishes; folds inline-completed tasks, flips orchestrator_done_,
+// and drives the orchestrator → scheduler core transition (or fatal shutdown).
+// =============================================================================
+void SchedulerContext::on_orchestration_done(
+    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
+) {
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
+        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
+        // The orchestrator has no scheduler-phase pool of its own — those belong
+        // to the scheduler threads and are flushed in scheduler_dispatch.
+        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
+    }
+#endif
+
+    total_tasks_ = total_tasks;
+
+    // Fold tasks completed inline during orchestration
+    int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
+    if (inline_completed > 0) {
+        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+#if PTO2_SCHED_PROFILING
+        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
+#endif
+    }
+    orchestrator_done_ = true;
+
+    // Check for fatal error from orchestration; if so, shut down immediately.
+    int32_t orch_err = 0;
+    if (sched_->sm_header) {
+        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+    }
+    if (orch_err != PTO2_ERROR_NONE) {
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+    }
+
+    // Skip core transition on fatal error — cores already shut down above.
+    if (completed_.load(std::memory_order_acquire)) {
+        // Signal transition to unblock scheduler threads waiting at core transition
+        transition_requested_.store(true, std::memory_order_release);
+        reassigned_.store(true, std::memory_order_release);
+    } else if (orch_to_sched_) {
+        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
+        transition_requested_.store(true, std::memory_order_release);
+
+        // Wait for scheduler threads to acknowledge transition request
+        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
+            if (completed_.load(std::memory_order_acquire)) {
+                break;
+            }
+            SPIN_WAIT_HINT();
+        }
+        if (!completed_.load(std::memory_order_acquire)) {
+            reassign_cores_for_all_threads();
+            reassigned_.store(true, std::memory_order_release);
+        }
+    }
+
+#if PTO2_PROFILING
+    // Write core-to-thread mapping AFTER reassignment so the profiling data
+    // reflects the final distribution (all active_sched_threads_, including
+    // former orchestrator threads when orch_to_sched_ is enabled).
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
+        for (int32_t t = 0; t < active_sched_threads_; t++) {
+            l2_swimlane_aicpu_write_core_assignments_for_thread(
+                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
+            );
+        }
+    }
+#endif
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp
new file mode 100644
index 000000000..774589865
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <algorithm>
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+// =============================================================================
+// Dual-slot state machine helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
+}
+
+// Pure function: read register result -> SlotTransition (no side effects).
+SlotTransition SchedulerContext::decide_slot_transition(
+    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated
+) {
+    SlotTransition t;
+    if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id) {
+        t.matched = true;
+        t.running_done = true;  // Serial execution: pending event implies running done
+        t.running_freed = true;
+        t.pending_freed = true;
+        if (reg_state == TASK_FIN_STATE) {
+            t.pending_done = true;  // Case 1: pending FIN
+        }
+        // else: Case 2: pending ACK (pending_done stays false)
+    } else if (reg_task_id == running_id) {
+        if (reg_state == TASK_FIN_STATE) {
+            if (pending_id == AICPU_TASK_INVALID) {
+                // Case 3.2: running FIN, no pending -> core goes idle
+                t.matched = true;
+                t.running_done = true;
+                t.running_freed = true;
+            } else if (pending_gated) {
+                // Case 3.3: running FIN, pending is a SPECULATIVE GATED task. The
+                // Case 3.1 "wait for the pending's ack" shortcut assumes the AICore
+                // immediately runs the pending task; a gated task instead spins on
+                // its doorbell and never acks until its producer completes — and
+                // that producer's completion depends on collecting THIS running FIN.
+                // Waiting would deadlock. Complete the running FIN now and promote
+                // the gated task (it then skip-gates until its doorbell). pending is
+                // NOT freed (it promotes, not retires) so the bitmap update keeps the
+                // core off-limits — no second gated block, no doorbell overwrite.
+                t.matched = true;
+                t.running_done = true;
+                t.running_freed = true;
+            }
+            // Case 3.1: running FIN, NON-gated pending exists -> skip (transient
+            // state). Case 1/2 (pending ack/FIN) completes running implicitly.
+        } else {
+            // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
+            t.matched = true;
+            t.pending_freed = true;
+        }
+    }
+    return t;
+}
+
+// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
+void SchedulerContext::complete_slot_task(
+    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
+    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+    ,
+    uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#else
+    (void)hank;
+#endif
+    // MPSC fast-path is opt-in per task: only tasks with at least one subtask
+    // that registered a deferred condition route through the mailbox. Pure
+    // non-deferred tasks complete inline on this thread (matching pre-MPSC
+    // behavior — keeps the common case parallelized across scheduler threads
+    // instead of serializing through the single consumer). The
+    // any_subtask_deferred flag on slot_state is the discriminator; it's set
+    // (release) before on_subtask_complete and read (acquire) after, so the
+    // last subtask sees flag writes from any earlier subtask of the same task.
+    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
+    bool defer_completion_to_consumer = false;
+
+    if (slot_state.payload != nullptr) {
+        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
+        int32_t slab_err = deferred_slab->error_code;
+        if (slab_err != PTO2_ERROR_NONE) {
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        uint32_t cond_count = deferred_slab->count;
+        if (cond_count > MAX_COMPLETIONS_PER_TASK) {
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        if (cond_count > 0) {
+            // Publish "this task is deferred" before on_subtask_complete so the
+            // acq_rel fetch_add inside on_subtask_complete makes the flag
+            // visible to whichever subtask sees task_complete=true (which may
+            // be this thread or a later one).
+            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
+
+            const PTO2TaskId token = slot_state.task->task_id;
+            for (uint32_t i = 0; i < cond_count; ++i) {
+                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
+                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                    SPIN_WAIT_HINT();
+                }
+            }
+        }
+    }
+
+    bool task_complete = sched_->on_subtask_complete(slot_state);
+
+#if PTO2_PROFILING
+    // Sub-block retire that did not finish the slot: record it so the poll
+    // iteration becomes visible on the scheduler lane (the SPMD harvest tail).
+    if (!task_complete && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane.phase_subretire_count++;
+    }
+#endif
+
+    if (task_complete && slot_state.payload != nullptr &&
+        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
+        // Some subtask of this task registered conditions; finish the
+        // registration by handing the slot_state off to the consumer.
+        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state))) {
+            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+            SPIN_WAIT_HINT();
+        }
+        defer_completion_to_consumer = true;
+    }
+
+    if (task_complete && !defer_completion_to_consumer) {
+#if PTO2_PROFILING
+        if (is_dump_args_enabled()) {
+            dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+#if PTO2_PROFILING
+        // Time Resolve (walk the consumer list, decrement each consumer's
+        // fanin, push the newly-ready ones, ring doorbells for speculative
+        // hits) so it renders as a child bar nested inside this iteration's
+        // Complete bar. The 1 µs floor below filters out the ~88% of tasks
+        // with 1-2 consumers (~500 ns Resolve) so only the long broadcast /
+        // reduction walks stand out on the lane.
+        uint64_t resolve_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+        // [[maybe_unused]] silences -Werror=unused-but-set-variable on the
+        // profiling-flags-smoke build path where PTO2_PROFILING is OFF and
+        // the Resolve emit below is excluded.
+        [[maybe_unused]] uint32_t consumers_resolved = 0;
+#if PTO2_SCHED_PROFILING
+        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
+        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
+        // by the otc_* log lines). It returns CompletionStats whose
+        // `fanout_edges` is the consumer-walk count.
+        consumers_resolved = sched_->on_task_complete(slot_state, thread_idx, local_bufs).fanout_edges;
+#else
+        consumers_resolved = sched_->on_task_complete(slot_state, local_bufs);
+#endif
+#if PTO2_PROFILING
+        if (resolve_t0 != 0) {
+            uint64_t resolve_t1 = get_sys_cnt_aicpu();
+            // Filter: drop Resolve bars under 1 µs so the lane shows only
+            // resolves that did meaningful work (high consumer counts or
+            // doorbells). 50 cycles @ 50 MHz = 1 µs (PLATFORM_PROF_SYS_CNT_FREQ
+            // is the device sys-cnt frequency).
+            constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
+            if (resolve_t1 - resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Resolve, resolve_t0, resolve_t1, l2_swimlane.sched_loop_count,
+                    consumers_resolved
+                );
+            }
+        }
+        l2_swimlane.phase_complete_count++;
+#endif
+        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        } else {
+            LOG_INFO_V9("Thread %d: release", thread_idx);
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                // SCHED_PROFILING variant takes thread_idx for the per-thread
+                // atomic counter side-effects. The return value is unused.
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        }
+        completed_this_turn++;
+    }
+
+#if PTO2_PROFILING
+    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
+    // {start, end, task_token_raw}, host resolves func_id/core_type from
+    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
+    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
+    // timestamps via complete_task. Bypassing here saves the per-completion
+    // hot-path cost (counter inc + ring lookup + record store + wmb + buffer
+    // rotation bookkeeping) for runs that only want AICore timing.
+    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+#if PTO2_SCHED_PROFILING
+        uint64_t t_perf_start = get_sys_cnt_aicpu();
+#endif
+
+        if (l2_swimlane_aicpu_complete_task(
+                core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), dispatch_ts, finish_ts
+            ) != 0) {
+            LOG_ERROR(
+                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
+                static_cast<uint64_t>(slot_state.task->task_id.raw)
+            );
+        }
+#if PTO2_SCHED_PROFILING
+        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
+#endif
+    }
+
+    if (is_pmu_enabled()) {
+        pmu_aicpu_record_task(
+            core_id, thread_idx, slot_state.task->task_id.raw,
+            slot_state.task->kernel_id[static_cast<int32_t>(subslot)], hank[core_id].core_type
+        );
+    }
+#endif
+}
+
+// Promote pending slot data to running slot. Clears pending fields.
+void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
+    core.running_slot_state = core.pending_slot_state;
+    core.running_reg_task_id = core.pending_reg_task_id;
+    core.running_subslot = core.pending_subslot;
+#if PTO2_PROFILING
+    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
+#endif
+    core.pending_slot_state = nullptr;
+    core.pending_reg_task_id = AICPU_TASK_INVALID;
+}
+
+// Clear running slot (core becomes idle).
+void SchedulerContext::clear_running_slot(CoreExecState &core) {
+    core.running_slot_state = nullptr;
+    core.running_reg_task_id = AICPU_TASK_INVALID;
+}
+
+void SchedulerContext::check_running_cores_for_completion(
+    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+    PTO2LocalReadyBuffer *local_bufs
+) {
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto running_core_states = tracker.get_all_running_cores();
+    while (running_core_states.has_value()) {
+        int32_t bit_pos = running_core_states.pop_first();
+        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+        CoreExecState &core = core_exec_states_[core_id];
+
+        // Skip gated speculative cores. A STAGED task is parked on this core
+        // waiting for its doorbell — it physically cannot ACK/FIN yet, so
+        // reading its COND (MMIO, and the core is hot-spinning on its own SPR)
+        // every poll is pure waste that drags out the completion phase. The
+        // doorbell (try_speculative_release) flips spec_state to DISPATCHED, at
+        // which point the core becomes pollable again and its FIN is caught.
+        // Cheap cacheable load; no MMIO. Pending slot is empty while gated.
+        {
+            PTO2TaskSlotState *rs = core.running_slot_state;
+            if (rs != nullptr && rs->payload != nullptr &&
+                rs->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) {
+                continue;
+            }
+        }
+
+        // --- Judgment phase: read register, derive transition ---
+        // Use the precomputed cond_ptr (resolved once in handshake) to skip
+        // the reg_offset switch and reg_addr addition on every poll.
+        uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
+        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
+        // rmb() pins any AICore-published cacheable reads downstream of the
+        // FIN observation. Replaces the post-`__sync_synchronize` that the
+        // old read_reg() helper carried implicitly.
+        rmb();
+        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled) {
+            l2_swimlane.complete_probe_count++;
+        }
+#endif
+
+        // A pending task is "gated" when it is a speculative pre-stage still
+        // waiting on its doorbell (STAGED): it will not ack on the producer's FIN,
+        // so the Case 3.1 wait-for-pending-ack shortcut would deadlock. Detect it
+        // so decide_slot_transition completes the running FIN and promotes it.
+        bool pending_gated =
+            (core.pending_slot_state != nullptr && core.pending_slot_state->payload != nullptr &&
+             core.pending_slot_state->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING);
+        SlotTransition t = decide_slot_transition(
+            reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id, pending_gated
+        );
+        if (!t.matched) continue;
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
+            l2_swimlane.complete_hit_count++;
+        }
+#endif
+
+#if PTO2_PROFILING
+        // Capture finish_ts at the FIN observation point — right after rmb()
+        // above pinned the cacheable AICore reads downstream of the register
+        // load, and BEFORE any fanin / deferred-release work. Anything later
+        // (slot transition apply, complete_slot_task fanin processing) would
+        // charge AICPU completion-processing cost to the (end → finish)
+        // span, masking the actual FIN-delivery latency.
+        uint64_t finish_ts = 0;
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
+            finish_ts = get_sys_cnt_aicpu();
+        }
+#endif
+
+        // --- Apply phase: execute actions based on transition ---
+
+        // 1. Complete finished tasks (capture pointers before modifying core state)
+        if (t.pending_done) {
+            complete_slot_task(
+                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.pending_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+        if (t.running_done) {
+            complete_slot_task(
+                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.running_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+
+        // 2. Update slot data
+        if (t.running_freed) {
+            if (core.pending_slot_state != nullptr && !t.pending_done) {
+                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+            } else {
+                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                if (t.pending_done) {
+                    // Case 1: pending FIN observed directly -- clear stale pending fields.
+                    // Without this, pending_reg_task_id retains a stale value that blocks
+                    // clear_pending_occupied and permanently degrades pipelining.
+                    core.pending_slot_state = nullptr;
+                    core.pending_reg_task_id = AICPU_TASK_INVALID;
+                }
+            }
+        }
+
+        // 3. Update tracker bitmap
+        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
+        if (is_idle) {
+            tracker.change_core_state(bit_pos);       // Mark idle
+            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
+            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
+            // when no pending task is currently held. Otherwise pending slot is occupied
+            // by a pre-loaded task and must stay protected.
+            tracker.clear_pending_occupied(bit_pos);
+        }
+
+        // 4. Progress signal (only when running task completes)
+        if (t.running_done) {
+            made_progress = true;
+        }
+    }
+}
+
+// =============================================================================
+// sync_start drain protocol
+// =============================================================================
+
+// Take ownership of slot_state and signal all threads to enter drain mode.
+// Returns true if this thread won the CAS and owns the drain slot.
+// Returns false if another thread already holds drain; caller must re-push slot_state.
+//
+// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and
+// reset election flag, then release-store block_num.  Other threads acquire-load
+// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible.
+bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
+    int32_t expected = 0;
+    if (!drain_state_.sync_start_pending.compare_exchange_strong(
+            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
+        )) {
+        return false;  // Another thread already holds the drain slot.
+    }
+    // We own the drain slot.  Store the task and reset election flag before making it visible.
+    drain_state_.pending_task.store(slot_state, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    // Release store: all stores above are now visible to any thread that
+    // acquire-loads sync_start_pending and sees block_num > 0.
+    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
+    return true;
+}
+
+// Count total available resources across all scheduler threads for a given shape.
+int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
+    int32_t total = 0;
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (shape == PTO2ResourceShape::MIX) {
+            total += core_trackers_[t].count_mix_running_clusters(core_mask);
+        } else {
+            total += core_trackers_[t].get_idle_core_offset_states(shape).count();
+        }
+    }
+    return total;
+}
+
+// Drain worker: dispatch all blocks in one pass across all threads' trackers.
+// Called only when global resources >= block_num, so one pass always suffices.
+// All other threads are spinning -- the drain worker has exclusive tracker access.
+void SchedulerContext::drain_worker_dispatch(int32_t block_num) {
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (!slot_state) {
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    uint8_t core_mask = slot_state->active_mask.core_mask();
+
+    for (int32_t t = 0;
+         t < active_sched_threads_ && slot_state->next_block_idx.load(std::memory_order_relaxed) < block_num; t++) {
+        auto valid = (shape == PTO2ResourceShape::MIX) ?
+                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
+                         core_trackers_[t].get_idle_core_offset_states(shape);
+        int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
+        int32_t remaining = slot_state->logical_block_num - start;
+        int32_t claim = std::min(valid.count(), remaining);
+        slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+        int handle_count = 0;
+        for (int32_t b = 0; b < claim; b++) {
+            auto core_offset = valid.pop_first();
+            handle_count += prepare_block_for_dispatch(
+                t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]
+            );
+        }
+        wmb();
+        uint64_t dispatch_ts = 0;
+#if PTO2_PROFILING
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+            dispatch_ts = get_sys_cnt_aicpu();
+        }
+#endif
+        for (int i = 0; i < handle_count; i++) {
+            publish_subtask_to_core(handles[i], dispatch_ts);
+        }
+    }
+
+    // All blocks dispatched -- clear drain state.
+    // Release fence ensures tracker mutations are visible to threads that
+    // acquire-load sync_start_pending == 0 and resume normal operation.
+    std::atomic_thread_fence(std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+}
+
+// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+//
+// Protocol (single-stage ack barrier):
+//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
+//      until all ack bits are set.
+//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
+//   2. Election: one thread wins the CAS and becomes the drain worker.
+//      If resources are insufficient, reset ack/election fields and return --
+//      all threads resume completion polling to free running cores, then retry.
+//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
+//      Non-elected threads spin-wait until sync_start_pending == 0.
+//      During dispatch the elected thread has exclusive tracker access.
+void SchedulerContext::handle_drain_mode(int32_t thread_idx) {
+    // Every spin in this function honors is_completed(): once the run latches
+    // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave
+    // the dispatch loop and stop participating in the drain. A thread parked in a
+    // drain spin would then wait forever for acks / a gate-open that can no longer
+    // arrive -- the AICPU watchdog never fires here because these spins live
+    // outside the dispatch loop's wall-clock budget, so the hang escalates straight
+    // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on
+    // completed_ is always safe: any pending sync_start task is either already
+    // dispatched (a stale re-popped slot) or moot under teardown, and deinit()
+    // resets drain_state_ before the next run, so leaving it dirty is harmless.
+    // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
+    int32_t block_num;
+    do {
+        if (is_completed()) return;
+        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+    } while (block_num < 0);
+    if (block_num == 0) return;
+
+    uint32_t all_acked = (1u << active_sched_threads_) - 1;
+
+    // Ack barrier -- signal this thread has stopped dispatch.
+    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+    // Spin until all threads have acked.
+    // If our bit is cleared while waiting, elected reset due to insufficient resources.
+    while (true) {
+        if (is_completed()) return;
+        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+        if ((ack & all_acked) == all_acked) break;
+        if ((ack & (1u << thread_idx)) == 0) return;
+        SPIN_WAIT_HINT();
+    }
+
+    // Election -- exactly one thread wins the CAS.
+    int32_t expected = 0;
+    drain_state_.drain_worker_elected.compare_exchange_strong(
+        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
+    );
+
+    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
+        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            if (is_completed()) return;
+            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+        return;
+    }
+
+    // Elected: check if global resources are sufficient.
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (slot_state == nullptr) {
+        // pending_task is observed null only when a concurrent drain completion
+        // already cleared it (drain_worker_dispatch nulls it before reopening the
+        // gate). That drain is done and this is a stale-elected thread, so just
+        // release the election lock and return. Do NOT clear drain_ack_mask or
+        // sync_start_pending: a *new* drain run may already be active and
+        // accumulating acks, and zeroing them would corrupt it into a hang.
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    int32_t available = count_global_available(shape, slot_state->active_mask.core_mask());
+
+    if (available < block_num) {
+        // Insufficient resources -- reset drain fields so threads can resume
+        // completion polling to free running cores, then retry.
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+
+    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+    drain_worker_dispatch(block_num);
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h
new file mode 100644
index 000000000..88bcff170
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_CONTEXT_H
+#define SCHEDULER_CONTEXT_H
+
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/unified_log.h"
+#include "scheduler_types.h"
+
+#include "scheduler/pto_scheduler.h"
+
+#include "aicore_completion_mailbox.h"
+#include "pto2_dispatch_payload.h"
+
+// These macros are defined in runtime.h, but we cannot include it here
+// (it pulls in Handshake which we only forward-declare).  Mirror the
+// authoritative values so the class layout compiles standalone.
+#ifndef RUNTIME_MAX_WORKER
+#define RUNTIME_MAX_WORKER 72
+#endif
+#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_FUNC_ID 1024
+#endif
+
+// Forward declarations — avoid pulling in full headers for pointer/reference params.
+class Runtime;
+struct Handshake;
+struct PTO2Runtime;
+
+/**
+ * SchedulerContext: owns all scheduler-side state and methods.
+ *
+ * Held as a member of AicpuExecutor (sched_ctx_).  The single public entry
+ * point is resolve_and_dispatch(), called once per scheduler thread.
+ *
+ * All dispatch/completion/drain/cold-path logic is implemented as private
+ * member methods, split across three .cpp files by responsibility:
+ *   - scheduler_completion.cpp  (completion polling, drain protocol)
+ *   - scheduler_cold_path.cpp   (exit checks, stall diagnostics, profiling)
+ *   - scheduler_dispatch.cpp    (task dispatch loop and helpers)
+ */
+class SchedulerContext {
+public:
+    // =========================================================================
+    // Lifecycle
+    // =========================================================================
+
+    // Initialize scheduler state from the given runtime and thread layout.
+    // - Discovers cores via handshake_all_cores()
+    // - Assigns cores to scheduler threads
+    // - Resets task counters, payloads, per-core GlobalContext
+    // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
+    // - Captures AICore-register base (consumed by handshake_all_cores())
+    // Returns 0 on success, negative on failure (handshake / assignment error).
+    int32_t
+    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
+
+    // Reset all SchedulerContext-owned state to its post-construction defaults.
+    // Called by AicpuExecutor::deinit() during per-run teardown.
+    void deinit();
+
+    // =========================================================================
+    // Per-thread execution entry points (called by AicpuExecutor::run)
+    // =========================================================================
+
+    // Main scheduler thread entry: poll completion + dispatch ready tasks.
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
+
+    // Shutdown AICore registers for this thread's assigned cores.
+    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
+    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
+    int32_t shutdown(int32_t thread_idx);
+
+    // Run all post-orchestration scheduler bookkeeping:
+    //  - publishes core assignments to the perf collector (PTO2_PROFILING)
+    //  - latches submitted task count from PTO2 shared memory
+    //  - folds inline_completed_tasks into completed_tasks_
+    //  - flips orchestrator_done_ and triggers core transition
+    //    (skipped on fatal error — emergency_shutdown runs instead)
+    // Callers must invoke rt_orchestration_done(rt) before this — that
+    // step belongs to the orchestrator lifecycle, not the scheduler.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
+
+    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
+    // mode where rt is created by the orchestrator thread after init().
+    void bind_runtime(PTO2Runtime *rt);
+
+    // =========================================================================
+    // State queries / external synchronization points
+    // =========================================================================
+
+    int32_t aic_count() const { return aic_count_; }
+    int32_t aiv_count() const { return aiv_count_; }
+    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
+    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
+
+    // Block until the first scheduler thread has finished one-time PTO2 init.
+    // Called by the orchestrator thread in device-orch mode.
+    void wait_pto2_init_complete() const;
+
+private:
+    // =========================================================================
+    // State
+    // =========================================================================
+
+    // --- Scheduler binding & per-core runtime state ---
+    alignas(64) PTO2SchedulerState *sched_{nullptr};
+    PTO2Runtime *rt_{nullptr};
+
+    // Per-core execution state, indexed by core_id (= worker_id)
+    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
+
+    // Cluster-ordered core trackers, one per scheduler thread
+    CoreTracker core_trackers_[MAX_AICPU_THREADS];
+
+    // Per-core dispatch payload storage: dual-buffer for pipelining.
+    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
+    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // Per-core deferred-completion software registration storage.  This has
+    // the same runtime lifetime as payload_per_core_, but is kept out of the
+    // dispatch payload so normal task dispatch layout and cache footprint stay
+    // unchanged.
+    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // sync_start drain coordination
+    SyncStartDrainState drain_state_;
+
+#if PTO2_PROFILING
+    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
+    // Cached once at init() from get_l2_swimlane_level(), AFTER
+    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // --- Task-execution tracking ---
+    std::atomic<int32_t> completed_tasks_{0};
+    int32_t total_tasks_{0};
+    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
+    // volatile prevents the compiler from hoisting the load out of spin loops.
+    volatile bool orchestrator_done_{false};
+    std::atomic<bool> completed_{false};
+    uint64_t *func_id_to_addr_{nullptr};
+
+    // --- Core-transition coordination ---
+    std::atomic<bool> transition_requested_{false};
+    std::atomic<int32_t> wait_reassign_{0};
+    std::atomic<bool> reassigned_{false};
+
+    // --- Thread/core configuration ---
+    int32_t active_sched_threads_{0};
+    int32_t sched_thread_num_{0};
+    bool orch_to_sched_{false};
+    int32_t aicpu_thread_num_{0};
+    int32_t cores_total_num_{0};
+
+    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
+    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aic_count_{0};
+    int32_t aiv_count_{0};
+
+    // Platform AICore-register base array (set by AicpuExecutor before init()).
+    uint64_t regs_{0};
+
+#if PTO2_PROFILING
+    // PMU profiling: physical core IDs for PMU MMIO base resolution.
+    // Separate storage because CoreExecState's 64-byte budget has no room for
+    // physical_core_id when PTO2_PROFILING=1.
+    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
+#endif
+
+    // --- One-time init coordination ---
+    std::atomic<bool> pto2_init_claimed_{false};
+    std::atomic<bool> pto2_init_complete_{false};
+
+    // =========================================================================
+    // Core management (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
+    int32_t handshake_all_cores(Runtime *runtime);
+
+    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
+    bool assign_cores_to_threads();
+
+    // Re-distribute all cores across all threads after orchestration completes.
+    void reassign_cores_for_all_threads();
+
+    // Emergency shutdown: broadcast exit signal to every handshake'd core and
+    // deinit their AICore register blocks. Idempotent.
+    void emergency_shutdown(Runtime *runtime);
+
+    // =========================================================================
+    // Dispatch (scheduler_dispatch.cpp)
+    // =========================================================================
+
+    static const char *shape_name(PTO2ResourceShape shape);
+
+    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
+    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
+    // convention already established in the stall log family.
+    static inline const char *subslot_name(PTO2SubtaskSlot s) {
+        switch (s) {
+        case PTO2SubtaskSlot::AIC:
+            return "aic";
+        case PTO2SubtaskSlot::AIV0:
+            return "aiv0";
+        case PTO2SubtaskSlot::AIV1:
+            return "aiv1";
+        }
+        return "?";
+    }
+
+    int pop_ready_tasks_batch(
+        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
+        int max_count
+    );
+
+    void build_payload(
+        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        const AsyncCtx &async_ctx, int32_t block_idx
+    );
+
+    // Batched-dispatch primitives. prepare_* builds the payload and per-core
+    // state; publish_* issues the MMIO register write. Callers must wmb()
+    // between the prepare batch and the publish batch, then sample
+    // get_sys_cnt_aicpu() once and pass it to publish_* for every handle.
+    //
+    // dispatch_timestamp_slot points to the CoreExecState slot
+    // (pending_dispatch_timestamp / running_dispatch_timestamp) selected at
+    // prepare time, or nullptr when L2 swimlane is below AICPU_TIMING and no
+    // dispatch timestamp is being recorded.
+    struct PublishHandle {
+        uint64_t reg_addr;
+        uint32_t reg_task_id;
+        int32_t core_offset;
+        uint64_t *dispatch_timestamp_slot;
+    };
+
+    PublishHandle prepare_subtask_to_core(
+        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        bool to_pending, int32_t block_idx
+    );
+
+    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts) {
+        if (h.dispatch_timestamp_slot != nullptr) {
+            *h.dispatch_timestamp_slot = dispatch_ts;
+        }
+        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));
+    }
+
+    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
+    // caller-supplied handles buffer. Returns the number of handles written.
+    int prepare_block_for_dispatch(
+        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
+        bool to_pending, int32_t block_idx, PublishHandle *out_handles
+    );
+
+    void dispatch_shape(
+        int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
+        CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
+    );
+
+    // Speculative early-dispatch (Hook 1). After normal dispatch leaves idle
+    // cores spare, pre-stage the consumers of any RUNNING flagged producer onto
+    // those cores with not_ready=1 (gated). Touches no dependency state — the
+    // task is released by the doorbell at its normal ready-pop (Hook 2).
+    int32_t try_speculative_early_dispatch(int32_t thread_idx);
+
+    // Stage the already-claimed range [start, start+count) of consumer `c` onto
+    // thread_idx's idle (RUNNING slot) then pending (gated-pending, promote-on-FIN)
+    // cores from the provided free-core sets. The caller advances next_block_idx and
+    // re-pushes `c` BEFORE calling, so this expensive prepare+publish runs
+    // concurrently with peers (mirrors the normal SPMD dispatch path). Returns the
+    // number of blocks staged.
+    int32_t stage_consumer_blocks(
+        int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
+        CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
+    );
+
+    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
+    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
+    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
+    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
+    // skipped for the whole pass but MIX-PENDING still runs.
+    //
+    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
+    // current pass only. The next loop iteration re-evaluates after Phase 1
+    // completion polling and the global MIX queue draining (here or on any
+    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
+    // not unbounded — once mix completes on at least one cluster, the next
+    // pass either drains the residual or admits AIC/AIV.
+    void dispatch_ready_tasks(
+        int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
+        bool pmu_active, bool &made_progress, bool &try_pushed
+    );
+
+    // Returns true if any *other* scheduler thread currently has an idle core
+    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
+    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
+    // rationale and the safety argument against the drain worker.
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
+
+    // True if mix tasks remain anywhere this thread could see them: the caller's
+    // MIX local LIFO stack or the global MIX ready queue. Approximate —
+    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
+    // positions with std::memory_order_relaxed and may interleave with concurrent
+    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
+    // loads — that one isn't on this path. A stale read here causes at most one
+    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
+        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
+    }
+
+    // =========================================================================
+    // Completion & drain (scheduler_completion.cpp)
+    // =========================================================================
+
+    static SlotTransition decide_slot_transition(
+        int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated = false
+    );
+
+    void complete_slot_task(
+        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
+        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+        ,
+        uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+    );
+
+    static void promote_pending_to_running(CoreExecState &core);
+    static void clear_running_slot(CoreExecState &core);
+
+    void check_running_cores_for_completion(
+        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+    );
+
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
+    int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask);
+    void drain_worker_dispatch(int32_t block_num);
+    void handle_drain_mode(int32_t thread_idx);
+
+    // =========================================================================
+    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    __attribute__((noinline, cold)) LoopAction
+    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
+
+    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
+
+    __attribute__((noinline, cold)) LoopAction
+    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
+
+    __attribute__((noinline, cold)) void
+    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
+
+    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
+        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+    );
+
+    // Reverse lookup: given a global core_id, find which scheduler thread's
+    // tracker owns it. Returns -1 if not found. Linear scan — only used on
+    // the cold diagnostic path.
+    int32_t find_core_owner_thread(int32_t core_id) const;
+
+    // Does this thread own any core with a RUNNING task (running_slot_state set)?
+    // Gates the scheduler timeout fatal latch: a thread without an owned
+    // RUNNING task has no first-hand evidence of a stuck dispatch and must
+    // not declare global fatal on its own idle observation. The thread that
+    // does own the stuck task will reach the budget on its own polls and
+    // latch with valid evidence (or recover when the COND register flips).
+    bool self_owns_running_task(int32_t thread_idx) const;
+
+    // Does *any* scheduler thread own a RUNNING task? Used as the second
+    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
+    // owns RUNNING work AND tasks remain incomplete, the system is in a
+    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
+    // ownerless idle threads are the only observers — let one of them latch.
+    bool no_thread_owns_running_task() const;
+
+    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
+        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+        int32_t last_progress_count
+#if PTO2_PROFILING
+        ,
+        uint64_t sched_start_ts
+#endif
+    );
+
+#if PTO2_PROFILING
+    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
+#endif
+
+    // =========================================================================
+    // Small inline helpers
+    // =========================================================================
+
+    uint64_t get_function_bin_addr(int func_id) const {
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
+            return 0;
+        }
+        return func_id_to_addr_[func_id];
+    }
+};
+
+#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp
new file mode 100644
index 000000000..c727ff16c
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+
+#include "common.h"  // debug_assert
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "callable.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+// =============================================================================
+// Dispatch helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
+}
+
+// The speculative core bitmask (PTO2_SPEC_CORE_MASK_WORDS * 64 bits) must cover
+// every global core_id, and the per-core doorbell table is sized to match.
+static_assert(
+    RUNTIME_MAX_WORKER <= PTO2_SPEC_CORE_MASK_WORDS * 64, "staged_core_mask too small for RUNTIME_MAX_WORKER cores"
+);
+
+const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {
+    switch (shape) {
+    case PTO2ResourceShape::AIC:
+        return "AIC";
+    case PTO2ResourceShape::AIV:
+        return "AIV";
+    case PTO2ResourceShape::MIX:
+        return "MIX";
+    case PTO2ResourceShape::DUMMY:
+        return "DUMMY";
+    }
+    return "UNKNOWN";
+}
+
+bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
+    // Cross-thread read of peer trackers without explicit synchronization. The
+    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
+    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
+    // value is consumed only as a scheduling *hint* — a stale read at worst
+    // causes one missed/extra pending dispatch, corrected on the next iteration.
+    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
+    // barrier (all peers spin out of the dispatch path before any tracker
+    // mutation), so this routine is never racing the drain worker.
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (t == self_thread_idx) continue;
+        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int SchedulerContext::pop_ready_tasks_batch(
+    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#if PTO2_SCHED_PROFILING
+    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
+    uint64_t t_pop_start = get_sys_cnt_aicpu();
+    int count = sched_->get_ready_tasks_batch(
+        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
+    );
+    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
+#else
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        if (count > 0) {
+            l2_swimlane.pop_hit += count;
+        } else {
+            l2_swimlane.pop_miss++;
+        }
+    }
+#else
+    (void)thread_idx;
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif
+    return count;
+}
+
+void SchedulerContext::build_payload(
+    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+    const AsyncCtx &async_ctx, int32_t block_idx
+) {
+    int32_t slot_idx = static_cast<int32_t>(subslot);
+    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
+    dispatch_payload.function_bin_addr = callable->resolved_addr();
+    auto &payload = *slot_state.payload;
+    int n = 0;
+    for (int32_t i = 0; i < payload.tensor_count; i++) {
+        dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
+    }
+    for (int32_t i = 0; i < payload.scalar_count; i++) {
+        dispatch_payload.args[n++] = payload.scalars[i];
+    }
+    dispatch_payload.local_context.block_idx = block_idx;
+    dispatch_payload.local_context.block_num = slot_state.logical_block_num;
+    dispatch_payload.local_context.async_ctx = async_ctx;
+    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
+    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
+    // Speculative early-dispatch: a task being staged (Hook 1 set spec_state to
+    // STAGING before this call) is gated — the AICore must wait for the
+    // DATA_MAIN_BASE high-32 doorbell. All other dispatches run on pickup.
+    dispatch_payload.not_ready =
+        (slot_state.payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) ? 1 : 0;
+}
+
+SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
+    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending,
+    int32_t block_idx
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto core_id = tracker.get_core_id_by_offset(core_offset);
+    CoreExecState &core_exec_state = core_exec_states_[core_id];
+
+    core_exec_state.dispatch_seq++;
+    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    static_assert(
+        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
+    );
+    if (reg_task_id >= AICORE_EXIT_SIGNAL) {
+        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
+        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    }
+
+    uint32_t buf_idx = reg_task_id & 1u;
+    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+    deferred_slab->count = 0;
+    deferred_slab->error_code = PTO2_ERROR_NONE;
+    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+    if (to_pending) {
+        core_exec_state.pending_subslot = subslot;
+        core_exec_state.pending_slot_state = &slot_state;
+        core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
+    } else {
+        core_exec_state.running_subslot = subslot;
+        core_exec_state.running_slot_state = &slot_state;
+        core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
+        tracker.change_core_state(core_offset);
+    }
+    tracker.set_pending_occupied(core_offset);
+
+    LOG_DEBUG(
+        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
+        " core_offset=%d core_id=%d reg_task_id=%u",
+        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
+        static_cast<int64_t>(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
+        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
+        core_offset, core_id, reg_task_id
+    );
+
+    // AICore buffer rotation lives on the dispatch path: count this dispatch
+    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
+    // boundary. The completion-before-dispatch invariant makes this race-free
+    // (all prior tasks on this core have FIN'd, so AICore has dcci'd their
+    // records out of the old buffer). Gated on the same enable bit as flush
+    // so level=1 (AICORE_TIMING-only) participates without needing complete_task.
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
+        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
+    }
+#endif
+
+    uint64_t *dispatch_timestamp_slot = nullptr;
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+        dispatch_timestamp_slot =
+            to_pending ? &core_exec_state.pending_dispatch_timestamp : &core_exec_state.running_dispatch_timestamp;
+    }
+#endif
+
+    return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
+}
+
+int SchedulerContext::prepare_block_for_dispatch(
+    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending,
+    int32_t block_idx, PublishHandle *out_handles
+) {
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
+            [](ActiveMask active_mask, int raw_subtask_id) {
+                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+            },
+            [this](int32_t func_id) {
+                return get_function_bin_addr(func_id);
+            }
+        );
+    }
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    if (shape == PTO2ResourceShape::MIX) {
+        uint8_t cmask = slot_state.active_mask.core_mask();
+        int n = 0;
+        if (cmask & PTO2_SUBTASK_MASK_AIC) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, to_pending,
+                block_idx
+            );
+        }
+        if (cmask & PTO2_SUBTASK_MASK_AIV0) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, to_pending,
+                block_idx
+            );
+        }
+        if (cmask & PTO2_SUBTASK_MASK_AIV1) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, to_pending,
+                block_idx
+            );
+        }
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(cmask);
+#endif
+        return n;
+    } else if (shape == PTO2ResourceShape::AIC) {
+        out_handles[0] =
+            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
+#endif
+        return 1;
+    } else {
+        out_handles[0] =
+            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
+#endif
+        return 1;
+    }
+}
+
+void SchedulerContext::dispatch_shape(
+    int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
+    CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
+) {
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    if (entered_drain) return;
+
+    bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
+    bool is_mix = (shape == PTO2ResourceShape::MIX);
+    auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
+    if (!cores.has_value()) return;
+
+    while (cores.has_value() && !entered_drain) {
+        int want = cores.count();
+        PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
+        int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
+        if (got == 0) break;
+
+        // sync_start exclusion gate.
+        //
+        // When the popped batch contains a sync_start task we MUST publish each
+        // prior task with its own wmb so AICore receives them with time
+        // separation. The drain coordinator's `count_global_available()` check
+        // reads the per-thread CoreTracker, and although `prepare_block_for_dispatch`
+        // marks cores occupied synchronously, the head-start between successive
+        // tasks is what lets the surrounding completion loop catch up on FINs in
+        // the retry window when the sync_start task hits insufficient resources.
+        // Bursting all prior tasks at the end of the pop (cross-task batching)
+        // collapses that head-start and causes spmd_sync_start_stress to time
+        // out via 507018 on ~40% of runs — see
+        // docs/investigations/2026-06-cross-task-batched-publish.md.
+        //
+        // When the batch carries no sync_start task, no drain entry can happen
+        // in this pop, so we hoist `handles[]`, `wmb()`, and the publish loop
+        // out of the per-task body. One wmb amortizes across all tasks and one
+        // dispatch_ts is shared, which restores ~60 ns first-to-last AICore
+        // start span for single-block decode kernels (out_proj, q_proj, ...).
+        // Detection is a single mask check per task — cheap relative to even
+        // one register write.
+        bool any_sync_start = false;
+        for (int bi = 0; bi < got; bi++) {
+            if (batch[bi]->active_mask.requires_sync_start()) {
+                any_sync_start = true;
+                break;
+            }
+        }
+
+        // handles[] is sized for the MIX worst case: total claims across the
+        // pop bounded by `cores.count() ≤ MAX_CLUSTERS`, and each block
+        // contributes ≤ 3 subtasks for MIX.
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+        int handle_count = 0;
+        bool dispatched_any = false;
+        // Slots dispatched this pop whose dispatch_fanin must be propagated to
+        // consumers. Deferred until AFTER publish (below) so a flagged producer's
+        // fanout walk never sits between claiming cores and publishing its own
+        // blocks — doing it inline delays this thread's blocks while peer threads
+        // co-dispatching the same SPMD task publish immediately, misaligning the
+        // task's block starts. Bounded by cores.count() ≤ MAX_CLUSTERS dispatches.
+        PTO2TaskSlotState *prop_list[CoreTracker::MAX_CLUSTERS];
+        int prop_n = 0;
+#if PTO2_SCHED_PROFILING
+        uint64_t t_setup_start = get_sys_cnt_aicpu();
+#endif
+
+        // Flush prepared-but-unpublished handles. Required before
+        // `enter_drain_mode` so the drain coordinator sees cores as occupied,
+        // and at the per-task boundary when `any_sync_start` is true.
+        auto flush_publish = [&]() {
+            if (handle_count == 0) return;
+            wmb();
+            uint64_t dispatch_ts = 0;
+#if PTO2_PROFILING
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+                dispatch_ts = get_sys_cnt_aicpu();
+            }
+#endif
+            for (int i = 0; i < handle_count; i++) {
+                publish_subtask_to_core(handles[i], dispatch_ts);
+            }
+            handle_count = 0;
+            made_progress = true;
+        };
+
+        for (int bi = 0; bi < got; bi++) {
+            PTO2TaskSlotState *slot_state = batch[bi];
+            CoreTracker::BitStates selected_mix_clusters(0ULL);
+
+            if (is_mix) {
+                auto candidates = cores;
+                uint8_t cmask = slot_state->active_mask.core_mask();
+                auto wanted = is_pending ? CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING;
+                while (candidates.has_value()) {
+                    int32_t cluster_offset = candidates.pop_first();
+                    if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) {
+                        selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset);
+                    }
+                }
+                if (!selected_mix_clusters.has_value()) {
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    continue;
+                }
+            }
+
+            // (Speculative pre-staged tasks never reach this ready-pop: they are
+            // released by their doorbell in release_fanin_and_check_ready the
+            // instant their last producer completes — see try_speculative_release.)
+
+            if (slot_state->active_mask.requires_sync_start()) {
+                if (is_pending) {
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    continue;
+                }
+                int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
+                if (available < slot_state->logical_block_num) {
+                    flush_publish();
+                    if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    }
+                    for (int rem = bi + 1; rem < got; rem++) {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
+                    }
+                    entered_drain = true;
+                    break;
+                }
+            }
+
+            if (!cores.has_value()) {
+                flush_publish();
+                sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                break;
+            }
+
+            dispatched_any = true;
+            try_pushed = true;
+            // Record for deferred dispatch_fanin propagation after this pop's
+            // blocks are published (see after the loop). propagate's own guard
+            // filters non-flagged slots, so recording unconditionally is cheap.
+            if (prop_n < static_cast<int>(sizeof(prop_list) / sizeof(prop_list[0]))) {
+                prop_list[prop_n++] = slot_state;
+            }
+            // Claim a contiguous range of blocks, hand the slot back to the
+            // ready queue immediately, then perform the expensive dispatches.
+            // This lets other schedulers concurrently claim and dispatch the
+            // remaining blocks of the same SPMD task instead of spinning while
+            // this thread fills all its own cores. Only local `start + b` is
+            // read after the push — `next_block_idx` may already be advanced
+            // by another scheduler that popped the slot.
+            int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
+            int32_t remaining = slot_state->logical_block_num - start;
+            int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
+            int32_t claim = std::min(available, remaining);
+            slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
+
+            if (start + claim < slot_state->logical_block_num) {
+                sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+            }
+
+            for (int32_t b = 0; b < claim; b++) {
+                auto core_offset = is_mix ? selected_mix_clusters.pop_first() : cores.pop_first();
+                if (is_mix) {
+                    cores.clear_bit(core_offset);
+                }
+                handle_count += prepare_block_for_dispatch(
+                    thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]
+                );
+            }
+
+            // Sync_start exclusion: flush per task so prior tasks have head-
+            // start time before any sync_start drain check. Normal batches
+            // fall through and accumulate for one cross-task flush at the
+            // end of the pop.
+            if (any_sync_start) {
+                flush_publish();
+            }
+        }
+
+        flush_publish();
+        // Blocks are published; now propagate dispatch_fanin for any flagged
+        // producers dispatched above (knob A: producer is running). Off the
+        // pre-publish path so it cannot delay or misalign their blocks.
+        for (int i = 0; i < prop_n; i++) {
+            sched_->propagate_dispatch_fanin(*prop_list[i]);
+        }
+#if PTO2_SCHED_PROFILING
+        l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
+#endif
+
+        if (!dispatched_any) break;
+
+        if (!cores.has_value()) {
+            cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
+        }
+    }
+}
+
+void SchedulerContext::dispatch_ready_tasks(
+    int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
+    bool pmu_active, bool &made_progress, bool &try_pushed
+) {
+    using Phase = CoreTracker::DispatchPhase;
+    constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
+
+    // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle
+    // through this 2-elem array, with order toggled by thread parity for
+    // shape-level load balancing across threads.
+    static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
+        {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
+        {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
+    };
+    const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
+
+    // Spill overflow from local_bufs to the shared ready queue BEFORE we start
+    // dispatching. release_fanin's fast path packs all newly-ready consumers
+    // into the producing thread's local_bufs (zero atomic, peer-invisible). For
+    // batch releases (e.g. attn_fence → 50 out_proj consumers) that
+    // overshoots this thread's slot budget so peers are starving while we
+    // hoard. The cross-thread invisibility window between "complete pushes 50
+    // to local" and "IDLE-AIC's mid-phase flush exposes overflow to shared"
+    // is what shows up in the swimlane as the multi-microsecond inter-thread
+    // stagger on out_proj's first wave.
+    //
+    // Gate conditions:
+    //   (a) local count exceeds this thread's per-shape block budget — we
+    //       can't dispatch them all even with both RUNNING+PENDING slots;
+    //   (b) at least one peer has idle cores in this shape — they want work.
+    // Both must hold to avoid wasting a CAS push when we could profitably
+    // self-dispatch the overflow. Condition (b) reads peer CoreTracker
+    // (plain 8-byte load on a rarely-contended cache line, ~5 ns) — we
+    // deliberately avoid ready_queues[s].size() here, which is two atomic
+    // loads on lines pushers + poppers actively bounce.
+    //
+    // Capacity derives from how cores are partitioned across sched threads:
+    //   per-shape budget = (PLATFORM_MAX_BLOCKDIM / active_sched_threads_)
+    //                       × cores_per_blockdim_for_that_shape
+    //   MIX is 1 cluster per block dim, so its budget equals the block-dim
+    //   share without multiplying.
+    //
+    // Push the trailing `excess` slot pointers — O(1) count decrement, no
+    // memmove. push_batch is one CAS for the whole excess; peers see the
+    // batch immediately and can race for them.
+    const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;
+    const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {
+        /*AIC=*/bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
+        /*AIV=*/bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
+        /*MIX=*/bd_per_thread,
+    };
+    for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+        auto &lb = local_bufs[s];
+        int32_t excess = lb.count - thread_capacity[s];
+        if (excess <= 0) continue;
+        if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;
+        sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
+        lb.count -= excess;
+    }
+
+    auto flush_local_bufs = [&]() {
+        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+            auto &lb = local_bufs[s];
+            if (lb.count > 0) {
+                sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
+                lb.count = 0;
+            }
+        }
+    };
+    // Every return path below must flush; wrap in RAII so we cannot forget.
+    // The mid-function flush between IDLE and PENDING is still called
+    // explicitly — guard only covers exit.
+    struct FlushGuard {
+        decltype(flush_local_bufs) &flush_fn;
+        ~FlushGuard() { flush_fn(); }
+    } flush_guard{flush_local_bufs};
+
+    bool entered_drain = false;
+
+    // ===== IDLE stage =====
+    dispatch_shape(
+        thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress,
+        try_pushed
+    );
+    if (entered_drain) return;
+
+    // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass.
+    // MIX-PENDING below still runs — that is the core of "mix strict priority":
+    // pending slots are spent on mix before AIC/AIV get any chance.
+    bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
+
+    if (!skip_aic_aiv) {
+        for (int i = 0; i < 2; i++) {
+            PTO2ResourceShape s = aic_aiv[i];
+            dispatch_shape(
+                thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
+                try_pushed
+            );
+            if (entered_drain) return;
+        }
+    }
+
+    // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
+    // peer-thread reads see the IDLE-stage release_fanin output.
+    flush_local_bufs();
+
+    if (pmu_active) return;
+
+    // ===== PENDING stage =====
+    // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that
+    // peer's next IDLE-MIX iteration will pull the mix task from the global
+    // queue (already flushed above) at lower latency than us pre-loading a
+    // pending slot here. Forward progress for MIX is preserved: at least one
+    // thread will run MIX-IDLE next pass and consume the residual.
+    //
+    // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain
+    // via pending slots on this thread when no peer is idle.
+    if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) {
+        dispatch_shape(
+            thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain,
+            made_progress, try_pushed
+        );
+        if (entered_drain) return;
+    }
+
+    // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
+    // it set; otherwise, escalate iff PENDING-MIX left residual.
+    if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) {
+        skip_aic_aiv = true;
+    }
+
+    // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin
+    // during in-flight completions; flush_guard ensures these don't carry
+    // across to the next iteration's IDLE stage.
+    if (skip_aic_aiv) return;
+
+    // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
+    // will pull from the global queue on its next IDLE pass.
+    for (int i = 0; i < 2; i++) {
+        PTO2ResourceShape s = aic_aiv[i];
+        if (has_idle_in_other_threads(thread_idx, s)) continue;
+        dispatch_shape(
+            thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
+            try_pushed
+        );
+        if (entered_drain) return;
+    }
+}
+
+// Stage the ALREADY-CLAIMED range [start, start+count) of consumer `c` onto
+// thread_idx's idle then pending cores. The caller (the queue drain) has advanced
+// next_block_idx by `count` under pop-exclusivity AND re-pushed `c` for peers
+// BEFORE calling this — so this, the expensive prepare+publish, runs CONCURRENTLY
+// with peers staging other ranges of the same consumer. This mirrors the normal
+// SPMD dispatch path (claim range -> store next_block_idx -> re-push -> dispatch).
+// `idle`/`pend` are this thread's free-core sets, sized so idle.count+pend.count >=
+// count (the caller clamped the claim to them), so all `count` blocks get a core.
+//
+// Rule 1: idle cores -> gated task in the RUNNING slot. Rule 2: PENDING slot of
+// cores running a real task -> promoted in when that task FINs (gated-pending Case
+// 3.3 in decide_slot_transition completes the running FIN + promotes instead of
+// waiting for an ack the gated task never sends). Each staged core stays
+// pending_occupied while gated, so no second gated block stacks on it.
+//
+// Self-ring: release flips STAGING->DISPATCHED then rings the mask. A block staged
+// after that flip isn't in the mask release read, so this thread rings it here. The
+// seq_cst order between "OR mask then load spec_state" (here) and "store DISPATCHED
+// then read mask" (release) guarantees every gated core's doorbell fires.
+int32_t SchedulerContext::stage_consumer_blocks(
+    int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
+    CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    // Stamp the real pre-stage time (NOT 0) so the swimlane shows these blocks
+    // dispatched during the producer's run, not at trace start.
+    uint64_t early_dispatch_ts = get_sys_cnt_aicpu();
+    uint64_t my_cores[PTO2_SPEC_CORE_MASK_WORDS] = {0};  // cores this thread gated (for self-ring)
+    int32_t staged = 0;
+    int32_t block = start;
+    auto stage_from = [&](CoreTracker::BitStates &avail, bool to_pending) {
+        // Mirror the normal flush_publish (scheduler_dispatch.cpp wmb()+publish loop):
+        // prepare all claimed blocks' payloads, one wmb(), then publish. The wmb
+        // guarantees the not_ready gate + args are globally visible before any
+        // DATA_MAIN_BASE token — without it a gated core can pick up the token and
+        // dcci a stale payload (the doorbell/release path mirrors normal dispatch).
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+        int n = 0;
+        while (count > 0 && avail.has_value()) {
+            int32_t core_offset = avail.pop_first();
+            n += prepare_block_for_dispatch(thread_idx, core_offset, *c, shape, to_pending, block, &handles[n]);
+            block++;
+            count--;
+            staged++;
+        }
+        if (n == 0) return;
+        wmb();
+        for (int i = 0; i < n; i++) {
+            publish_subtask_to_core(handles[i], early_dispatch_ts);
+            int32_t cid = tracker.get_core_id_by_offset(handles[i].core_offset);
+            sched_->spec_doorbell_table[cid].addr = handles[i].reg_addr;
+            sched_->spec_doorbell_table[cid].token = handles[i].reg_task_id;
+            my_cores[cid >> 6] |= (1ULL << (cid & 63));
+        }
+    };
+    if (idle.has_value()) stage_from(idle, /*to_pending=*/false);
+    if (pend.has_value()) stage_from(pend, /*to_pending=*/true);
+    // Publish all this thread's gated cores into the shared mask in one OR per word
+    // (vs one per subtask) so release sees them; seq_cst keeps the self-ring order.
+    for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
+        if (my_cores[w] != 0) c->payload->staged_core_mask[w].fetch_or(my_cores[w], std::memory_order_seq_cst);
+
+    // If release already flipped DISPATCHED, it may have read the mask before our
+    // bits landed — ring our own cores so none is left gated forever.
+    if (staged > 0 && c->payload->spec_state.load(std::memory_order_seq_cst) == PTO2_SPEC_DISPATCHED) {
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
+            uint64_t bits = my_cores[w];
+            while (bits != 0) {
+                int cid = w * 64 + __builtin_ctzll(bits);
+                bits &= bits - 1;
+                PTO2SchedulerState::ring_one_doorbell(
+                    sched_->spec_doorbell_table[cid].addr, sched_->spec_doorbell_table[cid].token
+                );
+            }
+        }
+    }
+    return staged;
+}
+
+// Early-dispatch drain (idle pass). Candidates are pushed to early_dispatch_queue
+// EVENT-DRIVEN by propagate_dispatch_fanin (a flagged producer's dispatch bumps its
+// consumers' dispatch_fanin; reaching fanin_count enqueues the consumer) — there is
+// no per-iteration PULL scan here anymore. This pass only DRAINS the queue.
+// Returns the number of blocks staged this pass (for the EarlyDispatch swimlane bar).
+int32_t SchedulerContext::try_speculative_early_dispatch(int32_t thread_idx) {
+    constexpr int PTO2_EARLY_DISPATCH_DRAIN_MAX = 8;  // bounded pops per pass
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    int32_t total_staged = 0;
+
+    // Drain the queue — mirrors the normal SPMD dispatch path. Pop a consumer,
+    // CLAIM a range sized to THIS thread's free cores by advancing next_block_idx with
+    // a CAS (atomic — next_block_idx is shared with normal dispatch, which also claims
+    // it if release routes the consumer to the ready queue, so a plain store could
+    // double-dispatch), RE-PUSH it for peers, THEN do the expensive prepare+publish.
+    // Re-pushing before staging lets peers claim the next range and stage CONCURRENTLY
+    // — a wide consumer (online_softmax, 48 blocks) is filled by all idle threads in
+    // parallel instead of a serial winner-then-peer daisy chain. Bounded pops/pass.
+    for (int n = 0; n < PTO2_EARLY_DISPATCH_DRAIN_MAX; n++) {
+        PTO2TaskSlotState *c = sched_->early_dispatch_queue.pop();
+        if (c == nullptr) break;
+        if (c->payload->spec_state.load(std::memory_order_acquire) != PTO2_SPEC_STAGING) continue;  // released
+        PTO2ResourceShape shape = c->active_mask.to_shape();
+        auto idle = tracker.get_idle_core_offset_states(shape);
+        auto pend = tracker.get_pending_core_offset_states(shape);
+        int32_t freecores = (idle.has_value() ? idle.count() : 0) + (pend.has_value() ? pend.count() : 0);
+        if (freecores == 0) {  // no free cores of this shape — give it back for peers and stop
+            sched_->early_dispatch_queue.push(c);
+            break;
+        }
+        // CAS-claim a contiguous range [start, start+claim) sized to this thread's
+        // free cores; CAS keeps it atomic against peers AND normal dispatch.
+        int32_t start = 0, claim = 0;
+        while (true) {
+            int16_t cur = c->next_block_idx.load(std::memory_order_relaxed);
+            if (cur >= c->logical_block_num) break;  // fully claimed
+            int32_t cnt = c->logical_block_num - cur;
+            if (cnt > freecores) cnt = freecores;
+            if (c->next_block_idx.compare_exchange_weak(
+                    cur, static_cast<int16_t>(cur + cnt), std::memory_order_seq_cst, std::memory_order_relaxed
+                )) {
+                start = cur;
+                claim = cnt;
+                break;
+            }
+        }
+        if (claim == 0) continue;  // nothing left to claim -> drop (no re-push)
+        // Re-push for concurrent peers BEFORE the expensive staging.
+        if (start + claim < c->logical_block_num) {
+            if (!sched_->early_dispatch_queue.push(c))
+                LOG_INFO_V9(
+                    "[SPEC] queue full on re-push, consumer=%" PRId64, static_cast<int64_t>(c->task->task_id.raw)
+                );
+        }
+        total_staged += stage_consumer_blocks(thread_idx, c, shape, start, claim, idle, pend);
+    }
+    return total_staged;
+}
+
+// =============================================================================
+// Main scheduler dispatch loop
+// =============================================================================
+
+int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
+    always_assert(sched_ != nullptr);
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
+
+    PTO2SharedMemoryHeader *header = sched_->sm_header;
+    if (!header) {
+        LOG_ERROR("PTO2 dispatch: header is null");
+        return -1;
+    }
+    LOG_INFO_V0(
+        "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast<void *>(header),
+        static_cast<uint64_t>(header->rings[0].task_descriptors_offset),
+        static_cast<uint64_t>(header->rings[0].task_window_size)
+    );
+
+    Handshake *hank = static_cast<Handshake *>(runtime->workers);
+    LOG_INFO_V0(
+        "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast<void *>(hank),
+        static_cast<uint64_t>(header->rings[0].task_window_size)
+    );
+
+    // One-time init: assign perf buffers (one thread does it; others wait)
+    if (!pto2_init_claimed_.exchange(true, std::memory_order_acq_rel)) {
+        LOG_INFO_V0("Thread %d: doing one-time init", thread_idx);
+
+#if PTO2_PROFILING
+        if (is_dump_args_enabled()) {
+            dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : sched_thread_num_);
+        }
+#endif
+
+#if PTO2_PROFILING
+        // Initialize PMU: program events, start counters, and pop initial buffers
+        if (is_pmu_enabled()) {
+            pmu_aicpu_init(physical_core_ids_, cores_total_num_);
+            LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
+        }
+#endif
+
+        LOG_INFO_V0("Thread %d: one-time init done", thread_idx);
+        pto2_init_complete_.store(true, std::memory_order_release);
+    } else {
+        while (!pto2_init_complete_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
+        }
+    }
+
+    LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num());
+    int32_t cur_thread_completed = 0;
+    // Non-zero once a scheduler-hang timeout latches; returned in place of the
+    // completed count so the caller still sees the negative error rc while the
+    // shared end-of-loop flush below runs.
+    int32_t timeout_rc = 0;
+    int32_t idle_iterations = 0;
+    int32_t last_progress_count = 0;
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    l2_swimlane.reset();
+    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
+#endif
+
+    constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
+    PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
+    PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
+    for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
+    }
+    PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
+    int32_t deferred_release_count = 0;
+
+    bool cores_released = false;
+
+    // PMU runs require single-issue dispatch — overlapping in-flight tasks
+    // pollute per-task PMU counters, so skip the PENDING pre-load phase.
+    // Cached at function scope: is_pmu_enabled() is extern "C" and the
+    // compiler cannot hoist it across the dispatch loop on its own.
+    const bool pmu_active = is_pmu_enabled();
+
+#if PTO2_PROFILING
+    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
+#endif
+
+#if PTO2_PROFILING
+    // Queue-depth snapshot carried across the iteration boundary: each phase
+    // emit consumes (phase_start_*) and refreshes them with its own end snapshot
+    // so the next phase's "at_start" equals the previous phase's "at_end".
+    //
+    // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX.
+    //
+    // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer)
+    // is a single int read on a register-cached stack — free. Shared depth
+    // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines
+    // that all peer sched threads also write to (enqueue_pos and dequeue_pos
+    // bounce on every flush_local_bufs + every pop). With both phases emitting
+    // per iter that's 12 cross-core loads × thousands of iters per run, a
+    // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared
+    // snapshot, refreshed at most once per iteration. The complete-emit and
+    // dispatch-emit in the same iter both reuse the same shared sample; the
+    // big transitions (local→shared flush) still show up across iter boundaries.
+    static_assert(
+        L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES,
+        "queue snapshot width must match runtime resource shape count"
+    );
+    int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    bool iter_shared_sampled = false;
+    auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
+        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+            local_out[s] = static_cast<int16_t>(local_bufs[s].count);
+        }
+    };
+    auto get_or_sample_shared = [&]() -> const int16_t * {
+        if (!iter_shared_sampled) {
+            // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE
+            // is in the low thousands today but could grow with platform
+            // scaling — without clamp, sizes above 32767 wrap to negatives
+            // and silently corrupt the snapshot.
+            constexpr size_t kMax = static_cast<size_t>(std::numeric_limits<int16_t>::max());
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                const size_t qsize = sched_->ready_queues[s].size();
+                iter_shared_snapshot[s] = static_cast<int16_t>(std::min(qsize, kMax));
+            }
+            iter_shared_sampled = true;
+        }
+        return iter_shared_snapshot;
+    };
+    auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES],
+                                 int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
+        capture_local_snapshot(local_out);
+        const int16_t *shared_cached = get_or_sample_shared();
+        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++)
+            shared_out[s] = shared_cached[s];
+    };
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        capture_phase_end(phase_start_local, phase_start_shared);
+    }
+#endif
+
+    // Wall-clock timestamp of the last completed task on this thread.
+    // Updated on made_progress; consulted to decide whether the wall-clock
+    // budget for declaring a scheduler hang has elapsed. Initialized to
+    // "now" so the first budget cycle starts when this thread does, not at
+    // an undefined value.
+    uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
+    while (true) {
+        if (completed_.load(std::memory_order_acquire)) {
+            break;
+        }
+        bool made_progress = false;
+#if PTO2_PROFILING
+        CYCLE_COUNT_START();
+        l2_swimlane.sched_loop_count++;
+        uint64_t _t0_phase = _t0;
+        // Release is the only "no Complete/Dispatch bar" attribution we keep —
+        // emitted with its own span in the idle branch below. Iterations that
+        // only scan/poll show as blank gaps; the per-loop Poll/Scan bars (PR
+        // #1079 debug overlay) were removed since "scheduler is polling when
+        // there's nothing to do" carries no actionable signal.
+        // Per-iter lazy shared-queue snapshot: first phase emit in this iter
+        // pays the atomic-load cost, subsequent emits in the same iter reuse
+        // the cached value. Reset here so we re-sample exactly once per iter
+        // (or skip entirely on iters with no phase emit).
+        iter_shared_sampled = false;
+#endif
+        int32_t task_count = 0;
+        if (!tracker.has_any_running_cores()) {
+            LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count);
+            if (action == LoopAction::BREAK_LOOP) break;
+        }
+
+        if (!cores_released && orch_to_sched_) {
+            LoopAction action = handle_core_transition(cores_released);
+            if (action == LoopAction::BREAK_LOOP) break;
+        }
+
+#if PTO2_PROFILING
+        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+#endif
+
+        // Phase 1: Check running cores for completion
+        int32_t completed_this_turn = 0;
+
+        bool try_completed = tracker.has_any_running_cores();
+        if (try_completed) {
+            check_running_cores_for_completion(
+                thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress,
+                deferred_release_slot_states, deferred_release_count, local_bufs
+            );
+        }
+        if (completed_this_turn > 0) {
+#if PTO2_SCHED_PROFILING
+            sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed);
+#endif
+            int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
+            int32_t new_total = prev + completed_this_turn;
+            last_progress_count = new_total;
+            if (thread_idx == 0 && task_count > 0) {
+                if (new_total <= PROGRESS_VERBOSE_THRESHOLD ||
+                    new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) {
+                    LOG_INFO_V9(
+                        "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count,
+                        100.0 * new_total / task_count
+                    );
+                }
+            }
+        }
+
+        if (rt_ != nullptr && rt_->aicore_mailbox != nullptr &&
+            (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) {
+            AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(
+                rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count,
+                PTO2_DEFERRED_RELEASE_CAP
+#if PTO2_SCHED_PROFILING
+                ,
+                thread_idx
+#endif
+            );
+            if (poll_result.error_code != PTO2_ERROR_NONE) {
+                int32_t expected = PTO2_ERROR_NONE;
+                header->sched_error_code.compare_exchange_strong(
+                    expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire
+                );
+                completed_.store(true, std::memory_order_release);
+                break;
+            }
+            if (poll_result.completed > 0) {
+#if PTO2_SCHED_PROFILING
+                sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed);
+#endif
+                int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
+                int32_t new_total = prev + poll_result.completed;
+                last_progress_count = new_total;
+                made_progress = true;
+            }
+        }
+
+#if PTO2_PROFILING
+        if (!try_completed) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+        } else {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
+            // Emit on any completion work this iteration — a finished slot OR
+            // sub-block retires that did not finish a slot. The latter makes the
+            // SPMD harvest tail visible (count field = blocks processed this
+            // iteration; on a pure-retire iteration phase_complete_count is 0).
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES &&
+                (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
+                // Local depth is cheap (this thread's own buffer counter).
+                // Shared depth is NOT sampled here: complete's release_fanin
+                // pushes to local_bufs in the fast path (try_push succeeds
+                // until cap=64). Shared only changes on dispatch's flush
+                // path. Carrying phase_start_shared forward as end_shared
+                // is the right answer 99% of the time AND skips three
+                // contended atomic loads per emit.
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_local_snapshot(phase_end_local);
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count, /*pop_hit=*/0,
+                    /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                    // phase_start_shared unchanged — carried forward
+                }
+                _t0_phase = _t1;
+                l2_swimlane.phase_complete_count = 0;
+                l2_swimlane.phase_subretire_count = 0;
+            }
+        }
+#endif
+
+        bool try_pushed = false;
+
+        // Phase 2 drain check
+        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            handle_drain_mode(thread_idx);
+            continue;
+        }
+
+        // Phase 3: Drain wiring queue (thread 0 only)
+        int wired = 0;
+        if (thread_idx == 0) {
+            wired = sched_->drain_wiring_queue(orchestrator_done_);
+            if (wired > 0) {
+                made_progress = true;
+#if PTO2_SCHED_PROFILING
+                l2_swimlane.phase_wiring_count += wired;
+#endif
+            }
+        }
+#if PTO2_PROFILING
+        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
+        // Wire outer phase: emit one bar covering this iter's drain_wiring_queue
+        // pass when it wired any tasks. tasks_processed = wired count. Resolve
+        // does NOT nest under Wire — wiring only enqueues, the consumer release
+        // happens later in Complete/Dummy.
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && wired > 0) {
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_local_snapshot(phase_end_local);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Wire, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                static_cast<uint32_t>(wired), /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local, phase_start_shared,
+                phase_end_local, phase_start_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+            }
+            _t0_phase = _t1;
+        }
+#endif
+
+        // Phase 3b: Drain dummy ready queue (thread 0 only).
+        //
+        // Dependency-only tasks bypass AICore dispatch: they go through the
+        // scheduler so fanin/fanout edges stay consistent, but completion is
+        // signalled inline here. Pinned to thread 0 to avoid cross-thread
+        // races and to keep cache hot near the wiring drain above.
+        if (thread_idx == 0) {
+            constexpr int DUMMY_DRAIN_BATCH = 16;
+            PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
+            int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+#if PTO2_PROFILING
+            // Dummy outer phase: covers handling of all dummies popped this
+            // iter. Per-dummy DummyTask markers are emitted to a SEPARATE lane
+            // (Worker View AICPU_N) by the converter, so they do not nest
+            // under this bar. Resolve emits below DO land on the sched lane
+            // and nest under this Dummy outer by time containment.
+            uint64_t dummy_outer_t0 =
+                (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+            for (int di = 0; di < dummy_got; di++) {
+                PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
+
+                // ----- DummyTask phase: dummy "task" identity marker. --------
+                // The dummy has no AICore presence — start ≈ end (1 cycle
+                // wide, just "we identified it"). Converter renders this on
+                // Worker View's DUMMY_T{thread} lane so the DAG node is
+                // visually present. tasks_processed = task_token low 32 bits
+                // (= local_id within ring) so deps.json flow arrows can land.
+                // The Resolve work that follows is emitted separately below.
+#if PTO2_PROFILING
+                if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+                    uint64_t dummy_marker_t = get_sys_cnt_aicpu();
+                    uint32_t dummy_id_low32 = static_cast<uint32_t>(dummy_slot.task->task_id.raw & 0xFFFFFFFFu);
+                    l2_swimlane_aicpu_record_sched_phase(
+                        thread_idx, L2SwimlaneSchedPhaseKind::DummyTask, dummy_marker_t, dummy_marker_t,
+                        sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_id_low32
+                    );
+                }
+#endif
+
+                // ----- Resolve work: walk this dummy's consumer list. ------
+                // Same 1 µs filter as the main-path Resolve emit suppresses
+                // dummies whose consumer release runs sub-microsecond.
+#if PTO2_PROFILING
+                uint64_t dummy_resolve_t0 =
+                    (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+                // [[maybe_unused]] silences -Werror=unused-but-set-variable on
+                // the profiling-flags-smoke build path where PTO2_PROFILING is
+                // OFF and the Resolve emit below is excluded.
+                [[maybe_unused]] uint32_t dummy_consumers = 0;
+#if PTO2_SCHED_PROFILING
+                dummy_consumers = sched_->on_task_complete(dummy_slot, thread_idx, local_bufs).fanout_edges;
+#else
+                dummy_consumers = sched_->on_task_complete(dummy_slot, local_bufs);
+#endif
+#if PTO2_PROFILING
+                if (dummy_resolve_t0 != 0) {
+                    uint64_t dummy_resolve_t1 = get_sys_cnt_aicpu();
+                    constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
+                    if (dummy_resolve_t1 - dummy_resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
+                        l2_swimlane_aicpu_record_sched_phase(
+                            thread_idx, L2SwimlaneSchedPhaseKind::Resolve, dummy_resolve_t0, dummy_resolve_t1,
+                            sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_consumers
+                        );
+                    }
+                }
+#endif
+                // Dummy tasks have no subtasks to retire and no fanout pre-conditions
+                // beyond their own producers; release self-reference so the slot can
+                // reach CONSUMED once all consumers drain.
+                deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
+                if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
+                    while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                        (void)sched_->on_task_release(
+                            *deferred_release_slot_states[--deferred_release_count], thread_idx
+                        );
+#else
+                        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                    }
+                }
+                int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
+                last_progress_count = prev + 1;
+                cur_thread_completed++;
+            }
+            if (dummy_got > 0) {
+                made_progress = true;
+            }
+#if PTO2_PROFILING
+            // Emit Dummy outer over the whole dummy_drain pass. Span starts at
+            // dummy_outer_t0 (captured before the pop_batch) and ends at "now".
+            // tasks_processed = dummy_got. Advancing _t0_phase here makes the
+            // following Dispatch / EarlyDispatch / second-Complete bars start
+            // at this end.
+            if (dummy_outer_t0 != 0) {
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_local_snapshot(phase_end_local);
+                uint64_t dummy_outer_t1 = get_sys_cnt_aicpu();
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Dummy, dummy_outer_t0, dummy_outer_t1,
+                    l2_swimlane.sched_loop_count, static_cast<uint32_t>(dummy_got), /*pop_hit=*/0,
+                    /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                }
+                _t0_phase = dummy_outer_t1;
+                // We do NOT re-sync _t0/_t1 — the dummy span will be absorbed
+                // into the next CYCLE_COUNT_LAP accumulator. The phase-model
+                // anchor (_t0_phase) is the authoritative source for bar spans
+                // on the swimlane; the cycle accumulators are coarse aggregates.
+            }
+#endif
+        }
+
+        // Phase 4: MIX-strict-priority dispatch with phase-split and
+        // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+#if PTO2_PROFILING
+        uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+        dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+#if PTO2_PROFILING
+        // Emit Dispatch IMMEDIATELY after dispatch_ready_tasks so its span
+        // covers the actual publish work — not the trailing second-poll /
+        // early-dispatch time. (Pre-redesign the Dispatch emit lived at iter
+        // end with span extending past the second poll, which made finish_time
+        // events from the second poll fall under the Dispatch bar rather than
+        // a Complete bar of their own — confusing for trace consumers.)
+        if (dispatch_t0 != 0 && try_pushed && l2_swimlane.phase_dispatch_count > 0) {
+            uint64_t dispatch_t1 = get_sys_cnt_aicpu();
+            uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+            uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+            debug_assert(pop_hit_delta < (1ULL << 32));
+            debug_assert(pop_miss_delta < (1ULL << 32));
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_phase_end(phase_end_local, phase_end_shared);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, dispatch_t1, l2_swimlane.sched_loop_count,
+                l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
+                static_cast<uint32_t>(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local,
+                phase_end_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+                phase_start_shared[s] = phase_end_shared[s];
+            }
+            _t0_phase = dispatch_t1;
+            l2_swimlane.phase_dispatch_count = 0;
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
+        }
+#endif
+
+        // Phase 4b: early-dispatch onto spare cores, but ONLY when this thread is
+        // otherwise idle — nothing was dispatched this iteration AND no ready work is
+        // queued for any shape. Early-dispatch competes with normal dispatch for
+        // pending slots, so gating on "no ready work" keeps it from delaying a real
+        // ready task; skipping the producer-fanout scan when busy also removes its
+        // per-iteration cost (the discovery walk only runs on genuinely idle passes).
+        bool any_ready_work = try_pushed;
+        for (int s = 0; !any_ready_work && s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+            if (sched_->ready_queues[s].size() > 0 || local_bufs[s].count > 0) any_ready_work = true;
+        }
+#if PTO2_PROFILING
+        bool early_dispatch_record = l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES;
+        uint64_t early_dispatch_t0 = early_dispatch_record ? get_sys_cnt_aicpu() : 0;
+#endif
+        // Skip speculative early-dispatch under PMU: dispatch_ready_tasks already
+        // withholds PENDING dispatch when pmu_active to preserve single-issue PMU
+        // windows, and staging gated work into idle/pending slots would perturb the
+        // same windows.
+        [[maybe_unused]] int32_t staged_count =
+            (pmu_active || any_ready_work) ? 0 : try_speculative_early_dispatch(thread_idx);
+#if PTO2_PROFILING
+        // Emit an EarlyDispatch bar so a staging-dominated iteration is attributed
+        // to early-dispatch rather than disappearing into a blank gap.
+        if (early_dispatch_record && staged_count > 0) {
+            uint64_t early_dispatch_t1 = get_sys_cnt_aicpu();
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::EarlyDispatch, early_dispatch_t0, early_dispatch_t1,
+                sched_l2_swimlane_[thread_idx].sched_loop_count, static_cast<uint32_t>(staged_count)
+            );
+            // prepare_block_for_dispatch bumped phase_dispatch_count while staging;
+            // those blocks belong to this EarlyDispatch bar, so clear the counter
+            // before it leaks into the next Dispatch bar.
+            sched_l2_swimlane_[thread_idx].phase_dispatch_count = 0;
+            // Advance _t0_phase so the following second-poll's Complete bar
+            // starts at the EarlyDispatch end, not before it (otherwise their
+            // spans overlap and the outer-phase mutual-exclusion breaks).
+            _t0_phase = early_dispatch_t1;
+        }
+#endif
+
+        // Second completion poll. dispatch_ready_tasks + try_speculative_early_dispatch
+        // above can take several us in a busy window; a producer block that FINs
+        // during them would otherwise wait for the NEXT iteration's top-of-loop
+        // Phase-1 poll (the ~7us detection latency that delays a flagged
+        // producer's doorbell). Re-polling here observes those FINs immediately,
+        // so the doorbell fires this iteration. Idempotent (the poll is a poll);
+        // we drain deferred releases eagerly to keep the buffer from growing.
+#if PTO2_PROFILING
+        uint64_t complete2_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+        if (tracker.has_any_running_cores()) {
+            int32_t completed_2nd = 0;
+            check_running_cores_for_completion(
+                thread_idx, hank, completed_2nd, cur_thread_completed, made_progress, deferred_release_slot_states,
+                deferred_release_count, local_bufs
+            );
+            if (completed_2nd > 0) {
+#if PTO2_SCHED_PROFILING
+                sched_->tasks_completed.fetch_add(completed_2nd, std::memory_order_relaxed);
+#endif
+                completed_tasks_.fetch_add(completed_2nd, std::memory_order_relaxed);
+                last_progress_count = completed_tasks_.load(std::memory_order_relaxed);
+            }
+            // Eager drain so the second poll can't push deferred_release toward
+            // its cap between idle iterations.
+            while (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP - 96) {
+#if PTO2_SCHED_PROFILING
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+        }
+#if PTO2_PROFILING
+        // Complete2 outer phase: covers second-poll FIN observation. Without
+        // this emit, FIN counts from the second poll would carry over into the
+        // next iter's first-Complete bar and be displayed with a span that
+        // doesn't actually include those FINs' timestamps (visible mismatch
+        // between Complete bar span and per-task finish_time in Worker /
+        // Scheduler View).
+        if (complete2_t0 != 0 && (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
+            uint64_t complete2_t1 = get_sys_cnt_aicpu();
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_local_snapshot(phase_end_local);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Complete, complete2_t0, complete2_t1,
+                l2_swimlane.sched_loop_count, l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count,
+                /*pop_hit=*/0,
+                /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+            }
+            _t0_phase = complete2_t1;
+            l2_swimlane.phase_complete_count = 0;
+            l2_swimlane.phase_subretire_count = 0;
+        }
+
+        // Cycle-counter LAP for the iter tail. Dispatch's emit moved earlier
+        // (see Phase 4 above) so this branch only routes the time accumulator.
+        if (!try_pushed) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+        } else {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
+        }
+#endif
+
+#if !PTO2_PROFILING
+        (void)try_completed;
+        (void)try_pushed;
+#endif
+
+        if (made_progress) {
+            idle_iterations = 0;
+            last_progress_ts = get_sys_cnt_aicpu();
+        } else {
+#if PTO2_PROFILING
+            uint64_t rel_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && deferred_release_count > 0) ?
+                                  get_sys_cnt_aicpu() :
+                                  0;
+#endif
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+#if PTO2_PROFILING
+            // Release is a distinct operation from the poll scan — emit it with
+            // its own span (Perfetto nests it inside the surrounding poll/idle
+            // run by time-containment) rather than competing with poll for one
+            // per-iteration label.
+            if (rel_t0 != 0) {
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Release, rel_t0, get_sys_cnt_aicpu(),
+                    l2_swimlane.sched_loop_count, /*tasks_processed=*/0
+                );
+            }
+#endif
+            idle_iterations++;
+
+            if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) {
+                LoopAction action = check_idle_fatal_error(thread_idx, header, runtime);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            if (idle_iterations % STALL_LOG_INTERVAL == 0) {
+                log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
+            }
+            // Wall-clock budget gate, with two fatal-latch branches:
+            //
+            // 1. Self owns a RUNNING task — first-hand evidence the
+            //    dispatch is stuck. Latch.
+            // 2. No thread anywhere owns a RUNNING task AND tasks remain
+            //    unfinished — the system is in a pre-dispatch / WAIT-only
+            //    deadlock (e.g. dependency cycle). Ownerless idle threads
+            //    are the only observers; let this one latch on the global
+            //    evidence (`completed_tasks_ < total_tasks_` and
+            //    `no_thread_owns_running_task()`).
+            //
+            // Otherwise: a sibling thread owns a RUNNING task but hasn't
+            // hit its own budget yet (typical distributed startup-skew
+            // case) — refresh last_progress_ts and keep spinning. The
+            // STALL diagnostic above still fires periodically so
+            // observability is preserved.
+            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
+                bool self_owns = self_owns_running_task(thread_idx);
+                bool global_stuck = !self_owns && total_tasks_ > 0 &&
+                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
+                                    no_thread_owns_running_task();
+                if (self_owns || global_stuck) {
+                    // Latch the error + emergency_shutdown, then break to the
+                    // shared end-of-loop cleanup so the diagnostic buffers get
+                    // flushed to the host. An early return here would strand the
+                    // stuck task's already-dumped inputs and every completed
+                    // task's in/out records in the unflushed per-thread dump
+                    // buffer — exactly the state we need to triage the hang.
+                    timeout_rc = handle_timeout_exit(
+                        thread_idx, header, runtime, idle_iterations, last_progress_count
+#if PTO2_PROFILING
+                        ,
+                        l2_swimlane.sched_start_ts
+#endif
+                    );
+                    break;
+                }
+                last_progress_ts = get_sys_cnt_aicpu();
+            }
+            SPIN_WAIT_HINT();
+#if PTO2_PROFILING
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+            // _t0_phase advances through idle laps so the next emitted
+            // COMPLETE/DISPATCH bar starts at the iter it actually ran in, not
+            // at the start of the preceding idle stretch. The idle/poll time
+            // itself is attributed by the activity-fill below — no blanks.
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+                _t0_phase = _t1;
+            }
+#endif
+        }
+    }
+
+    // Drain any entries left in the deferred-release batch. The in-loop flush
+    // only fires on idle iterations and on buffer-full; a loop exit while the
+    // last iteration made progress can leave entries un-released. Drop them
+    // here so every consumed producer slot completes its on_task_release
+    // regardless of which loop-exit path fired.
+    while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+    }
+
+#if PTO2_PROFILING
+    // Final-drain: emit any pop_hit / pop_miss accrued since the last
+    // dispatch emit (typically the trailing idle loops while waiting for
+    // orchestrator_done_) as a zero-duration synthetic dispatch record so
+    // sum(record.pop_*) reconciles with the run-cumulative counter.
+    // Gate on SCHED_PHASES — at lower levels the phase buffer is never
+    // flushed (see below), so writing this record would be wasted work.
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+        debug_assert(final_pop_hit_delta < (1ULL << 32));
+        debug_assert(final_pop_miss_delta < (1ULL << 32));
+        if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
+            uint64_t t_now = get_sys_cnt_aicpu();
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_phase_end(phase_end_local, phase_end_shared);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0,
+                static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta),
+                phase_end_local, phase_end_shared, phase_end_local, phase_end_shared
+            );
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
+        }
+    }
+    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
+#endif
+
+#if PTO2_PROFILING
+    if (l2_swimlane.l2_swimlane_enabled) {
+        l2_swimlane_aicpu_flush(
+            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
+        );
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
+        }
+    }
+#endif
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_flush(thread_idx);
+    }
+#endif
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) {
+        pmu_aicpu_flush_buffers(
+            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
+        );
+    }
+#endif
+
+    return timeout_rc != 0 ? timeout_rc : cur_thread_completed;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h
new file mode 100644
index 000000000..f1dc5d7f8
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_TYPES_H
+#define SCHEDULER_TYPES_H
+
+#include <atomic>
+#include <cstdint>
+
+#include "common/core_type.h"
+#include "common/platform_config.h"
+#include "pto_runtime2_types.h"
+#include "spin_hint.h"
+
+// =============================================================================
+// Profiling macros (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+#include "aicpu/device_time.h"
+// Accumulated nanoseconds per sub-step
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#else
+#define CYCLE_COUNT_START()
+#define CYCLE_COUNT_LAP(acc)
+#endif
+
+// =============================================================================
+// Scheduler constants
+// =============================================================================
+
+constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
+
+// Periodic cadence (in idle iterations) for emitting the per-thread STALL
+// diagnostic while no progress is being made. Purely an observability knob,
+// independent of the wall-clock timeout below: small enough to fire a few times
+// before the budget expires, large enough not to flood device_log.
+constexpr int32_t STALL_LOG_INTERVAL = 480000;
+constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
+
+// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
+// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS
+// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread
+// diagnostic cadence.
+//
+// Using wall-clock here is load-bearing for distributed runs: with per-thread
+// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
+// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
+// same iteration count. The fast spinner racing ahead and latching fatal
+// kills the slower-but-correct poller mid-poll — see the distributed
+// startup-skew scenario in issue #897.
+//
+// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h)
+// because the safe value differs per variant: onboard trims it to 2 s so the
+// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight
+// partial output) before STARS reaps the op and poisons the context (chain:
+// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to
+// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant
+// rationale.
+constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
+    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+constexpr int32_t STALL_DUMP_READY_MAX = 8;
+constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
+constexpr int32_t STALL_DUMP_CORE_MAX = 8;
+constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
+constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+
+// =============================================================================
+// Control flow signal from cold-path helpers back to the main dispatch loop.
+// =============================================================================
+
+enum class LoopAction : int8_t {
+    NONE,        // cold path did not trigger; proceed normally
+    BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
+};
+
+// =============================================================================
+// Per-core state: one cache line per core to eliminate false sharing
+// and co-locate all hot-path fields for minimal cache misses.
+// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup).
+// =============================================================================
+
+struct alignas(64) CoreExecState {
+    // --- Hot fields (completion + dispatch, every iteration) ---
+    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
+    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
+    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
+    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
+    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
+    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
+    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
+    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
+    uint8_t pad0_[2];                       // offset 38: alignment padding
+    // Precomputed COND register pointer; resolved once in handshake so the
+    // hot completion poll does a single volatile load instead of recomputing
+    // reg_base + reg_offset(COND) on every iteration.
+    volatile uint32_t *cond_ptr;  // offset 40: precomputed pointer to COND register
+#if PTO2_PROFILING
+    // --- Profiling fields (dispatch path, compile-time gated) ---
+    uint64_t running_dispatch_timestamp;  // offset 48: AICPU dispatch timestamp for running task
+    uint64_t pending_dispatch_timestamp;  // offset 56: AICPU dispatch timestamp for pending task
+#else
+    // --- Cold fields (init/diagnostics only, never in hot path) ---
+    int32_t worker_id;          // offset 48: index in runtime.workers[]
+    uint32_t physical_core_id;  // offset 52: hardware physical core ID
+    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
+    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
+#endif
+};
+static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
+
+// =============================================================================
+// CoreTracker: cluster-based bitmask tracker for idle/running core state.
+//
+// core_states_ encodes per-cluster core idle/running in 3 bits per cluster:
+//   bit i*3   = AIC of cluster i   (1 = idle, 0 = running)
+//   bit i*3+1 = AIV0 of cluster i
+//   bit i*3+2 = AIV1 of cluster i
+// Max 21 clusters per tracker (63 bits in uint64_t).
+// =============================================================================
+
+class alignas(64) CoreTracker {
+public:
+    static inline int32_t MAX_CORE_PER_THREAD = 63;
+    static constexpr int32_t MAX_CLUSTERS = 63 / 3;
+
+public:
+    CoreTracker() = default;
+
+    class BitStates {
+    public:
+        BitStates() = default;
+
+        explicit BitStates(uint64_t states) :
+            states_(states) {}
+        void init() { states_ = 0; }
+
+        BitStates operator~() const { return BitStates(~states_); }
+        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
+        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
+        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
+        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
+        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
+        void operator&=(const BitStates &other) { states_ &= other.states_; }
+        void operator|=(const BitStates &other) { states_ |= other.states_; }
+        void operator^=(const BitStates &other) { states_ ^= other.states_; }
+
+        bool has_value() const { return states_ > 0; }
+        int32_t count() const { return __builtin_popcountll(states_); }
+        void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); }
+
+        // Extract the lowest set bit from mask, clear it, and return its position.
+        // Returns -1 if mask is empty.
+        int32_t pop_first() {
+            if (states_ == 0) return -1;
+            int32_t pos = __builtin_ctzll(states_);
+            states_ &= states_ - 1;
+            return pos;
+        }
+
+    private:
+        uint64_t states_{0};
+    };
+
+public:
+    void init(int32_t cluster_count) {
+        cluster_count_ = cluster_count;
+        aic_mask_.init();
+        aiv_mask_.init();
+        pending_occupied_.init();
+        for (int32_t i = 0; i < cluster_count; i++) {
+            aic_mask_ |= BitStates(1ULL << (i * 3));
+            aiv_mask_ |= BitStates(6ULL << (i * 3));
+        }
+        core_states_ = aic_mask_ | aiv_mask_;
+    }
+
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
+        core_id_map_[cluster_idx * 3] = aic_wid;
+        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
+        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
+    }
+
+    int32_t get_cluster_count() const { return cluster_count_; }
+
+    // --- Running core queries ---
+
+    template <CoreType CT>
+    bool has_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).has_value();
+        } else {
+            return ((~core_states_) & aiv_mask_).has_value();
+        }
+    }
+
+    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
+
+    template <CoreType CT>
+    int32_t get_running_count() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).count();
+        } else {
+            return ((~core_states_) & aiv_mask_).count();
+        }
+    }
+
+    // Return an opaque bitmask for iterating running cores of a given type.
+    // Use pop_first() to extract core bit offsets one at a time.
+    template <CoreType CT>
+    BitStates get_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return (~core_states_) & aic_mask_;
+        } else {
+            return (~core_states_) & aiv_mask_;
+        }
+    }
+
+    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
+    BitStates get_cluster_offset_states() const { return aic_mask_; }
+
+    // --- Cluster matching ---
+
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {
+        switch (shape) {
+        case PTO2ResourceShape::AIC:
+            return core_states_ & aic_mask_;
+        case PTO2ResourceShape::AIV:
+            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
+        case PTO2ResourceShape::MIX:
+            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
+        case PTO2ResourceShape::DUMMY:
+            // DUMMY tasks never reach the core-tracker dispatch path; they are
+            // completed inline by resolve_and_dispatch via dummy_ready_queue.
+            return BitStates(0ULL);
+        }
+        return BitStates(0ULL);
+    }
+
+    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
+
+    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
+
+    bool is_aic_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv0_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv1_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
+    }
+
+    // --- State mutation ---
+
+    // Toggle bit at the given bit offset (running <-> idle)
+    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
+
+    // --- Pending-occupied tracking ---
+    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
+    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
+
+    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
+    void clear_pending_occupied(int32_t bit_offset) {
+        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
+    }
+
+    // --- Two-phase dispatch queries ---
+
+    // Idle dispatch: returns bit offsets of idle cores for the given shape.
+    // For AIC: 1 bit per cluster (core offset == cluster offset).
+    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
+    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
+    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
+    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
+    // would incorrectly block AIV idle dispatch on the same cluster.
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::AIC) {
+            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        }
+        if (shape == PTO2ResourceShape::AIV) {
+            return core_states_ & aiv_mask_;
+        }
+        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
+    }
+
+    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
+    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
+    // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask.
+    enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT };
+
+    // A MIX block must place all cores named by active_mask the same way:
+    // all idle means running placement, all running means pending placement,
+    // and any mixed state is retried later.
+    MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const {
+        BitStates used(0ULL);
+        if (core_mask & PTO2_SUBTASK_MASK_AIC) {
+            used |= BitStates(1ULL << cluster_offset);
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV0) {
+            used |= BitStates(1ULL << (cluster_offset + 1));
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV1) {
+            used |= BitStates(1ULL << (cluster_offset + 2));
+        }
+        if (!used.has_value() || (pending_occupied_ & used).has_value()) {
+            return MixPlacement::REJECT;
+        }
+
+        BitStates idle = core_states_ & used;
+        if (idle.count() == used.count()) {
+            return MixPlacement::RUNNING;
+        }
+        if (!idle.has_value()) {
+            return MixPlacement::PENDING;
+        }
+        return MixPlacement::REJECT;
+    }
+
+    BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const {
+        BitStates result(0ULL);
+        BitStates candidates = get_cluster_offset_states();
+        while (candidates.has_value()) {
+            int32_t cluster_offset = candidates.pop_first();
+            if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) {
+                result |= BitStates(1ULL << cluster_offset);
+            }
+        }
+        return result;
+    }
+
+    int32_t count_mix_running_clusters(uint8_t core_mask) const {
+        return get_mix_running_cluster_offset_states(core_mask).count();
+    }
+
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::MIX) {
+            // Shape-level query kept conservative for legacy callers/tests.
+            // The real MIX dispatch path applies active_mask in classify_mix_cluster().
+            // Any core without a pending payload can accept a dispatch (idle or running).
+            BitStates available = ~pending_occupied_;
+            BitStates mix_available =
+                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Pending MIX can only reuse a fully-running cluster. Partially-running clusters
+            // could split one MIX block across immediate and pending placement.
+            BitStates running = ~core_states_;
+            BitStates cluster_all_running =
+                (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_);
+            return mix_available & cluster_all_running;
+        }
+        if (shape == PTO2ResourceShape::AIC) {
+            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+        }
+        // AIV
+        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
+    }
+
+    // --- Two-phase dispatch unified query ---
+
+    enum class DispatchPhase : uint8_t { IDLE, PENDING };
+
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
+                                                get_pending_core_offset_states(shape);
+    }
+
+    // --- Bit offset <-> worker_id mapping ---
+
+    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
+
+    const int32_t *core_ids() const { return core_id_map_; }
+    int32_t core_num() const { return cluster_count_ * 3; }
+
+private:
+    int32_t cluster_count_;
+    BitStates aic_mask_;
+    BitStates aiv_mask_;
+    BitStates core_states_;
+    BitStates pending_occupied_;
+    int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
+};
+
+// =============================================================================
+// SlotTransition: pure event signals from a single register poll.
+// true = event occurred, false = no-op (maintain current state).
+// =============================================================================
+
+struct SlotTransition {
+    bool running_done = false;   // running task completed
+    bool pending_done = false;   // pending task completed
+    bool running_freed = false;  // running slot data should be released
+    bool pending_freed = false;  // pending_occupied can be cleared
+    bool matched = false;        // some case was hit (otherwise skip apply)
+};
+
+// =============================================================================
+// Profiling counters (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+struct alignas(64) SchedL2SwimlaneCounters {
+    bool l2_swimlane_enabled{false};
+    uint64_t sched_start_ts{0};
+    uint64_t sched_complete_cycle{0};
+    uint64_t sched_dispatch_cycle{0};
+    uint64_t sched_wiring_cycle{0};
+    uint64_t sched_idle_cycle{0};
+    uint64_t sched_loop_count{0};
+    uint32_t phase_complete_count{0};
+    // Sub-block retires that did NOT finish a slot (SPMD blocks of a multi-block
+    // task retiring one at a time). Counted separately so the Complete-phase
+    // emit can fire on poll iterations that only retired sub-blocks — otherwise
+    // the serial-harvest tail of an SPMD slot is invisible (no slot completes
+    // until the last block, leaving the scheduler lane blank for that window).
+    uint32_t phase_subretire_count{0};
+    uint32_t phase_dispatch_count{0};
+    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
+    // l2_swimlane_level_ >= SCHED_PHASES.
+    uint64_t pop_hit{0};
+    uint64_t pop_miss{0};
+    uint64_t pop_hit_at_last_emit{0};
+    uint64_t pop_miss_at_last_emit{0};
+#if PTO2_SCHED_PROFILING
+    uint32_t phase_wiring_count{0};
+    uint64_t complete_probe_count{0};
+    uint64_t complete_hit_count{0};
+    uint64_t sched_complete_perf_cycle{0};
+    uint64_t sched_dispatch_pop_cycle{0};
+    uint64_t sched_dispatch_setup_cycle{0};
+#endif
+    void reset() { *this = SchedL2SwimlaneCounters{}; }
+};
+#endif
+
+// =============================================================================
+// sync_start drain coordination
+// =============================================================================
+
+// When sync_start_pending != 0, all scheduler threads skip dispatch
+// (only process completions) until the drain worker finishes launching all blocks.
+struct alignas(64) SyncStartDrainState {
+    std::atomic<int32_t> sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic<int32_t> drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic<uint32_t> drain_ack_mask{0};       // bit per thread; all-set = all threads reached ack barrier
+    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
+    int32_t _pad[10];
+};
+static_assert(sizeof(SyncStartDrainState) == 64);
+
+#endif  // SCHEDULER_TYPES_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..0ee5919ce
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+static bool sum_ring_heap_sizes(const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], uint64_t *total) {
+    uint64_t sum = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (heap_sizes[r] > std::numeric_limits<uint64_t>::max() - sum) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return false;
+        }
+        sum += heap_sizes[r];
+    }
+    *total = sum;
+    return true;
+}
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
+    // first slot cannot false-share with whatever region sits in front of us
+    // (e.g. orchestrator tensormap heads written by the orch thread).
+    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // Address the slots region for data writes without storing the pointer in
+    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
+    // ring stores the device address of the SM ring header — pure offset
+    // arithmetic, no SM load.
+    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+    last_task_alive = 0;
+    advance_lock.store(0, std::memory_order_relaxed);
+#if PTO2_PROFILING
+    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
+    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
+#endif
+
+    // Per-slot SM-side initialization (bind_ring + reset_for_reuse +
+    // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle::
+    // init_header_per_ring so the AICPU performs it during SM reset; host
+    // prebuilt-arena init skips SM access here.
+
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return reserve_layout(arena, dep_pool_capacities);
+}
+
+PTO2SchedulerLayout
+PTO2SchedulerState::reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]) {
+    PTO2SchedulerLayout layout{};
+    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    layout.off_early_dispatch_queue_slots = ready_queue_reserve_layout(arena, PTO2_EARLY_DISPATCH_QUEUE_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        // Force a cache-line base so writes from scheduler thread 0 (sole
+        // writer of this ring's dep_pool) do not invalidate adjacent
+        // multi-threaded regions like ready_queue.slots.
+        layout.off_dep_pool_entries[r] =
+            arena.reserve(static_cast<size_t>(dep_pool_capacities[r]) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
+    }
+    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return layout;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    PTO2SchedulerState *sched = this;
+    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    sched->tasks_completed.store(0, std::memory_order_relaxed);
+    sched->tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        if (!ready_queue_init_data_from_layout(
+                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
+            )) {
+            return false;
+        }
+    }
+    if (!ready_queue_init_data_from_layout(
+            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
+        )) {
+        return false;
+    }
+    if (!ready_queue_init_data_from_layout(
+            &sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots, PTO2_EARLY_DISPATCH_QUEUE_SIZE
+        )) {
+        return false;
+    }
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
+        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacities[r]) * sizeof(PTO2DepListEntry));
+        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacities[r], orch_err);
+    }
+
+    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    sched->wiring.batch_count = 0;
+    sched->wiring.batch_index = 0;
+    sched->wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    PTO2SchedulerState *sched = this;
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
+    }
+    ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    ready_queue_wire_arena_pointers(&sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        sched->ring_sched_states[r].dep_pool.base =
+            static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
+    }
+    sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    PTO2SchedulerState *sched = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        sched->ring_sched_states[r].destroy();
+        sched->ring_sched_states[r].dep_pool.base = nullptr;
+    }
+    sched->wiring.queue.destroy();
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        ready_queue_destroy(&sched->ready_queues[i]);
+    }
+    ready_queue_destroy(&sched->dummy_ready_queue);
+    ready_queue_destroy(&sched->early_dispatch_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return reserve_layout(arena, task_window_sizes, dep_pool_capacities);
+}
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2OrchestratorLayout layout{};
+    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        const size_t fanin_pool_bytes =
+            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+
+        always_assert(task_window_sizes[r] > 0 && (task_window_sizes[r] & (task_window_sizes[r] - 1)) == 0);
+        const size_t seen_epoch_bytes =
+            PTO2_ALIGN_UP(static_cast<size_t>(task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE);
+        layout.off_fanin_seen_epoch[r] = arena.reserve(seen_epoch_bytes, PTO2_ALIGN_SIZE);
+    }
+    layout.off_scope_tasks =
+        arena.reserve(static_cast<size_t>(layout.scope_tasks_cap) * sizeof(uintptr_t), alignof(PTO2TaskSlotState *));
+    layout.off_scope_begins =
+        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
+    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return layout;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        heap_sizes[r] = heap_size;
+        task_window_sizes[r] = task_window_size;
+    }
+    return init_data_from_layout(layout, arena, sm_dev_base, gm_heap, heap_sizes, task_window_sizes);
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    auto *orch = this;
+    *orch = PTO2OrchestratorState{};
+
+    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+    orch->gm_heap_base = gm_heap;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return false;
+    }
+    orch->gm_heap_size = total_heap_size;
+    orch->fatal = false;
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    uint64_t heap_offset = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + heap_offset;
+        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+
+        orch->rings[r].task_allocator.init(
+            task_descs_dev, static_cast<int32_t>(task_window_sizes[r]), cur_idx_dev, last_alive_dev, ring_heap_base,
+            heap_sizes[r], orch_err
+        );
+        heap_offset += heap_sizes[r];
+
+        const size_t fanin_pool_bytes = PTO2_ALIGN_UP(
+            static_cast<size_t>(layout.dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE
+        );
+        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        memset(fanin_entries, 0, fanin_pool_bytes);
+        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacities[r], orch_err);
+
+        const size_t seen_epoch_bytes = PTO2_ALIGN_UP(
+            static_cast<size_t>(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE
+        );
+        auto *seen_epoch = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
+        memset(seen_epoch, 0, seen_epoch_bytes);
+        orch->fanin_seen_epoch[r] = seen_epoch;
+    }
+
+    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    orch->scope_tasks_size = 0;
+    orch->scope_tasks_capacity = layout.scope_tasks_cap;
+    orch->scope_stack_top = -1;
+    orch->scope_stack_capacity = layout.scope_stack_capacity;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        orch->rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        orch->fanin_seen_epoch[r] = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
+    }
+    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    orch->scheduler = scheduler_arg;
+}
+
+void PTO2OrchestratorState::destroy() {
+    auto *orch = this;
+    orch->tensor_map.destroy();
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        orch->rings[r].fanin_pool.base = nullptr;
+        orch->fanin_seen_epoch[r] = nullptr;
+    }
+    orch->scope_tasks = nullptr;
+    orch->scope_begins = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = 0;
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return runtime_reserve_layout(arena, task_window_sizes, heap_sizes, dep_pool_capacities);
+}
+
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2RuntimeArenaLayout layout{};
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.task_window_sizes[r] = task_window_sizes[r];
+        layout.heap_sizes[r] = heap_sizes[r];
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes_i32[r] = static_cast<int32_t>(task_window_sizes[r]);
+    }
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+
+    layout.arena_size = arena.total_size();
+    return layout;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        heap_sizes[r] = heap_size;
+    }
+    return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, 0, gm_heap_dev_base, heap_sizes);
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return nullptr;
+    }
+    rt->gm_heap_size = total_heap_size;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+
+    if (!rt->orchestrator.init_data_from_layout(
+            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.task_window_sizes
+        )) {
+        return nullptr;
+    }
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
+        return nullptr;
+    }
+
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
+    if (!rt) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;
+    rt->sm_handle = nullptr;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp
new file mode 100644
index 000000000..d704bd85d
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Implementation
+ *
+ * Implements shared memory allocation, initialization, and management
+ * for Orchestrator-Scheduler communication.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_shared_memory.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include "common/unified_log.h"
+
+// =============================================================================
+// Size Calculation
+// =============================================================================
+
+uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+    }
+    return calculate_size_per_ring(task_window_sizes);
+}
+
+uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    uint64_t size = 0;
+
+    // Header (aligned to cache line)
+    size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+    // Per-ring task descriptors and payloads
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+
+    return size;
+}
+
+// =============================================================================
+// Creation and Destruction
+// =============================================================================
+
+void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    char *ptr = (char *)sm_base;
+
+    // Header
+    header = (PTO2SharedMemoryHeader *)ptr;
+    ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+    // Per-ring task descriptors, payloads, and slot states
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+
+        ring.task_payloads = (PTO2TaskPayload *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+
+        ring.slot_states = (PTO2TaskSlotState *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+}
+
+void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+    }
+    setup_pointers_per_ring(task_window_sizes);
+}
+
+bool PTO2SharedMemoryHandle::init(
+    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
+) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = heap_size;
+    }
+    return init_per_ring(sm_base_arg, sm_size_arg, task_window_sizes, heap_sizes);
+}
+
+bool PTO2SharedMemoryHandle::init_per_ring(
+    void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    if (!sm_base_arg || sm_size_arg == 0) return false;
+    if (sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false;
+
+    sm_base = sm_base_arg;
+    sm_size = sm_size_arg;
+    is_owner = false;
+    setup_pointers_per_ring(task_window_sizes);
+    init_header_per_ring(task_window_sizes, heap_sizes);
+    return true;
+}
+
+PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) {
+    const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
+    const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
+    if (arena.commit() == nullptr) return nullptr;
+
+    auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
+    memset(handle, 0, sizeof(*handle));
+    void *buffer = arena.region_ptr(off_buffer);
+    memset(buffer, 0, static_cast<size_t>(buffer_size));
+    if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
+    return handle;
+}
+
+void PTO2SharedMemoryHandle::destroy() {
+    // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
+    // calling destroy on them is a no-op so existing callers stay safe.
+    if (is_owner && sm_base) {
+        free(sm_base);
+        free(this);
+    }
+}
+
+// =============================================================================
+// Initialization
+// =============================================================================
+//
+// no need init data in pool, init pool data when used
+void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = heap_size;
+    }
+    init_header_per_ring(task_window_sizes, heap_sizes);
+}
+
+void PTO2SharedMemoryHandle::init_header_per_ring(
+    const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    // Per-ring flow control (start at 0)
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        header->rings[r].fc.init();
+    }
+
+    header->orchestrator_done.store(0, std::memory_order_relaxed);
+
+    // Per-ring layout info
+    uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        header->rings[r].task_window_size = task_window_sizes[r];
+        header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
+        header->rings[r].heap_size = heap_sizes[r];
+        header->rings[r].task_descriptors_offset = offset;
+        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+
+    header->total_size = sm_size;
+    header->graph_output_ptr.store(0, std::memory_order_relaxed);
+    header->graph_output_size.store(0, std::memory_order_relaxed);
+
+    // Error reporting
+    header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    header->sched_error_bitmap.store(0, std::memory_order_relaxed);
+    header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // bind_ring() pins the ring_id (slot-invariant after this point);
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
+            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+            ring.slot_states[i].reset_for_reuse();
+            ring.slot_states[i].fanin_count = 0;
+            ring.slot_states[i].active_mask = ActiveMask{};
+        }
+    }
+}
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2SharedMemoryHandle::print_layout() {
+    if (!header) return;
+
+    PTO2SharedMemoryHeader *h = header;
+
+    LOG_INFO_V0("=== PTO2 Shared Memory Layout ===");
+    LOG_INFO_V0("Base address:       %p", sm_base);
+    LOG_INFO_V0("Total size:         %" PRIu64 " bytes", h->total_size);
+    LOG_INFO_V0("Ring depth:         %d", PTO2_MAX_RING_DEPTH);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        LOG_INFO_V0("Ring %d:", r);
+        LOG_INFO_V0("  task_window_size: %" PRIu64, h->rings[r].task_window_size);
+        LOG_INFO_V0("  heap_size:        %" PRIu64 " bytes", h->rings[r].heap_size);
+        LOG_INFO_V0(
+            "  descriptors_off:  %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset,
+            h->rings[r].task_descriptors_offset
+        );
+        LOG_INFO_V0("  current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire));
+        LOG_INFO_V0("  last_task_alive:  %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire));
+    }
+    LOG_INFO_V0("orchestrator_done:  %d", h->orchestrator_done.load(std::memory_order_acquire));
+    LOG_INFO_V0("Error state:");
+    LOG_INFO_V0("  orch_error_code:    %d", h->orch_error_code.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_code:   %d", h->sched_error_code.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed));
+    LOG_INFO_V0("================================");
+}
+
+bool PTO2SharedMemoryHandle::validate() {
+    if (!sm_base) return false;
+    if (!header) return false;
+
+    PTO2SharedMemoryHeader *h = header;
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (!h->rings[r].fc.validate(this, r)) return false;
+    }
+
+    return true;
+}
+
+bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const {
+    if (!handle) return false;
+    if (!handle->header) return false;
+    if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false;
+
+    const PTO2SharedMemoryHeader *h = handle->header;
+
+    // Check that offsets are within bounds
+    if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false;
+
+    // Check pointer alignment
+    if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false;
+
+    // Check flow control pointer sanity
+    int32_t current = current_task_index.load(std::memory_order_acquire);
+    int32_t last_alive = last_task_alive.load(std::memory_order_acquire);
+    if (current < 0) return false;
+    if (last_alive < 0) return false;
+
+    return true;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp
new file mode 100644
index 000000000..b99c67233
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - TensorMap Implementation
+ *
+ * Implements TensorMap with ring buffer pool, lazy invalidation,
+ * and chain truncation optimization.
+ *
+ * Key features:
+ * 1. O(1) insert at bucket head
+ * 2. O(valid_entries) lookup with chain truncation
+ * 3. Automatic stale entry cleanup during lookup
+ * 4. Periodic explicit cleanup for long chains
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_tensormap.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "common/unified_log.h"
+
+// =============================================================================
+// TensorMap Lookup Chain Length Statistics (compile-time toggle)
+// =============================================================================
+#if PTO2_TENSORMAP_PROFILING
+uint64_t g_lookup_chain_total = 0;
+uint64_t g_lookup_count = 0;
+int32_t g_lookup_chain_max = 0;
+uint64_t g_lookup_overlap_checks = 0;
+uint64_t g_lookup_overlap_hits = 0;
+uint64_t g_insert_count = 0;
+#endif
+
+// =============================================================================
+// Initialization and Destruction
+// =============================================================================
+
+PTO2TensorMapLayout PTO2TensorMap::reserve_layout(
+    DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size,
+    const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    // num_buckets must be a power of two for the hash truncation to work.
+    always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
+
+    PTO2TensorMapLayout layout{};
+    layout.num_buckets = new_num_buckets;
+    layout.pool_size = new_pool_size;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.task_window_sizes[r] = new_task_window_sizes[r];
+    }
+
+    layout.off_buckets = arena.reserve(
+        static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
+    );
+    layout.off_entry_pool =
+        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
+    layout.off_free_entry_list =
+        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.off_task_entry_heads[r] = arena.reserve(
+            static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
+        );
+    }
+    return layout;
+}
+
+PTO2TensorMapLayout
+PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
+}
+
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    num_buckets = layout.num_buckets;
+    pool_size = layout.pool_size;
+
+    // Address arena regions for data writes; do not store these in struct
+    // fields (wire_arena_pointers does that).
+    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+
+    // buckets[]: empty == nullptr.
+    for (int32_t i = 0; i < num_buckets; i++) {
+        buckets_arena[i] = nullptr;
+    }
+
+    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
+    // The pool's persistent invariant after init is "bucket_index == -1 means
+    // not linked", set explicitly below.
+    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    for (int32_t i = 0; i < pool_size; i++) {
+        entry_pool_arena[i].bucket_index = -1;
+        entry_pool_arena[i].next_in_bucket = nullptr;
+        entry_pool_arena[i].prev_in_bucket = nullptr;
+        entry_pool_arena[i].next_in_task = nullptr;
+        entry_pool_arena[i].prev_in_task = nullptr;
+        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
+    }
+
+    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+    // only after entries are freed back, so the body of the array stays as 0.
+    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+
+    next_entry_idx = 0;
+    free_num = 0;
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
+            heads_arena[i] = nullptr;
+        }
+        task_window_sizes[r] = layout.task_window_sizes[r];
+        last_task_alives[r] = 0;
+        last_cleanup[r] = 0;
+    }
+
+    return true;
+}
+
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+}
+
+void PTO2TensorMap::destroy() {
+    // Arena owns the backing memory; here we only forget our pointers so any
+    // stray post-destroy access trips a nullptr dereference instead of reading
+    // a recycled allocation.
+    buckets = nullptr;
+    entry_pool = nullptr;
+    free_entry_list = nullptr;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = nullptr;
+    }
+}
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2TensorMap::print_stats() {
+    int32_t valid = 0;
+    int32_t stale = 0;
+    int32_t empty_buckets = 0;
+    int32_t max_chain = 0;
+    int64_t total_chain = 0;
+    int32_t non_empty_buckets = 0;
+
+    // Count entries
+    for (int32_t i = 0; i < pool_size; i++) {
+        if (entry_pool[i].bucket_index != -1) {
+            if (entry_valid(entry_pool[i])) {
+                valid++;
+            } else {
+                stale++;
+            }
+        }
+    }
+
+    // Count bucket stats
+    for (int32_t b = 0; b < num_buckets; b++) {
+        int32_t chain_len = 0;
+        auto cur_entry = buckets[b];
+
+        while (cur_entry != nullptr) {
+            chain_len++;
+            cur_entry = cur_entry->next_in_bucket;
+        }
+
+        if (chain_len == 0) {
+            empty_buckets++;
+        } else {
+            non_empty_buckets++;
+            total_chain += chain_len;
+            if (chain_len > max_chain) {
+                max_chain = chain_len;
+            }
+        }
+    }
+
+    LOG_INFO_V0("=== TensorMap Statistics ===");
+    LOG_INFO_V0("Pool size:           %d", pool_size);
+    LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx);
+    LOG_INFO_V0("Pool free_num:       %d", free_num);
+    LOG_INFO_V0("Num buckets:         %d", num_buckets);
+    LOG_INFO_V0("Valid entries:       %d", valid);
+    LOG_INFO_V0("Stale entries:       %d", stale);
+    LOG_INFO_V0("Empty buckets:       %d", empty_buckets);
+    LOG_INFO_V0("Max chain len:       %d", max_chain);
+    LOG_INFO_V0("Avg chain len:       %.2f", non_empty_buckets > 0 ? (float)total_chain / non_empty_buckets : 0);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]);
+    }
+    LOG_INFO_V0("============================");
+}
+
+int32_t PTO2TensorMap::valid_count() {
+    int32_t count = 0;
+
+    for (int32_t i = 0; i < pool_size; i++) {
+        if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) {
+            count++;
+        }
+    }
+
+    return count;
+}
+
+void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) {
+    auto ring_id = task_id.ring();
+    auto local_id = task_id.local();
+    sync_validity(ring_id, sm_last_task_alive);
+
+    // Only attempt cleanup when last_task_alive has actually advanced;
+    // otherwise cleanup_retired would empty-loop and we'd spin forever.
+    auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
+    if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) {
+        cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
+        last_cleanup[ring_id] = sm_last_task_alive;
+    }
+}
+
+// =============================================================================
+// TensorMap Lookup Profiling
+// =============================================================================
+#if PTO2_TENSORMAP_PROFILING
+PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
+    PTO2TensorMapProfilingData d;
+    d.lookup_chain_total = g_lookup_chain_total;
+    d.lookup_count = g_lookup_count;
+    d.lookup_chain_max = g_lookup_chain_max;
+    d.overlap_checks = g_lookup_overlap_checks;
+    d.overlap_hits = g_lookup_overlap_hits;
+    d.insert_count = g_insert_count;
+
+    // Reset
+    g_lookup_chain_total = 0;
+    g_lookup_count = 0;
+    g_lookup_chain_max = 0;
+    g_lookup_overlap_checks = 0;
+    g_lookup_overlap_hits = 0;
+    g_insert_count = 0;
+    return d;
+}
+#endif
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp
new file mode 100644
index 000000000..d19e52724
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Class - Implementation
+ *
+ * Device execution and handshake control.
+ * Task graph construction is handled by PTO2Runtime.
+ */
+
+#include "runtime.h"
+
+#include "common/unified_log.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// =============================================================================
+// Constructor
+// =============================================================================
+
+Runtime::Runtime() {
+    // NOTE: host_api is initialized in InitRuntime() (host-only code)
+    // because the CApi functions don't exist when compiled for device.
+
+    // Initialize handshake buffers
+    memset(workers, 0, sizeof(workers));
+    worker_count = 0;
+    aicpu_thread_num = 1;
+    ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+    orch_to_sched = false;
+
+    // fully_distributed_within_core handoff fields
+    dist.core_main_fn = 0;
+    dist.go = 0;
+    dist.num_workers = 0;
+    dist.done_count = 0;
+
+    // Initialize device orchestration state
+    gm_sm_ptr_ = nullptr;
+    gm_heap_ptr_ = nullptr;
+    slot_states_ptr_ = nullptr;
+    orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
+
+    // Initialize device orchestration SO binary
+    dev_orch_so_addr_ = 0;
+    dev_orch_so_size_ = 0;
+    active_callable_id_ = -1;
+    register_new_callable_id_ = false;
+    device_orch_func_name_[0] = '\0';
+    device_orch_config_name_[0] = '\0';
+
+    // Initialize kernel binary tracking
+    registered_kernel_count_ = 0;
+
+    // Initialize function address mapping
+    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
+        func_id_to_addr_[i] = 0;
+    }
+}
+
+// =============================================================================
+// Device orchestration
+// =============================================================================
+
+void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; }
+void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; }
+const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; }
+void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; }
+void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
+void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
+void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
+
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    prebuilt_arena_base_ = arena_base;
+    prebuilt_runtime_offset_ = runtime_off;
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
+
+// Device orchestration SO metadata (bytes live in a separate device buffer
+// owned by DeviceRunner; only the address/size travels in Runtime).
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
+    dev_orch_so_addr_ = dev_addr;
+    dev_orch_so_size_ = size;
+}
+
+uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
+
+uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
+
+void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
+    active_callable_id_ = callable_id;
+    register_new_callable_id_ = is_new;
+}
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
+
+void Runtime::set_device_orch_func_name(const char *name) {
+    if (name == nullptr) {
+        device_orch_func_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+    device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+}
+
+const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; }
+
+void Runtime::set_device_orch_config_name(const char *name) {
+    if (name == nullptr) {
+        device_orch_config_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+    device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+}
+
+const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; }
+
+uint64_t Runtime::get_function_bin_addr(int func_id) const {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+    return func_id_to_addr_[func_id];
+}
+
+void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    if (addr != 0 && func_id_to_addr_[func_id] == 0) {
+        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
+            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
+        } else {
+            LOG_ERROR(
+                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
+                func_id
+            );
+        }
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
+int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
+
+int Runtime::get_registered_kernel_func_id(int index) const {
+    if (index < 0 || index >= registered_kernel_count_) return -1;
+    return registered_kernel_func_ids_[index];
+}
+
+void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h
new file mode 100644
index 000000000..912839a34
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * TensorCreateInfo — submit-time create-info for runtime-allocated outputs.
+ *
+ * Runtime-only: this header (and the materialization helpers below) are NOT
+ * part of the wire/host-facing Tensor in src/common/task_interface/tensor.h.
+ * It carries the metadata required to materialize a fresh contiguous output:
+ * dtype, ndims, shapes, manual_dep, and an optional initial value fill. Its
+ * 64B layout mirrors Tensor cache line 1 so init_tensor_from_create_info() can
+ * copy the whole line with a single memcpy.
+ */
+
+#pragma once
+
+#include <cstring>
+#include <memory.h>
+#include <stdint.h>
+
+#include "data_type.h"
+#include "tensor.h"
+
+class alignas(64) TensorCreateInfo {
+public:
+    TensorCreateInfo(
+        const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false
+    ) :
+        initial_value(0),
+        has_initial_value(false),
+        __pad2__(0),
+        start_offset(0),  // mirrors Tensor::start_offset; pre-zeroed for create-info outputs
+        version(0),
+        ndims(ndims_in),
+        dtype(dtype_in),
+        manual_dep(manual_dep_in),
+        is_contiguous(true),  // mirrors Tensor::is_contiguous; pre-set for create-info outputs
+        __pad_flags__(0) {
+        // Bound the write below: shapes[] holds MAX_TENSOR_DIMS, and ndims_in
+        // comes from user-submitted output shapes — guard before the loop so an
+        // oversized rank can't overrun the fixed array.
+        always_assert(ndims_in > 0 && ndims_in <= MAX_TENSOR_DIMS);
+        for (uint32_t i = 0; i < ndims_in; i++) {
+            shapes[i] = shapes_in[i];
+        }
+    }
+
+    void copy(const TensorCreateInfo &other) { memcpy(this, &other, sizeof(other)); }
+
+    template <typename T = uint64_t>
+    void set_initial_value(T value) {
+        has_initial_value = true;
+        initial_value = to_u64(value);
+    }
+
+    uint64_t buffer_size_bytes() const {
+        uint64_t total = 1;
+        for (uint32_t i = 0; i < ndims; i++) {
+            total *= shapes[i];
+        }
+        return total * get_element_size(dtype);
+    }
+
+public:
+    // --- Bytes [0, 32): TensorCreateInfo-only fields ---
+    // These occupy the same positions as Tensor::buffer, Tensor::owner_task_id,
+    // and Tensor::start_offset. The runtime overwrites owner metadata after the
+    // memcpy and recomputes start_offset / stride during payload materialization.
+    uint64_t initial_value;
+    bool has_initial_value;
+    uint8_t __pad1__[7];
+    uint64_t __pad2__;      // → Tensor::owner_task_id (overwritten post-memcpy)
+    uint64_t start_offset;  // mirrors Tensor::start_offset; always 0 for create-info outputs
+
+    // --- Bytes [32, 64): Matches Tensor cache line 1 layout ---
+    int32_t version;  // Always 0 for create-info outputs
+    uint32_t ndims;
+    DataType dtype;
+    bool manual_dep;
+    bool is_contiguous;                // Always true for create-info outputs
+    uint8_t __pad_flags__;             // → Tensor::child_memory (always 0 for create-info outputs)
+    uint32_t shapes[MAX_TENSOR_DIMS];  // → Tensor::shapes
+
+    TensorCreateInfo() = default;
+};
+
+// TensorCreateInfo layout must match Tensor cacheline 1 for memcpy optimization
+static_assert(sizeof(TensorCreateInfo) == 64, "TensorCreateInfo must match Tensor cacheline 1 size (64 bytes)");
+static_assert(offsetof(TensorCreateInfo, start_offset) == offsetof(Tensor, start_offset));
+static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version));
+static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims));
+static_assert(offsetof(TensorCreateInfo, dtype) == offsetof(Tensor, dtype));
+static_assert(offsetof(TensorCreateInfo, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(TensorCreateInfo, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(TensorCreateInfo, __pad_flags__) == offsetof(Tensor, child_memory));
+static_assert(offsetof(TensorCreateInfo, shapes) == offsetof(Tensor, shapes));
+
+// ============================================================================
+// Materialization helpers — operate on a Tensor& through its public members.
+// Factored out of Tensor (which now lives in the wire/host-facing common
+// header) so the create-info dependency stays runtime-only.
+// ============================================================================
+
+/// Fill the entire backing buffer of `t` with `initial_value` (doubling memcpy).
+inline void fill_tensor_initial_value(Tensor &t, uint64_t initial_value) {
+    always_assert(reinterpret_cast<char *>(t.buffer.addr) != nullptr);
+    uint64_t elem_size = get_element_size(t.dtype);
+    char *dst = reinterpret_cast<char *>(t.buffer.addr);
+    constexpr uint64_t blk_size = 64;
+    uint64_t blk = (t.buffer.size < blk_size) ? t.buffer.size : blk_size;
+    for (uint64_t b = 0; b < blk; b += elem_size) {
+        memcpy(dst + b, &initial_value, elem_size);
+    }
+    uint64_t filled = blk;
+    while (filled < t.buffer.size) {
+        uint64_t copy_size = ((t.buffer.size - filled) < filled) ? (t.buffer.size - filled) : filled;
+        memcpy(dst + filled, dst, copy_size);
+        filled += copy_size;
+    }
+}
+
+/// Materialize a TensorCreateInfo into `t` (fresh contiguous output).
+/// Single 64B memcpy covers cache line 1; `ci` pre-initialises start_offset (=0)
+/// and is_contiguous (=true) in its line-1 slots so they need no reset here.
+/// Cache line 2 (stride/extent) is computed from `ci.shapes` in a single reverse pass.
+inline void init_tensor_from_create_info(Tensor &t, const TensorCreateInfo &ci, void *addr, uint64_t buffer_size) {
+    always_assert(ci.ndims > 0 && ci.ndims <= MAX_TENSOR_DIMS);
+    memcpy(&t, &ci, 64);
+    t.buffer = {reinterpret_cast<uint64_t>(addr), buffer_size};
+    t.owner_task_id = PTO2TaskId::invalid();  // caller (orchestrator) overwrites with actual task_id
+    uint32_t s = 1;
+    for (int32_t i = static_cast<int32_t>(t.ndims) - 1; i >= 0; --i) {
+        t.strides[i] = s;
+        s *= t.shapes[i];
+    }
+    t.extent_elem_cache = s;
+    if (ci.has_initial_value) {
+        fill_tensor_initial_value(t, ci.initial_value);
+    }
+}
diff --git a/src/a5/platform/sim/aicore/kernel.cpp b/src/a5/platform/sim/aicore/kernel.cpp
index 0ea218067..01dacfc8c 100644
--- a/src/a5/platform/sim/aicore/kernel.cpp
+++ b/src/a5/platform/sim/aicore/kernel.cpp
@@ -17,6 +17,8 @@
  */
 
 #include <cstdint>
+#include <cstdio>
+#include <cstdlib>
 #include <pthread.h>
 
 #include "inner_kernel.h"
@@ -42,16 +44,55 @@ static pthread_key_t g_l2_swimlane_aicore_head_key;
 static pthread_key_t g_aicore_pmu_ring_key;
 static pthread_key_t g_pmu_reg_base_key;
 static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT;
+// True once create_tls_keys() has successfully created ALL keys; gates the
+// unload-time delete so we never pthread_key_delete a stale/uncreated key.
+static bool g_tls_keys_ready = false;
+
+// All pthread keys owned by this DSO, in creation order. destroy_tls_keys()
+// rolls these back at unload so a per-run dlopen/dlclose cycle is net-zero on
+// the process-wide TLS key pool (see destroy_tls_keys()).
+static pthread_key_t *const g_all_keys[] = {
+    &g_reg_base_key,
+    &g_core_id_key,
+    &g_block_idx_key,
+    &g_aicore_profiling_flag_key,
+    &g_l2_swimlane_aicore_head_slot_key,
+    &g_l2_swimlane_aicore_head_key,
+    &g_aicore_pmu_ring_key,
+    &g_pmu_reg_base_key,
+};
+constexpr int kNumTlsKeys = sizeof(g_all_keys) / sizeof(g_all_keys[0]);
 
 static void create_tls_keys() {
-    pthread_key_create(&g_reg_base_key, nullptr);
-    pthread_key_create(&g_core_id_key, nullptr);
-    pthread_key_create(&g_block_idx_key, nullptr);
-    pthread_key_create(&g_aicore_profiling_flag_key, nullptr);
-    pthread_key_create(&g_l2_swimlane_aicore_head_slot_key, nullptr);
-    pthread_key_create(&g_l2_swimlane_aicore_head_key, nullptr);
-    pthread_key_create(&g_aicore_pmu_ring_key, nullptr);
-    pthread_key_create(&g_pmu_reg_base_key, nullptr);
+    for (int i = 0; i < kNumTlsKeys; i++) {
+        if (pthread_key_create(g_all_keys[i], nullptr) != 0) {
+            // The process-wide pthread key pool (PTHREAD_KEYS_MAX, 1024) is
+            // exhausted. Roll back what we created and fail loudly: silently
+            // leaving a key at 0 makes sim_get_reg_base() return NULL and
+            // crashes write_reg() on a NULL register base (hard-to-debug
+            // SIGSEGV). With destroy_tls_keys() reclaiming keys on unload this
+            // path should never be hit.
+            for (int j = 0; j < i; j++) pthread_key_delete(*g_all_keys[j]);
+            fprintf(stderr, "[aicore_sim] FATAL: pthread_key_create failed at key %d/%d — TLS key pool exhausted\n", i,
+                    kNumTlsKeys);
+            abort();
+        }
+    }
+    g_tls_keys_ready = true;
+}
+
+// Release this DSO's pthread TLS keys when it is unloaded (dlclose). The AICore
+// kernel .so is dlopen/dlclose'd once per run (device_runner.cpp reloads it
+// because the kernel binary can vary per case), and glibc does NOT reclaim a
+// DSO's pthread keys on unload. Without this, every run leaked these keys and
+// after ~PTHREAD_KEYS_MAX/kNumTlsKeys runs pthread_key_create() began failing
+// (EAGAIN), leaving the keys at 0 → sim_get_reg_base() == NULL → write_reg()
+// NULL-deref SIGSEGV mid-sweep. All AICore worker threads are joined before the
+// DSO is dlclose'd, so deleting the keys here is race-free.
+__attribute__((destructor)) static void destroy_tls_keys() {
+    if (!g_tls_keys_ready) return;
+    for (int i = 0; i < kNumTlsKeys; i++) pthread_key_delete(*g_all_keys[i]);
+    g_tls_keys_ready = false;
 }
 
 volatile uint8_t *sim_get_reg_base() { return static_cast<volatile uint8_t *>(pthread_getspecific(g_reg_base_key)); }
diff --git a/src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp b/src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp
new file mode 100644
index 000000000..21e79b3ed
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include "aicore/aicore.h"
+#include "aicore/aicore_profiling_state.h"
+#include "aicore/l2_swimlane_collector_aicore.h"
+#include "aicore/pmu_collector_aicore.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/platform_config.h"  // Register-based communication
+#include "common/pmu_profiling.h"
+#include "pto2_dispatch_payload.h"
+#include "runtime.h"
+
+/**
+ * Unified function pointer type for kernel dispatch
+ *
+ * All kernels follow the same signature: void kernel(__gm__ int64_t* args)
+ * This enables simple, switch-free dispatch.
+ */
+typedef void (*UnifiedKernelFunc)(__gm__ int64_t *);
+
+/**
+ * Execute task from PTO2DispatchPayload.
+ *
+ * Reads function_bin_addr and args from the dispatch payload.
+ *
+ * @param payload Pointer to PTO2DispatchPayload in global memory
+ */
+__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2DispatchPayload *payload) {
+    if (payload == nullptr || payload->function_bin_addr == 0) {
+        return;
+    }
+
+    UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr;
+    kernel(reinterpret_cast<__gm__ int64_t *>(payload->args));
+    OUT_OF_ORDER_STORE_BARRIER();
+}
+
+/**
+ * AICore main execution loop
+ *
+ * Implements the AICPU-AICore register-based dispatch protocol:
+ * 1. Wait for AICPU ready signal via handshake buffer
+ * 2. Report physical core ID and core type, signal AICore ready
+ * 3. Cache per-core PTO2DispatchPayload pointer from hank->task
+ * 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal
+ *
+ * AICPU writes &s_payload_per_core[i] to hank->task before setting
+ * aicpu_ready=1. AICore caches this pointer and reads function_bin_addr +
+ * args pointer from it on each dispatch. reg_val is a monotonically
+ * increasing task ID used only for dispatch signaling and ACK/FIN protocol.
+ *
+ * @param runtime Pointer to Runtime in global memory
+ * @param s_block_idx Block index (core ID)
+ * @param core_type Core type (AIC or AIV)
+ */
+__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int s_block_idx, CoreType core_type) {
+    __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[s_block_idx]);
+
+    // Phase 1: Wait for AICPU initialization signal
+    while (my_hank->aicpu_ready == 0) {
+        dcci(my_hank, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
+    }
+
+    // Phase 2: Report physical core ID, signal ready
+    my_hank->physical_core_id = get_physical_core_id();
+    OUT_OF_ORDER_STORE_BARRIER();
+    my_hank->aicore_regs_ready = 1;
+    dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT);
+    while (my_hank->aicpu_regs_ready == 0) {
+        dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
+    }
+    // Report initial idle status via register
+    write_reg(RegId::COND, AICORE_IDLE_VALUE);
+
+    // Phase 3: Report core type, signal ready
+    my_hank->core_type = core_type;
+    OUT_OF_ORDER_STORE_BARRIER();
+    my_hank->aicore_done = s_block_idx + 1;  // Signal ready (use s_block_idx + 1 to avoid 0)
+
+    dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);
+
+    // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready)
+    __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task);
+
+    // Cache profiling state once after Phase 3. The L2 / PMU rings and the
+    // PMU MMIO base are all stable for the entire run (host-resolved at
+    // AICore kernel entry from KernelArgs::regs[physical_core_id]), so
+    // they are safe to cache here.
+    uint32_t profiling_flag = get_aicore_profiling_flag();
+    bool l2_swimlane_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
+    bool dump_tensor_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
+    bool pmu_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_PMU);
+    // Per-core L2SwimlaneActiveHead channel — lazy-resolved on first task; the
+    // table slot AICPU populates inside `l2_swimlane_aicpu_init` runs
+    // concurrently with kernel entry, so we cannot deref at startup. The
+    // first dispatch is proof AICPU init is done.
+    __gm__ L2SwimlaneActiveHead *l2_swimlane_head = nullptr;
+    L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, UINT32_MAX, 0};
+    __gm__ PmuAicoreRing *pmu_ring = pmu_enabled ? get_aicore_pmu_ring() : nullptr;
+    uint64_t pmu_reg_base = pmu_enabled ? get_aicore_pmu_reg_base() : 0;
+
+    // Phase 4: Main execution loop - poll register for tasks until exit signal
+    // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit
+    uint32_t reg_val = AICPU_IDLE_TASK_ID;
+    uint32_t last_reg_val = AICPU_IDLE_TASK_ID;
+
+    while (true) {
+        reg_val = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
+        if (reg_val == AICORE_EXIT_SIGNAL) {
+            // Signal exit acknowledgment to AICPU
+            write_reg(RegId::COND, AICORE_EXITED_VALUE);
+            break;
+        }
+
+        // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task)
+        if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) {
+            SPIN_WAIT_HINT();
+            continue;
+        }
+
+        {
+            // receive_time is captured the instant DATA_MAIN_BASE returned a
+            // new task_id, BEFORE the per-task dcci + ack pair. Paired with
+            // start_time (captured after dcci + ack) it lets DFX split head_OH
+            // into the AICPU→AICore NoC propagation (dispatch_ts → receive_time,
+            // hardware-bound) and the AICore-local dcci+ack cost
+            // (receive_time → start_time, software-tunable). Stored in the
+            // record as a 32-bit delta `start_time - receive_time`.
+            uint64_t receive_time = get_sys_cnt_aicore();
+
+            uint32_t task_id = reg_val;  // Decode: register holds task_id directly
+
+            // First-task lazy resolve of the rotation channel.
+            if (l2_swimlane_enabled && l2_swimlane_head == nullptr) {
+                l2_swimlane_head = get_l2_swimlane_aicore_head();
+            }
+
+            // Select dual-buffer slot: same bit as AICPU used when writing payload
+            __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);
+
+            // Invalidate payload buffer (AICPU updates its content each dispatch)
+            dcci(exec_payload, ENTIRE_DATA_CACHE);
+
+            write_reg(RegId::COND, MAKE_ACK_VALUE(task_id));
+
+            // Performance profiling: record start time
+            uint64_t start_time = get_sys_cnt_aicore();
+
+            if (pmu_enabled) {
+                pmu_aicore_begin();
+            }
+
+            // Execute the task
+            execute_task(exec_payload);
+
+            if (pmu_enabled) {
+                pmu_aicore_end();
+                pmu_aicore_record_task(pmu_ring, pmu_reg_base, task_id);
+            }
+
+            if (dump_tensor_enabled) {
+                pipe_barrier(PIPE_ALL);
+            }
+
+            // Performance profiling: record task execution. task_token_raw is
+            // the PTO2 identity (already in AICore cache from the dispatch
+            // payload); reg_task_id is the per-core dispatch token AICore just
+            // read. Host uses reg_task_id as join key vs the AICPU stream.
+            if (l2_swimlane_enabled) {
+                uint64_t end_time = get_sys_cnt_aicore();
+                uint64_t task_token_raw = exec_payload->local_context.async_ctx.task_token.raw;
+                l2_swimlane_aicore_record_task(
+                    l2_swimlane_head, &l2_swimlane_local, task_token_raw, task_id, receive_time, start_time, end_time
+                );
+            }
+
+            last_reg_val = reg_val;
+            write_reg(RegId::COND, MAKE_FIN_VALUE(task_id));
+        }
+    }
+
+    // Flush all dirty cache lines to HBM before kernel exit.
+    dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp b/src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp
new file mode 100644
index 000000000..313e3a36e
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp
@@ -0,0 +1,848 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include <dlfcn.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#ifdef __linux__
+#include <sys/mman.h>
+#endif
+
+#include "aicpu/device_time.h"
+#include "aicpu/orch_so_file.h"
+#include "aicpu/platform_aicpu_affinity.h"
+#include "callable_protocol.h"
+#include "pto2_dispatch_payload.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE)
+#include "pto_runtime2.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// Performance profiling headers
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/scope_stats_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/unified_log.h"
+
+// Register-based communication
+#include "aicpu/platform_regs.h"
+#include "common/platform_config.h"
+
+// Core type definitions
+#include "common/core_type.h"
+
+// CoreCallable for resolved dispatch address
+#include "callable.h"
+
+// Scheduler data structures (CoreExecState, CoreTracker, etc.)
+#include "scheduler/scheduler_types.h"
+
+// Scheduler context class
+#include "scheduler/scheduler_context.h"
+
+// Device orchestration function signature (loaded via dlopen).
+// The executor binds the current thread's PTO2Runtime into orchestration TLS
+// before calling the user entry.
+typedef void (*DeviceOrchestrationFunc)(const L2TaskArgs &orch_args);
+typedef void (*DeviceOrchestrationBindRuntimeFunc)(PTO2Runtime *rt);
+
+// Config function exported by orchestration .so
+typedef PTO2OrchestrationConfig (*DeviceOrchestrationConfigFunc)(const L2TaskArgs &orch_args);
+
+// From orchestration/common.cpp linked into this DSO — updates g_current_runtime here (distinct from
+// framework_bind_runtime in the dlopen'd libdevice_orch_*.so).
+extern "C" void framework_bind_runtime(PTO2Runtime *rt);
+
+constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
+constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";
+
+static int32_t read_runtime_status(Runtime *runtime) {
+    if (runtime == nullptr) {
+        return 0;
+    }
+
+    void *sm = runtime->get_gm_sm_ptr();
+    if (sm == nullptr) {
+        return 0;
+    }
+
+    auto *header = static_cast<PTO2SharedMemoryHeader *>(sm);
+    int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire);
+    int32_t sched_error_code = header->sched_error_code.load(std::memory_order_acquire);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);
+}
+
+static PTO2Runtime *rt{nullptr};
+
+// Per-callable_id orchestration SO table. The executor dispatches
+// `orch_so_table_[active_callable_id_]` (created on first sighting of
+// that callable_id, kept warm across runs).
+// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
+// (mailbox uint32 callable_id, register() returns small ints) and is shared
+// with the host bounds check in DeviceRunner::register_callable —
+// see src/common/task_interface/callable_protocol.h.
+
+struct OrchSoEntry {
+    bool in_use{false};
+    void *handle{nullptr};
+    char path[256]{};
+    DeviceOrchestrationFunc func{nullptr};
+    DeviceOrchestrationBindRuntimeFunc bind{nullptr};
+    DeviceOrchestrationConfigFunc config_func{nullptr};
+};
+
+struct AicpuExecutor {
+    int32_t sched_thread_num_;
+    bool orch_to_sched_{false};
+
+    // ===== Thread management state =====
+    std::atomic<int32_t> thread_idx_{0};
+    std::atomic<bool> initialized_{false};
+    std::atomic<bool> init_done_{false};
+    std::atomic<bool> init_failed_{false};
+    std::atomic<bool> finished_{false};
+
+    int32_t aicpu_thread_num_{0};
+
+    // ===== Task queue state (managed by scheduler ready queues) =====
+
+    std::atomic<int32_t> finished_count_{0};
+    std::atomic<bool> runtime_init_ready_{false};
+
+    // Per-Worker arena attaching to the pooled prebuilt runtime image. Host
+    // populates the layout + data on its own arena, rtMemcpys into a pooled
+    // device buffer owned by DeviceRunner, and the AICPU attach()es to that
+    // buffer on each boot — no AICPU-side commit, no per-boot rtMalloc.
+    // Default-constructed: libc-backed backend, no ctx.
+    DeviceArena runtime_arena_;
+
+    // Entry-arg L2TaskArgs built (via create_from_chip_args) from get_orch_args()
+    // before scheduler init; consumed by the (*p_func)(orch_args_cached_) below.
+    L2TaskArgs orch_args_cached_;
+
+    // Per-callable_id table. Single orch thread today, so first-write/read
+    // race is not possible; if multiple orch threads are ever introduced,
+    // guard the in_use=false→true transition with a mutex.
+    OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
+
+    // ===== Scheduler context (owns all dispatch/completion/drain state) =====
+    SchedulerContext sched_ctx_;
+
+    // ===== Methods =====
+    int32_t init(Runtime *runtime);
+    int32_t run(Runtime *runtime);
+    void deinit(Runtime *runtime);
+
+    ~AicpuExecutor() {
+        // Process-wide teardown (the single static instance dies here). Every
+        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
+        // alive across runs for cache-hit reuse.
+        for (auto &e : orch_so_table_) {
+            if (!e.in_use) continue;
+            if (e.handle != nullptr) dlclose(e.handle);
+            if (e.path[0] != '\0') unlink(e.path);
+            e = OrchSoEntry{};
+        }
+    }
+};
+
+static AicpuExecutor g_aicpu_executor;
+
+// ===== AicpuExecutor Method Implementations =====
+
+int32_t AicpuExecutor::init(Runtime *runtime) {
+    bool expected = false;
+    if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) {
+        return 0;
+    }
+
+    LOG_INFO_V0("AicpuExecutor: Initializing");
+
+    if (runtime == nullptr) {
+        LOG_ERROR("runtime is nullptr");
+        init_failed_.store(true, std::memory_order_release);
+        return -1;
+    }
+
+    // Read execution parameters from runtime. The 0 → 1 fixup runs before the
+    // sched_thread_num_ derivation so a zero input doesn't leave the scheduler
+    // count at -1.
+    aicpu_thread_num_ = runtime->aicpu_thread_num;
+    if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
+    sched_thread_num_ = aicpu_thread_num_ - 1;
+    orch_to_sched_ = runtime->orch_to_sched;
+
+    if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
+        LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_);
+        init_failed_.store(true, std::memory_order_release);
+        return -1;
+    }
+
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+        init_failed_.store(true, std::memory_order_release);
+        return -1;
+    }
+
+    finished_count_.store(0, std::memory_order_release);
+
+    init_done_.store(true, std::memory_order_release);
+    LOG_INFO_V0("AicpuExecutor: Init complete");
+    return 0;
+}
+
+/**
+ * Shutdown AICore - Send exit signal via registers to all AICore kernels
+ */
+int32_t AicpuExecutor::run(Runtime *runtime) {
+    // Prefer the filter gate's deterministic exec_idx so role assignment
+    // (sched 0..N-2 / orch N-1) is driven by host-computed ALLOWED_CPUS,
+    // not arrival order. Fall back to the legacy fetch-add counter on
+    // platforms where the filter gate is inactive (sim sets exec_idx via
+    // its own stub; the fallback covers any path that bypassed the gate).
+    int32_t affinity_exec_idx = platform_aicpu_affinity_thread_idx();
+    int32_t thread_idx = (affinity_exec_idx >= 0) ? affinity_exec_idx : (thread_idx_++);
+    int32_t run_rc = 0;
+    LOG_INFO_V0("Thread %d: Start (exec_idx=%d)", thread_idx, affinity_exec_idx);
+
+    // Orchestrator check
+    if (thread_idx >= sched_thread_num_) {
+#if PTO2_PROFILING
+        uint64_t orch_cycle_start = 0;
+        int32_t submitted_tasks = -1;
+#endif
+        // Orchestrator thread: load + run the device orchestration SO. The braces
+        // scope the per-callable dlopen / SO-table locals to this block.
+        {
+            // Per-callable_id dispatch: the orch SO state lives in
+            // `orch_so_table_[callable_id]` keyed by registration order;
+            // reload is governed by `register_new_callable_id_`.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+                LOG_ERROR(
+                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+                );
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            void **p_handle = &orch_so_table_[callable_id].handle;
+            char *p_path = orch_so_table_[callable_id].path;
+            DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
+            DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
+            DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
+            const bool reload_so = runtime->register_new_callable_id();
+
+            if (reload_so) {
+                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+                if (*p_handle != nullptr) {
+                    dlclose(*p_handle);
+                    *p_handle = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    if (p_path[0] != '\0') {
+                        // Unlink the old file so the new open() lands on a
+                        // fresh inode — protects against SIGBUS / ETXTBSY when
+                        // the kernel still has the old mapping pinned.
+                        unlink(p_path);
+                        p_path[0] = '\0';
+                    }
+                }
+
+                const void *so_data = reinterpret_cast<const void *>(runtime->get_dev_orch_so_addr());
+                size_t so_size = runtime->get_dev_orch_so_size();
+
+                if (so_data == nullptr || so_size == 0) {
+                    LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+
+                // Try multiple paths that may allow execution on AICPU
+                char so_path[256];
+                bool file_created = false;
+                const char *candidate_dirs[] = {
+                    "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"
+                };
+                const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
+
+                for (int32_t i = 0; i < num_candidates && !file_created; i++) {
+                    int32_t fd = create_orch_so_file(
+                        candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path)
+                    );
+                    if (fd < 0) {
+                        LOG_INFO_V0(
+                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
+                        );
+                        continue;
+                    }
+                    ssize_t written = write(fd, so_data, so_size);
+                    close(fd);
+                    if (written != static_cast<ssize_t>(so_size)) {
+                        LOG_INFO_V0(
+                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno
+                        );
+                        unlink(so_path);
+                        continue;
+                    }
+                    file_created = true;
+                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size);
+                }
+
+                if (!file_created) {
+                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+
+                dlerror();
+                void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
+                const char *dlopen_err = dlerror();
+                if (handle == nullptr) {
+                    LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown");
+                    unlink(so_path);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
+
+                // Unlink the on-disk SO immediately: dlopen has already mmap'd
+                // the image, so the kernel keeps the inode alive until the
+                // matching dlclose / process exit. This prevents stale
+                // libdevice_orch_<pid>_<cid>.so files from accumulating in
+                // /tmp when child processes exit via os._exit(0), which skips
+                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+                unlink(so_path);
+
+                const char *entry_symbol = runtime->get_device_orch_func_name();
+                if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
+                    entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
+                }
+                const char *config_symbol = runtime->get_device_orch_config_name();
+                if (config_symbol == nullptr || config_symbol[0] == '\0') {
+                    config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
+                }
+
+                dlerror();
+                DeviceOrchestrationFunc orch_func =
+                    reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+                const char *entry_dlsym_error = dlerror();
+                if (entry_dlsym_error != nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
+                    );
+                    dlclose(handle);
+                    unlink(so_path);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+                if (orch_func == nullptr) {
+                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
+                    dlclose(handle);
+                    unlink(so_path);
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+
+                dlerror();
+                auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
+                const char *config_dlsym_error = dlerror();
+                if (config_dlsym_error != nullptr || config_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
+                        config_dlsym_error ? config_dlsym_error : "NULL function pointer"
+                    );
+                    config_func = nullptr;
+                }
+
+                dlerror();
+                auto bind_runtime_func =
+                    reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
+                const char *bind_runtime_error = dlerror();
+                if (bind_runtime_error != nullptr) {
+                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error);
+                    bind_runtime_func = nullptr;
+                }
+
+                *p_handle = handle;
+                *p_func = orch_func;
+                *p_bind = bind_runtime_func;
+                *p_config_func = config_func;
+                snprintf(p_path, 256, "%s", so_path);
+                orch_so_table_[callable_id].in_use = true;
+            } else {
+                LOG_INFO_V0(
+                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
+                );
+                if (*p_handle == nullptr || *p_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
+                        callable_id
+                    );
+                    // Unblock scheduler threads before returning so they don't spin forever.
+                    runtime_init_ready_.store(true, std::memory_order_release);
+                    return -1;
+                }
+            }
+
+            // Build the entry-arg once per run; both the config call below and
+            // the orchestration entry (consumed at orch_args_cached_) use it.
+            orch_args_cached_.create_from_chip_args(runtime->get_orch_args());
+
+            // Validate arg count on every run (reload or cache hit).
+            if (*p_config_func != nullptr) {
+                PTO2OrchestrationConfig cfg = (*p_config_func)(orch_args_cached_);
+                LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
+                if (cfg.expected_arg_count > 0) {
+                    const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
+                    int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
+                    if (actual_arg_count < cfg.expected_arg_count) {
+                        LOG_ERROR(
+                            "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count,
+                            cfg.expected_arg_count
+                        );
+                        // Clean up cached state so a subsequent run does a full reload.
+                        if (*p_handle != nullptr) {
+                            dlclose(*p_handle);
+                            *p_handle = nullptr;
+                        }
+                        if (p_path[0] != '\0') {
+                            unlink(p_path);
+                            p_path[0] = '\0';
+                        }
+                        *p_func = nullptr;
+                        *p_bind = nullptr;
+                        *p_config_func = nullptr;
+                        orch_so_table_[callable_id].in_use = false;
+                        // Unblock scheduler threads before returning so they don't spin forever.
+                        runtime_init_ready_.store(true, std::memory_order_release);
+                        return -1;
+                    }
+                }
+            } else {
+                LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx);
+            }
+
+            // sm_handle / rt are bound to *this* run's memory and must be
+            // (re)created every run, regardless of whether the SO itself was
+            // reused above.
+            const ChipStorageTaskArgs &args = runtime->get_orch_args();
+            int32_t arg_count = args.tensor_count() + args.scalar_count();
+            LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count);
+            for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
+                const Tensor &t = args.tensor(i);
+                LOG_INFO_V0(
+                    "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i,
+                    static_cast<uint64_t>(t.buffer.addr), t.ndims, static_cast<unsigned>(t.dtype)
+                );
+            }
+            for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
+                LOG_INFO_V0(
+                    "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i,
+                    static_cast<uint64_t>(args.scalar(i))
+                );
+            }
+
+            void *sm_ptr = runtime->get_gm_sm_ptr();
+
+            // Prebuilt-arena fast path. Host has pre-populated the entire
+            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
+            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
+            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
+            // wire arena-internal pointers to their device addresses, reset
+            // the SM, and finalize the few device-only fields the host could
+            // not know at image-build time.
+            void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+            size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+            if (prebuilt_arena == nullptr) {
+                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+            rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+
+            // Wire every arena-internal pointer field (host wrote host-mirror
+            // addresses; we overwrite them with device addresses).
+            runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+            uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes);
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+                LOG_INFO_V0(
+                    "Thread %d: Ring %d sizes: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%d", thread_idx, r,
+                    rt->prebuilt_layout.task_window_sizes[r], rt->prebuilt_layout.heap_sizes[r],
+                    rt->prebuilt_layout.dep_pool_capacities[r]
+                );
+            }
+
+            // Reset SM state. setup_pointers + init_header_per_ring restore
+            // ring flow-control counters, layout metadata, error flags, and
+            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
+            // fanin_count/active_mask zero — previously done inside
+            // RingSchedState::init).
+            memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+            if (!rt->sm_handle->init_per_ring(
+                    sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes, rt->prebuilt_layout.heap_sizes
+                )) {
+                LOG_ERROR("Thread %d: sm_handle->init_per_ring failed", thread_idx);
+                rt = nullptr;
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+
+            // AICore completion mailbox lives in the arena; reset it each
+            // boot so stale completion notifications from a previous run do
+            // not leak.
+            memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+
+            // Fill ops / core counts (host can't resolve s_runtime_ops's
+            // device address nor know the SchedulerContext's core fan-out).
+            runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
+
+#if PTO2_PROFILING
+            rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level();
+            {
+                auto &orch = rt->orchestrator;
+                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+                    auto &alloc = orch.rings[r].task_allocator;
+                    scope_stats_set_ring_capacity(
+                        r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacities[r]
+                    );
+                }
+                scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity());
+            }
+#endif
+
+            // With multi-ring, slot_states are per-ring inside the scheduler.
+            runtime->set_slot_states_ptr(nullptr);
+
+            // Wire scheduler context to the newly created PTO2Runtime before
+            // releasing scheduler threads from runtime_init_ready_.
+            sched_ctx_.bind_runtime(rt);
+
+            runtime_init_ready_.store(true, std::memory_order_release);
+
+            // Wait for scheduler's one-time init to complete
+            sched_ctx_.wait_init_complete();
+
+#if PTO2_PROFILING
+            if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) {
+                l2_swimlane_aicpu_set_orch_thread_idx(thread_idx);
+            }
+            // scope_stats streams scope_end records off the orchestrator thread:
+            // record the per-thread ready_queue index. No-op (writer shared
+            // state null) when scope_stats is disabled; the current buffer is
+            // popped lazily on the first scope_end append.
+            scope_stats_aicpu_set_orch_thread_idx(thread_idx);
+#endif
+
+            // dep_gen plugs into the orchestrator thread (single-instance subsystem):
+            // set the per-thread queue index and pop the initial buffer before any
+            // submit_task can fire inside orch_func_.
+            if (is_dep_gen_enabled()) {
+                dep_gen_aicpu_set_orch_thread_idx(thread_idx);
+                dep_gen_aicpu_init();
+            }
+
+#if PTO2_PROFILING
+            orch_cycle_start = get_sys_cnt_aicpu();
+#endif
+            framework_bind_runtime(rt);
+            if (*p_bind != nullptr) {
+                (*p_bind)(rt);
+            }
+            rt_scope_begin(rt);
+            (*p_func)(orch_args_cached_);
+            rt_scope_end(rt);
+
+            // Flush the (potentially partially-filled) DepGenBuffer so the host
+            // collector can pick it up before this orchestrator thread joins.
+            if (is_dep_gen_enabled()) {
+                dep_gen_aicpu_flush();
+            }
+#if PTO2_PROFILING
+            // Push the partially-filled scope_stats buffer so the host gets the
+            // final scope_end records. Idempotent / no-op when disabled.
+            scope_stats_aicpu_flush_buffers();
+#endif
+#if PTO2_PROFILING
+            uint64_t orch_cycle_end = get_sys_cnt_aicpu();
+            (void)orch_cycle_end;
+#endif
+
+            // Print orchestrator profiling data
+#if PTO2_ORCH_PROFILING
+            PTO2OrchProfilingData p = orchestrator_get_profiling();
+            uint64_t total =
+                p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
+            if (total == 0) total = 1;  // avoid div-by-zero
+            LOG_INFO_V9(
+                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx,
+                static_cast<int64_t>(p.submit_count), cycles_to_us(total)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
+                thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
+                cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
+                static_cast<uint64_t>(p.alloc_atomic_count)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle),
+                p.sync_cycle * 100.0 / total
+            );
+            LOG_INFO_V9(
+                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle),
+                p.lookup_cycle * 100.0 / total
+            );
+            LOG_INFO_V9(
+                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle),
+                p.insert_cycle * 100.0 / total
+            );
+            LOG_INFO_V9(
+                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+                cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", thread_idx,
+                cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
+                cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   avg/task       : %.3fus", thread_idx,
+                p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
+            );
+
+#if PTO2_TENSORMAP_PROFILING
+            PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
+            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx);
+            LOG_INFO_V9(
+                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx,
+                static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
+            );
+            LOG_INFO_V9(
+                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx,
+                static_cast<uint64_t>(tp.lookup_chain_total),
+                tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
+                tp.lookup_chain_max
+            );
+            LOG_INFO_V9(
+                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx,
+                static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
+                tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
+            );
+#endif
+#endif  // PTO2_ORCH_PROFILING
+
+            // Latch task count from PTO2 shared memory to hand off to the
+            // scheduler. The orchestrator's run window (start_time / end_time /
+            // submit_count) is no longer published to shared memory — the
+            // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
+            // below carries the same envelope info for debugging, and
+            // host-side swimlane derives per-phase timing from the per-event
+            // L2SwimlaneAicpuPhaseRecord[] stream that already covers everything inside
+            // submit_task().
+            int32_t total_tasks = 0;
+            if (rt->orchestrator.sm_header) {
+                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+                    total_tasks +=
+                        rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+                }
+            }
+
+#if PTO2_PROFILING
+            submitted_tasks = total_tasks;
+#endif
+
+            // Signal completion to the orchestrator state machine
+            rt_orchestration_done(rt);
+
+            sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks);
+        }
+#if PTO2_PROFILING
+        uint64_t orch_end_ts = get_sys_cnt_aicpu();
+        LOG_INFO_V9(
+            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx,
+            static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
+            cycles_to_us(orch_end_ts - orch_cycle_start)
+        );
+        if (submitted_tasks >= 0) {
+            LOG_INFO_V9(
+                "PTO2 total submitted tasks = %d, already executed %d tasks", submitted_tasks,
+                sched_ctx_.completed_tasks_count()
+            );
+        }
+#endif
+        LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
+    }
+
+    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
+    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+        // Device orchestration: wait for the primary orchestrator to initialize the SM header
+        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
+        }
+        if (rt == nullptr) {
+            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
+        } else {
+            sched_ctx_.bind_runtime(rt);
+            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx);
+            if (completed < 0) {
+                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed);
+                run_rc = completed;
+            } else {
+                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
+            }
+        }
+    }
+
+    // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
+    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
+    // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
+    int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
+    if (shutdown_rc != 0 && run_rc == 0) {
+        run_rc = shutdown_rc;
+    }
+
+    LOG_INFO_V0("Thread %d: Completed", thread_idx);
+
+    // Check if this is the last thread to finish
+    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
+    if (prev_finished + 1 == aicpu_thread_num_) {
+        finished_.store(true, std::memory_order_release);
+        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
+        // always tear them down here, but we keep the per-cid orch SO entries
+        // alive for the next run's cache-hit reuse (see run() reload_so branch).
+        if (rt != nullptr) {
+            // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            framework_bind_runtime(nullptr);
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+                DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
+                if (bind != nullptr) {
+                    bind(nullptr);
+                }
+            }
+            runtime_destroy(rt, runtime_arena_);
+            rt = nullptr;
+        }
+    }
+
+    return run_rc;
+}
+
+void AicpuExecutor::deinit(Runtime *runtime) {
+    // 1. Invalidate AICPU cache for Runtime address range.
+    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
+    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
+    cache_invalidate_range(runtime, sizeof(Runtime));
+
+    // Reset all SchedulerContext-owned state in one place.
+    sched_ctx_.deinit();
+
+    finished_count_.store(0, std::memory_order_release);
+    runtime_init_ready_.store(false, std::memory_order_release);
+
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+
+    orch_args_cached_.reset();
+    // orch_so_table_ entries are intentionally preserved across deinit: the
+    // next run reuses cached handles when register_new_callable_id() returns
+    // false. The destructor releases them at process teardown.
+
+    // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
+    rt = nullptr;
+
+    // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
+    dep_gen_aicpu_finalize();
+
+    LOG_INFO_V0("DeInit: Runtime execution state reset");
+
+    initialized_.store(false, std::memory_order_release);
+    init_done_.store(false, std::memory_order_release);
+    init_failed_.store(false, std::memory_order_release);
+    thread_idx_.store(0, std::memory_order_release);
+    finished_.store(false, std::memory_order_release);
+
+    LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
+}
+
+// ===== Public Entry Point =====
+
+/**
+ * aicpu_execute - Main AICPU kernel execution entry point
+ *
+ * This is called by DynTileFwkBackendKernelServer in kernel.cpp.
+ * Orchestrates the complete task runtime execution:
+ * 1. Initialize executor (thread-safe, first thread only)
+ * 2. Wait for initialization to complete
+ * 3. Execute tasks on managed cores
+ * 4. Cleanup when last thread finishes
+ *
+ * @param runtime Pointer to Runtime structure
+ * @return 0 on success, non-zero on error
+ */
+extern "C" int32_t aicpu_execute(Runtime *runtime) {
+    if (runtime == nullptr) {
+        LOG_ERROR("%s", "Invalid argument: null Runtime pointer");
+        return -1;
+    }
+
+    LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
+
+    g_aicpu_executor.init(runtime);
+
+    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
+        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) {
+            LOG_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution");
+            return -1;
+        }
+    }
+
+    int32_t rc = g_aicpu_executor.run(runtime);
+    if (rc != 0) {
+        LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
+    }
+
+    int32_t runtime_rc = read_runtime_status(runtime);
+
+    // Last thread cleans up
+    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
+        LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
+        g_aicpu_executor.deinit(runtime);
+    }
+
+    if (runtime_rc != 0) {
+        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
+        return runtime_rc;
+    }
+
+    if (rc != 0) {
+        return rc;
+    }
+
+    LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
+    return 0;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/build_config.py b/src/a5/runtime/fully_distributed_within_core/build_config.py
new file mode 100644
index 000000000..da34f14f9
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/build_config.py
@@ -0,0 +1,32 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+# fully_distributed_within_core runtime build configuration
+# All paths are relative to this file's directory (src/runtime/fully_distributed_within_core/)
+#
+# Goal: orchestration + scheduling + execution run on the AI cores themselves in
+# SPMD fashion, removing AICPU from orchestration/scheduling. See the design spec:
+#   docs/fully_distributed_within_core.md
+#
+# This tree is currently re-based on the tensormap_and_ringbuffer runtime so it
+# is discoverable and compiles; it reuses TensorMap, MixedKernels/ActiveMask,
+# L0TaskArgs, the pto_orchestration_api submit API, and kernel-address
+# resolution. The distributed model (claim race + per-core TensorMap + private
+# task ring + global completion-flag ring) is layered on incrementally per the
+# spec; the AICPU is reduced to an init/teardown stub.
+#
+# The "orchestration" directory contains source files compiled into both
+# runtime targets AND the orchestration .so (e.g., tensor methods needed
+# by the Tensor constructor's validation logic).
+
+BUILD_CONFIG = {
+    "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]},
+    "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]},
+    "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]},
+    "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]},
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/common/intrinsic.h b/src/a5/runtime/fully_distributed_within_core/common/intrinsic.h
new file mode 100644
index 000000000..99803483a
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/common/intrinsic.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file intrinsic.h
+ * @brief SPMD execution context for AICore user kernels
+ *
+ * Topology data exposed to user kernels has two distinct lifetimes:
+ *
+ *   1. Global topology (per-core, fixed after runtime init):
+ *      - sub_block_id : identifies the AIV lane within a cluster
+ *        (0 = AIV0/left, 1 = AIV1/right).  Initialized once at runtime
+ *        startup based on each core's cluster position; never changes.
+ *        Only meaningful for AIV kernels in MIX tasks.
+ *
+ *   2. Local per-dispatch context (changes each dispatch):
+ *      - s_block_idx : which logical block the current worker is executing
+ *      - s_block_num : total number of blocks in this task (= block_dim)
+ *      Written by build_payload() before each dispatch.
+ *
+ * Both categories are injected via two pointer slots appended at the tail
+ * of the kernel args[] array:
+ *
+ *   args layout:
+ *     [0 .. tensor_count-1]                 = tensor GM pointers
+ *     [tensor_count .. +scalar_count-1]     = scalar values
+ *     ...
+ *     [SPMD_LOCAL_CONTEXT_INDEX]            = (uint64_t)&LocalContext   (per-dispatch)
+ *     [SPMD_GLOBAL_CONTEXT_INDEX]           = (uint64_t)&GlobalContext  (per-core)
+ *
+ * The suffix positions are compile-time constants and do not depend on the
+ * runtime tensor_count or scalar_count.
+ *
+ * Include this header in AICore kernel source files to use the Get* accessors.
+ * Do NOT depend on the raw index constants; always use the accessor functions.
+ *
+ * On CCEC (real hardware), __gm__ and __aicore__ must be defined before
+ * including this header (e.g. via <pto/pto-inst.hpp> or manual #define).
+ * The #ifndef guards below provide fallbacks for non-kernel builds
+ * (AICPU, HOST) where these qualifiers are not needed.
+ *
+ * IMPORTANT — do NOT mix these with the CCE built-in topology intrinsics
+ * (`get_subblockid()`, `get_block_idx()`, `get_block_num()` declared in
+ * `kernel_operator.h` / tikcfw). Those intrinsics read AICore hardware
+ * registers that simpler's tensormap_and_ringbuffer runtime does NOT
+ * program. Specifically:
+ *
+ *   - CCE `get_subblockid()` returns whatever stale value the AICore
+ *     sub-block register holds — under simpler's MIX dispatch it is 0
+ *     for BOTH AIV0 and AIV1 of every cluster, so a kernel that uses
+ *     it to partition heads will silently have AIV1 redo AIV0's work
+ *     and the AIV1 share of the output is never written. This is the
+ *     exact failure mode that produced the partial-zero output in
+ *     issue #900 (PR #899 spmd_paged_attention_highperf); the kernel
+ *     compiled, ran without error, and produced wrong output. Use
+ *     `get_sub_block_id(args)` instead, which reads from the runtime's
+ *     `GlobalContext.sub_block_id` that the scheduler initializes per
+ *     AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`.
+ *
+ *   - `get_block_idx()` and `get_block_num()` are not redirected to
+ *     simpler's LocalContext either — use the `(args)` variants below
+ *     so the values reflect simpler's logical block_dim (which can
+ *     differ from `RUNTIME_CONFIG.block_dim`, the physical core count).
+ *
+ * If you are porting a kernel originally written for native CANN dispatch
+ * (AscendC, ascend-transformer-boost, etc.), every reference to those
+ * three CCE intrinsics needs to be rewritten against this header. See
+ * `docs/aicore-kernel-programming.md` for the full author contract,
+ * porting checklist, and the worked example from PR #899 / issue #900.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_task_id.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+
+/** Number of extra pointer slots appended to the args[] tail (LocalContext + GlobalContext). */
+static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2;
+
+/**
+ * Args[] suffix indices for context pointers.
+ * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16).
+ * Users should not depend on these values; use the Get* functions below.
+ */
+static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48;
+static constexpr int32_t SPMD_GLOBAL_CONTEXT_INDEX = 49;
+static constexpr int32_t PAYLOAD_LOCAL_CONTEXT_INDEX = SPMD_LOCAL_CONTEXT_INDEX;
+static constexpr int32_t PAYLOAD_GLOBAL_CONTEXT_INDEX = SPMD_GLOBAL_CONTEXT_INDEX;
+
+/**
+ * Per-core global context, stored in PTO2DispatchPayload.
+ * Initialized once at runtime startup (init_global_context) based on each
+ * core's cluster position.  Never modified after initialization.
+ */
+struct GlobalContext {
+    // AIV lane within cluster: 0=AIV0(left), 1=AIV1(right).
+    // Used by AIV to select the correct intra-cluster hw instruction.
+    // Not meaningful for AIC kernels or single-AIV tasks.
+    int32_t sub_block_id;
+};
+
+struct AsyncCtx {
+    volatile __gm__ uint32_t *completion_count;
+    volatile __gm__ int32_t *completion_error_code;
+    volatile __gm__ DeferredCompletionEntry *completion_entries;
+    uint32_t completion_capacity;
+    PTO2TaskId task_token;
+
+    static inline AsyncCtx make(PTO2TaskId task_token, volatile __gm__ DeferredCompletionSlab *buffer) {
+        AsyncCtx ctx{};
+        ctx.task_token = task_token;
+        if (buffer == nullptr) {
+            ctx.task_token = PTO2TaskId::invalid();
+            return ctx;
+        }
+        ctx.completion_count = &buffer->count;
+        ctx.completion_error_code = &buffer->error_code;
+        ctx.completion_entries = &buffer->entries[0];
+        ctx.completion_capacity = MAX_COMPLETIONS_PER_TASK;
+        return ctx;
+    }
+};
+
+/**
+ * Per-dispatch local context, stored in PTO2DispatchPayload.
+ * Written by build_payload() before each dispatch. Different blocks of the
+ * same task receive different s_block_idx values but the same s_block_num.
+ *
+ * NOTE: Fields are prefixed with s_ to avoid collisions with compiler
+ * built-in symbols block_idx / block_num on the a5 AICore target, which
+ * would cause a compile error if the unprefixed names were used.
+ */
+struct LocalContext {
+    int32_t s_block_idx;  // Logical block index within the task [0, s_block_num)
+    int32_t s_block_num;  // How many logical blocks this task requires.
+                          // Currently fixed to 1 (block_dim > 1 not yet implemented).
+                          // NOT the same as RUNTIME_CONFIG.block_dim in kernel_config.py,
+                          // which controls how many physical cores the runtime launches.
+    AsyncCtx async_ctx;
+};
+
+/**
+ * Return the AIV lane index within the cluster.
+ * In a MIX 1C2V task: AIV0(left)=0, AIV1(right)=1.
+ *
+ * This value is only meaningful for AIV kernels in MIX tasks.  It tells
+ * the AIV whether it is the left lane or the right lane within the cluster,
+ * which determines the correct hardware instruction for intra-cluster
+ * communication.
+ *
+ * AIC kernels should NOT call this function.
+ * Single-AIV tasks have no intra-cluster communication, so sub_block_id
+ * has no meaning and should not be used.
+ */
+static __aicore__ inline int32_t get_sub_block_id(__gm__ int64_t *args) {
+    __gm__ GlobalContext *ctx =
+        reinterpret_cast<__gm__ GlobalContext *>(static_cast<uint64_t>(args[SPMD_GLOBAL_CONTEXT_INDEX]));
+    return ctx->sub_block_id;
+}
+
+/**
+ * Return the logical block index assigned to the current worker.
+ * Range: [0, get_block_num(args)).
+ * Within the same task, different blocks receive different indices.
+ */
+static __aicore__ inline int32_t get_block_idx(__gm__ int64_t *args) {
+    __gm__ LocalContext *ctx =
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uint64_t>(args[SPMD_LOCAL_CONTEXT_INDEX]));
+    return ctx->s_block_idx;
+}
+
+/**
+ * Return how many logical blocks the current task requires.
+ * All blocks of the same task see the same value.
+ * Currently always returns 1 (block_dim>1 not yet implemented).
+ *
+ * Note: this is NOT the same as RUNTIME_CONFIG.block_dim in
+ * kernel_config.py, which controls how many physical cores are launched.
+ */
+static __aicore__ inline int32_t get_block_num(__gm__ int64_t *args) {
+    __gm__ LocalContext *ctx =
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uint64_t>(args[SPMD_LOCAL_CONTEXT_INDEX]));
+    return ctx->s_block_num;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h b/src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h
new file mode 100644
index 000000000..e663ef477
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime Status Helpers
+ *
+ * Shared error-code contract used inside the tensormap_and_ringbuffer runtime.
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
+
+#include <stdint.h>
+
+// Orchestrator errors (1-99): detected in orchestrator thread
+#define PTO2_ERROR_NONE 0  // Explicitly means "no error"; it is not an "unknown/unspecified" error code.
+#define PTO2_ERROR_SCOPE_DEADLOCK 1
+#define PTO2_ERROR_HEAP_RING_DEADLOCK 2
+#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3
+#define PTO2_ERROR_DEP_POOL_OVERFLOW 4
+#define PTO2_ERROR_INVALID_ARGS 5         // Arg construction error (invalid args)
+#define PTO2_ERROR_DEPENDENCY_OVERFLOW 6  // Too many unique fanin dependencies for one task
+#define PTO2_ERROR_REQUIRE_SYNC_START_INVALID 7
+#define PTO2_ERROR_TENSOR_WAIT_TIMEOUT 8
+#define PTO2_ERROR_EXPLICIT_ORCH_FATAL 9
+#define PTO2_ERROR_SCOPE_TASKS_OVERFLOW 10  // scope_tasks buffer saturated (all rings full)
+
+// Scheduler errors (100+): detected in scheduler threads
+#define PTO2_ERROR_SCHEDULER_TIMEOUT 100
+#define PTO2_ERROR_ASYNC_COMPLETION_INVALID 101
+#define PTO2_ERROR_ASYNC_WAIT_OVERFLOW 102
+#define PTO2_ERROR_ASYNC_REGISTRATION_FAILED 103
+
+static inline int32_t runtime_status_from_error_codes(int32_t orch_error_code, int32_t sched_error_code) {
+    if (orch_error_code != PTO2_ERROR_NONE) {
+        return orch_error_code < 0 ? orch_error_code : -orch_error_code;
+    }
+    if (sched_error_code != PTO2_ERROR_NONE) {
+        return sched_error_code < 0 ? sched_error_code : -sched_error_code;
+    }
+    return 0;
+}
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md b/src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md
new file mode 100644
index 000000000..db4cda386
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md
@@ -0,0 +1,330 @@
+# Multi-Ring Buffer Architecture
+
+> Extension to the PTO2 runtime. For the base architecture, see [RUNTIME_LOGIC.md](RUNTIME_LOGIC.md).
+
+## 1. Problem
+
+The single-ring design uses one `last_task_alive` watermark shared by HeapRing, TaskRing, and DepPool. When tasks from an inner scope (e.g., per-block iteration) complete, their resources cannot be reclaimed until **all** prior tasks — including those from the outer scope — also complete. This wastes ring capacity and can trigger deadlocks when ring sizes are small.
+
+## 2. Solution
+
+Split HeapRing, TaskRing, and DepPool into arrays of `PTO2_MAX_RING_DEPTH` (4) independent instances. Each scope depth maps to its own ring, with an independent `last_task_alive` watermark.
+
+```text
+Scope depth 0  ──►  rings[0] = { HeapRing, TaskRing, DepPool }
+Scope depth 1  ──►  rings[1] = { HeapRing, TaskRing, DepPool }
+Scope depth 2  ──►  rings[2] = { HeapRing, TaskRing, DepPool }
+Scope depth ≥3 ──►  rings[3] = { HeapRing, TaskRing, DepPool }  (clamped)
+```
+
+Inner-scope tasks can now be reclaimed independently without waiting for outer-scope tasks to complete.
+
+## 3. Task ID Encoding
+
+Task IDs are widened from 32-bit to 64-bit to carry the ring identity:
+
+```text
+task_id.raw = (ring_id << 32) | local_id
+```
+
+`PTO2TaskId` exposes direct accessors in `pto_runtime2_types.h`:
+
+| API | Purpose |
+| --- | ------- |
+| `pto2_make_task_id(ring_id, local_id)` | Compose a 64-bit task ID (`PTO2TaskId`) |
+| `task_id.ring()` | Extract `ring_id` (bits 63-32) |
+| `task_id.local()` | Extract `local_id` (bits 31-0) |
+| `task_id.raw` | Access the packed 64-bit encoding |
+
+Type changes:
+
+| Field | Before | After |
+| ----- | ------ | ----- |
+| `PTO2TaskDescriptor.task_id` | `int32_t` | `PTO2TaskId` |
+| `PTO2TensorMapEntry.producer_task_id` | `int32_t` | `PTO2TaskId` |
+| `PTO2TaskSlotState.ring_id` | N/A | `uint8_t` (new, denormalized for fast access) |
+
+## 4. Data Structures
+
+### 4.1 PTO2RingSet (new)
+
+Bundles the three per-ring resources into a single aggregate (`pto_ring_buffer.h`):
+
+```cpp
+struct PTO2RingSet {
+    PTO2HeapRing   heap_ring;
+    PTO2TaskRing   task_ring;
+    PTO2FaninPool fanin_pool;
+};
+```
+
+### 4.2 PTO2OrchestratorState (modified)
+
+```cpp
+// Before: single ring
+PTO2HeapRing heap_ring;
+PTO2TaskRing task_ring;
+PTO2DepListPool dep_pool;
+
+// After: per-ring array (dep_pool moved to scheduler, see §4.5)
+PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
+```
+
+Ring selection: `current_ring_id() = min(scope_stack_top, PTO2_MAX_RING_DEPTH - 1)`.
+
+### 4.3 PTO2SharedMemoryHeader (modified)
+
+Per-ring flow control and per-ring layout info are grouped together:
+
+```cpp
+struct PTO2RingFlowControl {
+    std::atomic<int32_t> current_task_index;  // task ring head
+    std::atomic<int32_t> last_task_alive;     // task ring tail
+    std::atomic<uint64_t> heap_top;           // heap alloc pointer
+    std::atomic<uint64_t> heap_tail;          // heap reclaim pointer
+};
+
+struct alignas(64) PTO2SharedMemoryRingHeader {
+    PTO2RingFlowControl fc;
+
+    // Layout metadata (set once at init)
+    uint64_t task_window_size;
+    int32_t task_window_mask;       // task_window_size - 1
+    uint64_t heap_size;
+    uint64_t task_descriptors_offset;
+
+    // Per-ring data pointers (host-side, set by PTO2SharedMemoryHandle::setup_pointers)
+    PTO2TaskDescriptor *task_descriptors;
+    PTO2TaskPayload *task_payloads;
+    PTO2TaskSlotState *slot_states;
+
+    // Accessors (slot = local_id & task_window_mask)
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot);
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id);
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot);
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id);
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot);
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id);
+};
+
+// In header:
+PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
+```
+
+Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving watermark writes within the same ring. `FaninPool`/`DepListPool` `reclaim`/`ensure_space` take `PTO2SharedMemoryRingHeader&` directly (no `ring_id` or `fc` parameters).
+
+### 4.4 PTO2SharedMemoryHandle (lifecycle-only)
+
+Slimmed to lifecycle management only. Per-ring data pointers now live in `PTO2SharedMemoryRingHeader` (§4.3). Runtime components (orchestrator, scheduler) store `PTO2SharedMemoryHeader*` directly, eliminating one indirection on every per-ring access.
+
+```cpp
+struct PTO2SharedMemoryHandle {
+    void *sm_base;
+    uint64_t sm_size;
+    PTO2SharedMemoryHeader *header;
+    bool is_owner;
+};
+```
+
+### 4.5 PTO2SchedulerState (modified)
+
+```cpp
+struct RingSchedState {
+    // Cache Line 0: ring pointer (read-only) + hot path (read-write)
+    PTO2SharedMemoryRingHeader *ring;  // direct pointer, no indirection
+    int32_t last_task_alive;
+    std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+    // Cache Line 1+: Thread 0 only (wiring dep_pool, cache-isolated)
+    alignas(64) PTO2DepListPool dep_pool;
+};
+
+RingSchedState ring_sched_states[PTO2_MAX_RING_DEPTH];
+PTO2SpscQueue wiring_queue;  // global SPSC queue: orchestrator pushes, scheduler thread 0 drains
+```
+
+`slot_states`, `task_window_size`, and `task_window_mask` are no longer duplicated — callers access them via `ring->get_slot_state_by_*()` and other ring header accessors. The ring pointer shares cache line 0 with `last_task_alive` and `advance_lock`.
+
+### 4.6 PTO2TensorMap (modified)
+
+```cpp
+PTO2TensorMapEntry** task_entry_heads[PTO2_MAX_RING_DEPTH];
+int64_t last_task_alives[PTO2_MAX_RING_DEPTH];
+```
+
+Entry validity checks and `cleanup_retired` operate per-ring:
+
+```cpp
+bool entry_valid(const PTO2TensorMapEntry& e) {
+    int32_t ring = e.producer_task_id.ring();
+    int32_t local = e.producer_task_id.local();
+    return local >= last_task_alives[ring];
+}
+```
+
+### 4.7 Unchanged Structures
+
+| Structure | Reason |
+| --------- | ------ |
+| `PTO2DepListEntry` | Stores `PTO2TaskSlotState*` pointer — naturally crosses ring boundaries |
+| `PTO2TaskPayload` | `fanin_slot_states[]` are pointers — no ring coupling |
+| `PTO2ReadyQueue` | Global ready queues shared across all rings (tasks ready to dispatch regardless of origin ring) |
+| `PTO2DispatchPayload` | Built per-dispatch, no ring state needed |
+
+## 5. Reclamation
+
+### 5.1 Per-Ring Watermark Advancement
+
+Each ring's `last_task_alive` advances independently:
+
+```text
+advance_ring_pointers(ring_id):  // protected by per-ring advance_lock
+    la = ring->fc.last_task_alive
+    while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED:
+        reset slot for reuse
+        la++
+    sync_to_sm()  // release-store last_task_alive
+```
+
+Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving heap_tail writes within the same ring.
+
+### 5.2 Cross-Ring Dependencies
+
+Dependency edges use `PTO2TaskSlotState*` pointers, which naturally span rings:
+
+- Ring 1 task depends on ring 0 producer → ring 0's `fanout_head` linked list contains a ring 1 `PTO2TaskSlotState*`
+- When ring 0 task completes, it walks its fanout list and decrements ring 1 consumers' `fanin_refcount`
+- No special cross-ring logic needed — pointer-based design is ring-agnostic
+
+### 5.3 DepPool Reclamation
+
+DepPool is exclusively managed by scheduler thread 0 (allocation during wiring, reclamation during watermark advancement):
+
+```text
+// Called by scheduler thread 0 during wiring_queue drain:
+dep_pool_reclaim(ring_id):
+    la = ring->fc.last_task_alive
+    newest_consumed = la - 1
+    mark = ring->get_slot_state_by_task_id(newest_consumed).dep_pool_mark
+    if mark > 0:
+        ring_sched_states[ring_id].dep_pool.advance_tail(mark)
+```
+
+Note: dep entries from ring N's pool may appear in ring M's fanout lists. Reclamation is safe because the entries are accessed during fanout traversal (completion time), which always happens before the consumer task — and therefore the dep entry — becomes eligible for reclamation.
+
+## 6. AICPU Register Protocol Fix
+
+The AICore dispatch protocol uses 32-bit registers. With multi-ring, `task_id` truncation to 32-bit loses the `ring_id`, causing collisions:
+
+```text
+Ring 0, local_id=0  →  DATA_MAIN_BASE = 0 + 1 = 1
+Ring 1, local_id=0  →  DATA_MAIN_BASE = 0 + 1 = 1  (collision!)
+```
+
+AICore uses `last_reg_val` to detect new dispatches — identical values cause skipped tasks and false completions from stale COND registers.
+
+**Fix**: Per-core monotonic dispatch counter `s_dispatch_seq[core_id]` replaces `task_id` in register writes, guaranteeing unique `DATA_MAIN_BASE` values per core regardless of ring origin.
+
+## 7. Configuration
+
+### 7.1 Compile-Time Defaults (per ring)
+
+| Constant | Default | Total (×4 rings) |
+| -------- | ------- | ---------------- |
+| `PTO2_TASK_WINDOW_SIZE` | 16384 | 65536 |
+| `PTO2_HEAP_SIZE` | 256 MB | 1 GB |
+| `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 |
+
+### 7.2 Runtime Overrides
+
+Ring sizing can be configured either uniformly for every ring or independently
+per ring. Precedence is resolved independently for each resource and ring:
+
+```text
+per-ring CallConfig value
+  > scalar CallConfig value
+  > per-ring PTO2_RING_* env value
+  > scalar PTO2_RING_* env value
+  > compile-time default
+```
+
+`ring_id` is the scope-depth ring selected by the runtime:
+
+```text
+scope depth 0 -> ring 0
+scope depth 1 -> ring 1
+scope depth 2 -> ring 2
+scope depth >=3 -> ring 3
+```
+
+Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can
+each carry their own sizes. Invalid values raise at submit time (`validate()`).
+The scalar fields preserve the old behavior and broadcast one value to all
+rings:
+
+```python
+cfg = CallConfig()
+cfg.runtime_env.ring_task_window = 128   # power of 2, >= 4
+cfg.runtime_env.ring_heap = 262144       # bytes/ring, >= 1024
+cfg.runtime_env.ring_dep_pool = 256      # 4 .. INT32_MAX
+orchestrator.submit_next_level(handle, args, cfg)
+```
+
+Set the array fields to tune the four scope-depth rings independently. Each
+array must contain exactly four entries; use `0` for an entry that should fall
+through to the next precedence tier. All `CallConfig` values are integer
+byte/count values.
+
+```python
+cfg = CallConfig()
+cfg.runtime_env.ring_task_windows = [8192, 16384, 131072, 524288]
+cfg.runtime_env.ring_heaps = [
+    128 * 1024 * 1024,
+    256 * 1024 * 1024,
+    384 * 1024 * 1024,
+    512 * 1024 * 1024,
+]
+cfg.runtime_env.ring_dep_pools = [4096, 8192, 16384, 32768]
+orchestrator.submit_next_level(handle, args, cfg)
+```
+
+Scene tests set the same keys under a nested `runtime_env` block in the
+per-case `config` dict:
+
+```python
+"config": {
+    "runtime_env": {
+        "ring_task_windows": [8192, 16384, 131072, 524288],
+        "ring_heaps": [134217728, 268435456, 402653184, 536870912],
+        "ring_dep_pools": [4096, 8192, 16384, 32768],
+    }
+}
+```
+
+Process-wide env fallback accepts either one scalar value or exactly four
+comma-separated per-ring values. Invalid env values are logged and ignored, then
+fall through to defaults. `PTO2_RING_HEAP` values are integer bytes:
+
+```bash
+# Uniform, old behavior:
+PTO2_RING_TASK_WINDOW=1024
+PTO2_RING_HEAP=1048576
+PTO2_RING_DEP_POOL=1024
+
+# Per-ring, indexed by ring_id 0..3:
+PTO2_RING_TASK_WINDOW=8192,16384,131072,524288
+PTO2_RING_HEAP=134217728,268435456,402653184,536870912
+PTO2_RING_DEP_POOL=4096,8192,16384,32768
+```
+
+Use `--enable-scope-stats` to confirm the effective values for a real run. The
+first line of `scope_stats/scope_stats.jsonl` includes `task_window_max`,
+`heap_max`, and `dep_pool_max`, indexed by `ring`.
+
+### 7.3 Sizing Guidelines
+
+- `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes
+- `heap` must accommodate peak output buffer allocation across all in-flight tasks on that ring
+- `dep_pool` must be ≥ total dependency entries for all in-flight tasks on that ring
+- On hardware, back-pressure latency is higher than in simulation — size conservatively
+- Adding inner `PTO2_SCOPE` reduces peak per-ring usage, enabling smaller sizes
diff --git a/src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md b/src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md
new file mode 100644
index 000000000..e6760fb1e
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md
@@ -0,0 +1,39 @@
+# Runtime Logic: fully_distributed_within_core
+
+**Target design.** Orchestration, scheduling, and execution all run on the AI
+cores in SPMD fashion; the AICPU is removed from orchestration/scheduling. The
+authoritative specification is:
+
+- [`docs/fully_distributed_within_core.md`](../../../../docs/fully_distributed_within_core.md)
+
+Core elements (see the spec):
+
+- Task ownership via a claim race over two global cursors (`cube_cursor`,
+  `vector_cursor`); `owner = builder = executor`.
+- Per-core full-duplicate TensorMap for dependency discovery (pull model via a
+  global `task_completed_flag` ring).
+- Per-core private task ring + block-shared `block.won[N]` deposit table for
+  multi-core (MIX / 2V) co-ownership (anchor push + follower async drain).
+- Deterministic, per-core-replicated GM output heap with frontier-based
+  reclamation.
+
+## Current state (re-based on tensormap_and_ringbuffer)
+
+This runtime is re-based on `tensormap_and_ringbuffer` to reuse its
+`PTO2TensorMap`, `MixedKernels`/`ActiveMask`, `L0TaskArgs`, the
+`pto_orchestration_api.h` submit API, and kernel-address resolution. The
+distributed model is layered on incrementally:
+
+- `runtime/` — adds global claim cursors, a global completion-flag ring, a
+  deterministic GM output heap, and per-core replicated TensorMap + private task
+  ring on top of the reused types.
+- `aicore/` — the SPMD run-ahead orchestrate+execute loop (spec section 6).
+- `aicpu/` — reduced to an init/wire/signal/wait stub (no orchestration,
+  scheduling, or dispatch).
+- `host/` — runtime maker / compile info (orchestration entry is invoked on the
+  cores).
+- `orchestration/` — the PTO2 orchestration API (unchanged surface).
+
+The legacy AICPU orchestrator/scheduler sources inherited from
+`tensormap_and_ringbuffer` (`runtime/scheduler/`, the orchestrator pipeline) are
+progressively replaced or bypassed by the distributed path.
diff --git a/src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md b/src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md
new file mode 100644
index 000000000..ef1de83b4
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md
@@ -0,0 +1,137 @@
+# Scalar Data Access — get/set_tensor_data Design
+
+## 1. Overview
+
+During task graph construction, orchestration sometimes needs to read InCore kernel results (for control-flow decisions) or write initial values into tensors. `get_tensor_data` / `set_tensor_data` provide **blocking** cross-layer data access, allowing orchestration to safely read and write tensor data.
+
+**Core design principle**: Reuse the existing TensorMap dependency tracking mechanism — no new synchronization infrastructure.
+
+## 2. API
+
+```cpp
+// Blocking read: returns value at the given indices (default: raw uint64_t bits)
+// Specify T for typed read: float val = get_tensor_data<float>(tensor, 1, idx);
+template<typename T = uint64_t>
+T get_tensor_data(const Tensor& tensor, uint32_t ndims, const uint32_t indices[]);
+
+// Blocking write: stores value at the given indices (type deduced from argument)
+// Typed write: set_tensor_data(tensor, 1, idx, 42.0f);
+template<typename T = uint64_t>
+void set_tensor_data(Tensor& tensor, uint32_t ndims, const uint32_t indices[], T value);
+```
+
+Both call into the runtime through the ops table — orchestration .so needs no runtime symbol linkage.
+
+## 3. Blocking Interface Design
+
+### 3.1 get_tensor_data Flow
+
+```text
+addr null-check → TensorMap lookup → spin-wait producer COMPLETED → compute flat offset → memcpy read
+```
+
+- **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0
+- **TensorMap lookup**: find producer task by `buffer.addr`
+- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED`
+- **No producer** (lookup callback never fires): skip waiting, read immediately
+
+### 3.2 set_tensor_data Flow
+
+```text
+addr null-check → TensorMap lookup → spin-wait producer COMPLETED → spin-wait consumers done → memcpy write
+```
+
+One extra step versus get_tensor_data: wait for all consumers to finish (`fanout_refcount >= fanout_count - 1`, excluding the scope reference).
+
+### 3.3 Timeout
+
+- Uses cycle counter (`get_sys_cnt_aicpu()`), checked every 1024 spins
+- Threshold: `PTO2_TENSOR_DATA_TIMEOUT_CYCLES` (~10 s at 1.5 GHz)
+- On timeout: sets `orch.fatal = true`, preventing further task submission
+
+## 4. add_output with Initial Value
+
+```cpp
+TensorCreateInfo ci(shapes, ndims, dtype);
+ci.set_initial_value(initial_value);
+args.add_output(ci);
+```
+
+**Mechanism**:
+
+1. `ci.set_initial_value(value)` marks the create-info with an initial value before submission
+2. `add_output(ci)` stores a pointer to `ci` in `L0TaskArgs` (the original must remain valid until submit)
+3. During payload init, the output tensor is materialized via `init_from_create_info()` which triggers the fill
+4. Fill strategy:
+   - Small buffer (< 64 B): element-by-element memcpy directly into dst
+   - Large buffer (≥ 64 B): fill the first 64 bytes as a template block, then bulk-memcpy in 64 B chunks; partial tail copy for remainder
+
+**Constraint**: existing tensors are write targets only through `add_inout()`.
+
+## 5. Scalar Dependencies via 1-Element Tensors
+
+Traditional scalars (`L0TaskArgs::add_scalar`) are one-way inputs with no TensorMap tracking. For cross-task scalar values, use a 1-element tensor as the carrier:
+
+```cpp
+uint32_t shapes[1] = {1};
+TensorCreateInfo scalar_ci(shapes, 1, DataType::FLOAT32);
+
+// Submit with initial value and keep the returned tensor
+scalar_ci.set_initial_value(float_to_u64(77.0f));
+L0TaskArgs args;
+args.add_output(scalar_ci);
+TaskOutputTensors outs = rt_submit_aiv_task(FUNC_NOOP, args);
+const Tensor& scalar_tensor = outs.get_ref(0);
+
+// Orchestration-side blocking read (waits for kernel completion)
+uint32_t idx[1] = {0};
+float val = get_tensor_data<float>(scalar_tensor, 1, idx);
+```
+
+**Advantage**: Fully reuses existing TensorMap (producer tracking, fanin/fanout dependencies) — no new infrastructure needed.
+
+## 6. Data Hazard Analysis
+
+Three actors:
+
+- **Kernel**: InCore task submitted via add_input/add_output/add_inout (asynchronous execution)
+- **Orch Read**: orchestration calls `get_tensor_data` (blocking read)
+- **Orch Write**: orchestration calls `set_tensor_data` (blocking write)
+
+### Hazard Matrix (earlier operation → later operation)
+
+| # | Earlier Op | Later Op | Hazard | Guarantee | Safe? |
+| - | ---------- | -------- | ------ | --------- | ----- |
+| 1 | Kernel write (OUTPUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes |
+| 2 | Kernel write (OUTPUT) | Orch Write | WAW | spin-wait producer COMPLETED | Yes |
+| 3 | Kernel read (INPUT) | Orch Write | WAR | spin-wait fanout_refcount | **Needs INOUT** |
+| 4 | Kernel read-write (INOUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes |
+| 5 | Kernel read-write (INOUT) | Orch Write | WAW+WAR | spin-wait producer + consumers | Yes |
+| 6 | Orch Write | Kernel read (INPUT) | RAW | blocking completes before next submit | Yes |
+| 7 | Orch Write | Kernel write (OUTPUT) | WAW | same — serial guarantee | Yes |
+| 8 | Orch Read | Kernel write (OUTPUT) | WAR | same — serial guarantee | Yes |
+| 9–12 | Orch ↔ Orch | — | — | same-thread serial execution | Yes |
+
+### Key Design Points
+
+**Scenario #3 is the only case requiring special attention**:
+
+TensorMap tracks only producers (OUTPUT/INOUT), not pure INPUT consumers. If a tensor is only registered via `add_input()`, TensorMap has no producer entry for it. `set_tensor_data`'s `wait_for_tensor_ready()` finds no matching producer (the lookup callback never fires) and returns immediately — but the kernel may still be reading → **WAR data race**.
+
+**Solution**: For tensors that may later be written via `set_tensor_data`, use `add_inout()` instead of `add_input()`. INOUT registers a producer entry in TensorMap, enabling `set_tensor_data` to track all consumers through `fanout_refcount`.
+
+**Scenarios #6–8 serial guarantee**:
+
+get/set_tensor_data are blocking calls, and orchestration is single-threaded serial submission. After a blocking operation completes, subsequent code (including task submissions) executes strictly afterward.
+
+## 7. External Tensor Behavior
+
+`make_tensor_external()` creates tensors with a pre-set `buffer.addr` (pointing to host-allocated device memory).
+
+| Scenario | Behavior |
+| -------- | -------- |
+| External tensor never submitted as OUTPUT/INOUT | No TensorMap entry — get/set execute immediately |
+| External tensor previously submitted as OUTPUT/INOUT | TensorMap has producer entry — get/set spin-wait |
+| External tensor submitted as INPUT, then set_tensor_data | **WAR risk** — must use INOUT instead (same as scenario #3) |
+
+**Key rule**: If an external tensor will later be written via `set_tensor_data`, all prior kernel accesses must use `add_inout()`, not `add_input()`.
diff --git a/src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md
new file mode 100644
index 000000000..8cba7e90c
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md
@@ -0,0 +1,222 @@
+# Submit by Cluster - Requirements and Main-Branch-Aligned Design
+
+## 1. Goal
+
+Define a single, main-branch-aligned specification for PTO2 cluster submission that combines:
+
+1. Product requirements (what must be true).
+2. Runtime design (how it is implemented on current main baseline).
+
+The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular.
+
+## 2. Background and Motivation
+
+Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`).
+The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels.
+
+Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster.
+
+## 3. Scope
+
+### In Scope
+
+1. New orchestration-facing submit API for cluster-aware mixed submission.
+2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit.
+3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity.
+4. AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets).
+
+### Out of Scope
+
+1. User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs).
+2. New worker types beyond AIC/AIV.
+3. Cross-cluster user placement policies.
+4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster.
+
+## 4. Main-Branch Baseline Constraints
+
+Design must preserve the current main runtime architecture:
+
+1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
+2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).
+
+## 5. Terminology
+
+1. `cluster`: one physical unit with `1 AIC + 2 AIV`.
+2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots.
+3. `MixedTask`: one runtime graph node created by one submit call.
+4. `active_mask`: bitmask of active subtask slots.
+5. `resource shape`: normalized lane demand class of a mixed task.
+
+## 6. API Contract
+
+```cpp
+inline constexpr int32_t INVALID_KERNEL_ID = -1;
+
+struct MixedKernels {
+    int32_t aic_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
+};
+
+static inline void rt_submit_task(PTO2Runtime* rt,
+                                       const MixedKernels& mixed_kernels,
+                                       Arg* args,
+                                       int32_t num_args);
+
+static inline void rt_submit_aic_task(PTO2Runtime* rt,
+                                           int32_t kernel_id,
+                                           Arg* args,
+                                           int32_t num_args);
+
+static inline void rt_submit_aiv_task(PTO2Runtime* rt,
+                                           int32_t kernel_id,
+                                           Arg* args,
+                                           int32_t num_args);
+```
+
+Rules:
+
+1. One submit call creates one `MixedTask`.
+2. All active slots share the same `args` and `num_args`.
+3. At least one slot must be active.
+4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent.
+5. Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries.
+6. Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers.
+7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API.
+
+## 7. Data Model (Requirements + Design)
+
+`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state:
+
+1. `task_id`
+2. `active_mask`
+3. `completed_subtasks` (atomic counter, incremented per subtask completion)
+4. `kernel_id[3]` for `(AIC, AIV0, AIV1)`
+5. dependency heads/counters and packed-buffer metadata
+
+`PTO2TaskPayload` (cold path) carries:
+
+1. shared args/tensors/scalars copied once per mixed submit
+2. fanin mixed-task IDs
+3. other cold-path submit metadata
+
+Producer identity in TensorMap is mixed-task ID end-to-end.
+
+## 8. Scheduling Model
+
+### 8.1 Resource Shapes
+
+Runtime uses shape-based ready queues (not worker-type queues):
+
+1. `AIC_ONLY`
+2. `AIV_X1`
+3. `AIV_X2`
+4. `AIC_AIV_X1`
+5. `AIC_AIV_X2`
+
+Queueing key is normalized resource shape (not raw slot label).
+
+### 8.2 Atomic Cluster Dispatch
+
+1. Dispatch decision unit is one mixed task.
+2. For multi-slot mixed tasks, partial launch is forbidden.
+3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes.
+4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes.
+
+### 8.3 Dependency and Completion
+
+1. Fanin release/readiness remains dependency-correct and graph-level.
+2. Two-stage completion:
+   - `on_subtask_complete(task_id, subslot)`
+   - `on_task_complete(task_id)` only when `completed_subtasks == total_required_subtasks`
+3. Downstream release is triggered once per mixed task completion, not once per subslot.
+
+## 9. Executor Ownership and Numbering
+
+### 9.1 Canonical Flattened Numbering (Unchanged)
+
+Given `block_dim` clusters:
+
+1. AIC IDs: `[0, block_dim)`
+2. AIV IDs: `[block_dim, 3 * block_dim)`
+3. Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}`
+
+This project-defined flattened numbering is kept unchanged.
+
+### 9.2 Cluster Ownership
+
+1. One cluster must be owned by one scheduler domain/thread at a time.
+2. No split-cluster ownership in either:
+   - initial `assign_cores_to_threads()`
+   - post-orchestrator `reassign_cores_for_all_threads()`
+3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
+
+## 10. Functional Requirements
+
+### 10.1 Valid Mixed Shapes
+
+1. AIC only
+2. AIV only (1 or 2 AIV lanes)
+3. AIC + 1 AIV
+4. AIC + 2 AIV
+
+### 10.2 Runtime Behavior per Submit
+
+1. Validate submit arguments.
+2. Allocate mixed-task ID and initialize descriptor/payload/slot_state once.
+3. Lookup producers via TensorMap; collect fanin metadata and increment producers' `fanout_count`.
+4. Push task to scheduler's wiring queue (scheduler thread 0 asynchronously wires fanout edges and determines readiness).
+5. Dispatch all active lanes atomically when resources allow.
+6. Aggregate completion and release downstream once.
+
+## 11. Non-Functional Requirements
+
+1. Correctness: no dependency violation, no partial mixed-task dispatch.
+2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent.
+3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required.
+4. Performance: no obvious regression for non-cluster workflows.
+5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete.
+
+## 12. Acceptance Criteria
+
+Feature is accepted when:
+
+1. Orchestration compiles and submits via `MixedKernels` API/wrappers.
+2. Scheduler dispatches each mixed task as one cluster scheduling decision.
+3. Dependencies gate mixed-task readiness correctly.
+4. AIV execution remains cluster-local and semantically equivalent across lanes.
+5. Existing non-cluster workflows continue to pass without behavior regression.
+6. Cluster ownership is never split across scheduler domains before/after transition.
+
+## 13. Verification Matrix
+
+Recommended validation coverage:
+
+1. Mapping correctness for cluster-to-core ID relation.
+2. Atomic dispatch for multi-slot shapes.
+3. Dependency gating and completion aggregation (`done_mask == active_mask`).
+4. Lane-occupancy co-residency behavior for compatible shapes.
+5. Core-transition ownership stability.
+6. Invalid submit handling (`always_assert` path).
+7. Regression coverage for existing examples/tests.
+
+Milestone command (device):
+
+```bash
+python tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py \
+  -p a2a3 -d 9
+```
+
+Final validation:
+
+```bash
+pytest examples tests/st --platform a2a3
+```
+
+## 14. Resolved Decisions
+
+1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract.
+2. Invalid mixed submits fail with existing submit-time assert behavior.
+3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant.
+4. Submit-contract types live in one shared header-only surface.
+5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee.
diff --git a/src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md b/src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md
new file mode 100644
index 000000000..af661d440
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md
@@ -0,0 +1,166 @@
+# PTO2 Device Log Profiling Guide
+
+## How to Find Device Logs
+
+AICPU logs (via `LOG_INFO_V9`) are written by CANN's **dlog** subsystem and do **not** appear in the `python test_*.py` / pytest terminal output. They are written to CANN's device log directory:
+
+```text
+$HOME/ascend/log/debug/device-<device_id>/device-<pid>_<timestamp>.log
+```
+
+Each run produces a new log file (or appends to an existing one). Find the most recent file by modification time:
+
+```bash
+ls -lt $HOME/ascend/log/debug/device-<device_id>/ | head -5
+```
+
+## Log Structure Overview
+
+A single run produces two profiling blocks in the device log:
+
+| Block | Emitted by | Function | Content |
+| ----- | ---------- | -------- | ------- |
+| **Orchestrator Profiling** | Thread 3 (orchestrator) | `aicpu_orchestration_entry` | Time breakdown of graph construction on device |
+| **PTO2 Scheduler Summary** | Threads 0/1/2 (schedulers) | `SchedulerContext::resolve_and_dispatch` | Per-thread scheduling statistics, phase timing, and lock contention |
+
+All timing values are in microseconds (us), converted from AICPU cycle counters.
+
+---
+
+## Block 1: Orchestrator Profiling
+
+Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_entry`, and prints a profiling summary after it returns.
+
+### Example (from a real run: batch=64, 16704 tasks)
+
+```text
+Thread 3: Calling aicpu_orchestration_entry from SO
+Thread 3: aicpu_orchestration_entry returned, cost 20943.940us
+Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us ===
+Thread 3:   sync_tensormap : 286.300us (2.0%)
+Thread 3:   task_ring_alloc: 380.400us (2.6%)
+Thread 3:   param_copy     : 2147.800us (14.7%)
+Thread 3:   lookup+dep     : 7290.300us (49.9%)
+Thread 3:   heap_alloc     : 701.500us (4.8%)
+Thread 3:   tensormap_ins  : 1890.380us (12.9%)
+Thread 3:   fanin+ready    : 1207.400us (8.3%)
+Thread 3:   finalize+SM    : 697.500us (4.8%)
+Thread 3:   scope_end      : 364.080us
+Thread 3:   avg/task       : 0.874us
+Thread 3: PTO2 total submitted tasks = 16704
+```
+
+### Field Reference
+
+| Field | Source (`pto_orchestrator.cpp`) | Description |
+| ----- | ------------------------------- | ----------- |
+| **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead |
+| **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks |
+| **sync_tensormap** | `g_orch_sync_cycle` | TensorMap validity sync and optional cleanup before each submission |
+| **task_ring_alloc** | `g_orch_alloc_cycle` | Allocating a task slot from the task ring buffer |
+| **param_copy** | `g_orch_args_cycle` | Copying param descriptors + tensor descriptor copies into task-owned storage |
+| **lookup+dep** | `g_orch_lookup_cycle` | TensorMap lookup for inputs/inouts + building fanin/fanout dependency edges |
+| **heap_alloc** | `g_orch_heap_cycle` | Allocating packed output buffers from the heap ring |
+| **tensormap_ins** | `g_orch_insert_cycle` | Inserting output/inout tensors into the TensorMap |
+| **fanin+ready** | `g_orch_fanin_cycle` | Building the fanin list + checking if task is already ready (Step 5/5b) |
+| **scope_end** | `g_orch_scope_end_cycle` | `end_scope` overhead (notifying scheduler of scope completion) |
+| **avg/task** | `total / submit_count` | Average orchestrator time per task submission |
+
+### Interpreting the Numbers
+
+- **cost > total**: The difference is overhead outside `submit_task` (the orchestration user code itself, scope_begin/end, TensorCreateInfo construction, etc.).
+- **lookup+dep** is typically the dominant cost (~50%) because it involves TensorMap hash lookups and building dependency edges with spinlock-protected fanout list insertions.
+- **param_copy** scales with the number of parameters per task.
+- **avg/task < 1us** indicates efficient graph construction.
+
+---
+
+## Block 2: PTO2 Scheduler Summary
+
+Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**.
+
+### Example (Thread 0, from a different run: batch=1, 1044 tasks)
+
+```text
+Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop)
+Thread 0: --- Phase Breakdown ---
+Thread 0:   complete:    1485.020us (42.7%)
+Thread 0:   scan:        14.400us (0.4%)
+Thread 0:   dispatch:    1973.060us (56.7%)
+Thread 0:   idle:        4.940us (0.1%)
+```
+
+### Summary Line
+
+```text
+Thread N: completed=X tasks in Yus (Z loops, W tasks/loop)
+```
+
+| Field | Description |
+| ----- | ----------- |
+| **completed** | Number of tasks this thread processed to completion |
+| **Y us** | Total scheduler loop time (sum of all phase cycles) |
+| **Z loops** | Number of scheduler loop iterations |
+| **W tasks/loop** | Average tasks completed per loop iteration; higher = better throughput |
+
+### Phase Breakdown
+
+The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations.
+
+| Phase | What it does | Inline stats |
+| ----- | ------------ | ------------ |
+| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(task_id, subslot)` to increment the completion counter; when `completed_subtasks == total_required_subtasks`, triggers `on_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release |
+| **scan** | Updates the perf profiling header with latest scheduler state | — |
+| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) |
+| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — |
+
+**Interpreting phase percentages:**
+
+- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`).
+- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement).
+- **scan** is small (<1%) — only updates the perf header.
+- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved.
+
+**Interpreting pop hit_rate:**
+
+- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient.
+- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself.
+
+### Per-Task Averages
+
+Divide each thread's phase times by its `completed` count to get per-task scheduling cost:
+
+| Metric | Formula | Typical value |
+| ------ | ------- | ------------- |
+| Scheduling overhead per task | total_time / completed | ~5-10 us/task |
+| Dispatch per task | dispatch_time / completed | ~3-6 us/task |
+| Complete per task | complete_time / completed | ~2-4 us/task |
+
+---
+
+## Cross-Referencing with Host Profiling
+
+When `--enable-l2-swimlane` is used, the host terminal prints a **Task Statistics by Function** table with `Total_Exec` (total AICore kernel execution time). Combined with device log data:
+
+| Metric | Source | Description |
+| ------ | ------ | ----------- |
+| Avg kernel exec time | `Total_Exec / total_tasks` (host) | Time AICore spends executing each kernel |
+| Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task |
+| Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution |
+
+A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow.
+
+---
+
+## Quick Reference: Extracting Profiling Data
+
+```bash
+# Find the latest device log for device 2
+ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1
+
+# Extract orchestrator profiling (Thread 3)
+grep "Thread 3:" <logfile>
+
+# Extract scheduler profiling (Threads 0/1/2)
+grep -E "Thread [012]:" <logfile>
+```
diff --git a/src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md b/src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md
new file mode 100644
index 000000000..2ef6c1b6a
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md
@@ -0,0 +1,450 @@
+# PTO Runtime2 Profiling Levels
+
+This document describes the profiling macro hierarchy and logging control in the PTO Runtime2 system.
+
+## Overview
+
+PTO Runtime2 uses a hierarchical profiling system with compile-time macros to control profiling code compilation and log output. The `enable_l2_swimlane` runtime flag (integer perf_level 0–4) controls data collection granularity (performance buffers, shared memory writes) but does NOT control log output.
+
+## Profiling Macro Hierarchy
+
+Defaults and dependency validation are centralized in
+`src/common/task_interface/profiling_config.h`. Runtime headers include that
+file before using the macros, so both a2a3 and a5 share the same default
+values and compile-time checks.
+
+```text
+PTO2_PROFILING (base level, default=1)
+├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1)
+|   └──PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_ORCH_PROFILING=1)
+├── PTO2_SCHED_PROFILING (scheduler, default=0, requires PTO2_PROFILING=1)
+└── --enable-l2-swimlane [PERF_LEVEL] (L2 swimlane data collection, 0-4, bare=4, requires PTO2_PROFILING=1)
+
+```
+
+### Compile-Time Validation
+
+Each sub-level macro requires `PTO2_PROFILING=1`:
+
+```cpp
+#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
+#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
+#endif
+
+#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
+#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
+#endif
+
+#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING
+#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1"
+#endif
+```
+
+## Profiling Levels
+
+### Level 0: No Profiling (PTO2_PROFILING=0)
+
+**What's compiled:**
+
+- Debug/diagnostic logs (always present)
+- Progress tracking (`PTO2 progress: completed=...`)
+- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget)
+- Deadlock/livelock detection (`diagnose_stuck_state`, called on stall)
+
+**What's NOT compiled:**
+
+- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters)
+- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING`
+- Performance data collection paths (`enable_l2_swimlane` runtime flag becomes ineffective because profiling code is not compiled)
+
+**Log output (normal run, no stall):**
+
+- No `sched_start/sched_end/sched_cost` timestamps
+- No `orch_start/orch_end/orch_cost` timestamps
+- No `Scheduler summary: total_time=...`
+- No `PTO2 total submitted tasks` log
+- `PTO2 progress: completed=... total=...` may appear (thread 0 only, at task completion milestones)
+
+---
+
+### Level 1: Basic Profiling (PTO2_PROFILING=1)
+
+**What's compiled:**
+
+- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
+- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
+- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
+- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
+- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
+- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
+
+**What's NOT compiled:**
+
+- Detailed phase breakdowns
+- TensorMap statistics
+
+**Log output (additional lines vs Level 0, per normal run):**
+
+- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
+- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
+- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
+- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
+- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
+- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
+
+**LOG_INFO_V9 count (normal run):**
+
+- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
+- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
+
+> See the table at the end for concrete counts based on the `paged_attention` example.
+
+**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
+
+```text
+Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
+Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us
+PTO2 total submitted tasks = 13, already executed 13 tasks
+Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us
+Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6
+Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us
+Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
+```
+
+**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):
+
+```text
+Thread 3: orch_stage_end=48236915058307
+Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
+Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
+PTO2 total submitted tasks = 13, already executed 13 tasks
+Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
+Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
+Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
+Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
+```
+
+> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).
+
+**Note:**
+
+- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
+- `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
+- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.
+
+---
+
+### Level 2: Scheduler Detailed Profiling (PTO2_SCHED_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 1 features
+- Detailed scheduler phase counters
+- Phase-specific statistics (complete, scan, dispatch, idle)
+- Hit rate tracking (complete poll, ready queue pop)
+
+**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced)
+
+- Replaces scheduler summary with detailed breakdown
+
+**Scheduler output:**
+
+```text
+Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks ===
+Thread X:   complete       : XXXus (XX.X%)
+Thread X:     poll         : XXXus (XX.X%)  hit=XXX, miss=XXX, hit_rate=XX.X%
+Thread X:     otc_lock     : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     otc_fanout   : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     otc_fanin    : XXXus (XX.X%)  atomics=XXX
+Thread X:     otc_self     : XXXus (XX.X%)  atomics=XXX
+Thread X:     perf         : XXXus (XX.X%)
+Thread X:   dispatch       : XXXus (XX.X%)
+Thread X:     poll         : XXXus (XX.X%)
+Thread X:     pop          : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     setup        : XXXus (XX.X%)
+Thread X:   scan           : XXXus (XX.X%)
+Thread X:   idle           : XXXus (XX.X%)
+Thread X:   avg/complete   : XXXus
+Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
+```
+
+Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
+stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json`
+captured at l2_swimlane_level >= 3) and `deps.json`; consume them via
+`simpler_setup/tools/sched_overhead_analysis.py`.
+
+---
+
+### Level 3: Orchestrator Detailed Profiling (PTO2_ORCH_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 1 features
+- Detailed orchestrator phase counters
+- Per-phase cycle tracking
+- Atomic operation counters
+- Wait time tracking
+
+**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced)
+
+- Replaces basic orchestration completion with detailed breakdown
+
+**Orchestrator output:**
+
+```text
+Thread X: === Orchestrator Profiling: XXX tasks, total=XXXus ===
+Thread X:   sync_tensormap : XXXus (XX.X%)
+Thread X:   task_ring_alloc: XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   param_copy     : XXXus (XX.X%)  atomics=XXX
+Thread X:   lookup+dep     : XXXus (XX.X%)
+Thread X:   heap_alloc     : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   tensormap_ins  : XXXus (XX.X%)
+Thread X:   fanin+ready    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   finalize+SM    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   scope_end      : XXXus  atomics=XXX
+Thread X:   avg/task       : XXXus
+```
+
+**Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag.
+
+---
+
+### Level 4: TensorMap Profiling (PTO2_TENSORMAP_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1` AND `PTO2_ORCH_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 3 features
+- TensorMap lookup statistics
+- Hash chain walk tracking
+- Overlap check counters
+
+**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap)
+
+**TensorMap output:**
+
+```text
+Thread X: === TensorMap Lookup Stats ===
+Thread X:   lookups        : XXX, inserts: XXX
+Thread X:   chain walked   : total=XXX, avg=X.X, max=X
+Thread X:   overlap checks : XXX, hits=XXX (XX.X%)
+```
+
+---
+
+## Runtime Flag: enable_l2_swimlane (perf_level)
+
+`--enable-l2-swimlane` accepts an integer perf_level (0–4). Transport
+mirrors the PMU pattern — two independent channels (one binary, one int):
+
+- **Binary on/off** — `KernelArgs::enable_profiling_flag` bit1
+  (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read
+  by AICore (which only needs on/off to decide whether to write timing) and
+  by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`.
+- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level`
+  (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU
+  promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via
+  `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for
+  `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
+
+On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled`
+entry point; the granular level still goes through the shared-memory
+header just like on onboard.
+
+| Level | Collects |
+| ----- | -------- |
+| 0 | Nothing (disabled) |
+| 1 | AICore timing only (start/end/task_id/func_id/core_type) |
+| 2 | + dispatch_time, finish_time |
+| 3 | + Scheduler phases (`SCHED_*`) |
+| 4 | + Orchestrator phases (full) |
+
+Bare `--enable-l2-swimlane` = level 4 (backward compatible).
+
+### Level gating in AICPU code
+
+Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the
+content it depends on instead of relying on magic numbers:
+
+```cpp
+// Any level > 0: AICPU task record buffer init / flush.
+// Cheap binary check, available immediately after kernel entry.
+if (is_l2_swimlane_enabled()) { ... }
+
+// AICPU dispatch/finish timestamps.
+// Granular checks below require l2_swimlane_aicpu_init to have already run
+// (so the level has been promoted from the shared-memory header).
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... }
+
+// Scheduler main-loop phase records (SCHED_*)
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... }
+
+// Orchestrator phase records
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... }
+```
+
+`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with
+underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level`
+shared-memory field and mirrors `PmuEventType : uint32_t`):
+
+| Enumerator | Underlying value |
+| ---------- | ---------------- |
+| `DISABLED` | 0 |
+| `AICORE_TIMING` | 1 |
+| `AICPU_TIMING` | 2 |
+| `SCHED_PHASES` | 3 |
+| `ORCH_PHASES` | 4 |
+
+### When enable_l2_swimlane=0
+
+- No performance data collection
+- No shared memory writes
+- Logs still print (controlled by macros only)
+
+---
+
+## Common Profiling Configurations
+
+### Development (minimal overhead)
+
+```bash
+# No profiling overhead
+PTO2_PROFILING=0
+```
+
+### Basic Performance Monitoring
+
+```bash
+# Minimal overhead, summary logs only
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=0
+PTO2_SCHED_PROFILING=0
+```
+
+### Scheduler Performance Analysis
+
+```bash
+# Detailed scheduler breakdown
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=0
+PTO2_SCHED_PROFILING=1
+```
+
+### Orchestrator Performance Analysis
+
+```bash
+# Detailed orchestrator breakdown
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=1
+PTO2_SCHED_PROFILING=0
+```
+
+### Full Profiling (maximum overhead)
+
+```bash
+# All profiling features enabled
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=1
+PTO2_SCHED_PROFILING=1
+PTO2_TENSORMAP_PROFILING=1
+```
+
+---
+
+## Setting Profiling Macros
+
+### At compile time
+
+Pass compile definitions through the build command or CI `CXXFLAGS`.
+This overrides the defaults in `profiling_config.h` without changing source.
+
+```bash
+# Example: disable all profiling code
+CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e .
+
+# Example: enable orchestrator and tensormap profiling
+CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \
+    pip install --no-build-isolation -e .
+```
+
+### In source code (before including headers)
+
+Source-level overrides are only for local experiments. They must appear before
+any header includes `profiling_config.h`; do not add duplicated fallback
+definitions to runtime headers.
+
+```cpp
+#define PTO2_PROFILING 1
+#define PTO2_ORCH_PROFILING 1
+#include "pto_runtime2_types.h"
+```
+
+---
+
+## Log Output Summary
+
+> Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).
+
+| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
+| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
+| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
+| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
+| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
+| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
+
+---
+
+## Implementation Notes
+
+### Key Principles
+
+1. **Macros control compilation and logging**
+   - `#if PTO2_PROFILING` controls whether profiling code is compiled
+   - Logs print when macro is enabled, regardless of runtime flag
+
+2. **Runtime flag controls data collection**
+   - `enable_l2_swimlane` controls performance buffer allocation
+   - Controls shared memory writes for host-side export
+   - Does NOT control log output
+
+3. **Consistent behavior across components**
+   - Scheduler logs: macro-controlled only
+   - Orchestrator logs: macro-controlled only
+   - Data collection: runtime flag controlled
+
+### Code Locations
+
+- Macro defaults and validation: `src/common/task_interface/profiling_config.h`
+- Scheduler profiling: `src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
+- Orchestrator profiling: `src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
+- TensorMap profiling: `src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`
+
+---
+
+## Performance Impact
+
+### Compilation overhead
+
+- Level 0: No overhead
+- Level 1: Minimal (counter increments, basic arithmetic)
+- Level 2-4: Low to moderate (additional counters, cycle measurements)
+
+### Runtime overhead
+
+- Logging: Negligible (device logs are asynchronous)
+- Data collection (`enable_l2_swimlane>0`): Low to moderate
+  - Performance buffer writes
+  - Shared memory updates
+  - Per-task timing measurements
+
+### Recommendation
+
+- Use Level 0 for production
+- Use Level 1-2 for performance monitoring
+- Use Level 3-4 for detailed performance analysis only
diff --git a/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp
new file mode 100644
index 000000000..55565e885
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file dep_gen_replay.cpp
+ * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor
+ *        representation, tensor-annotated) via a host-resident PTO2TensorMap,
+ *        with a differential check against the runtime template `compute_task_fanin`.
+ *
+ * Two passes run per record against two parallel PTO2TensorMap instances that
+ * evolve in lockstep:
+ *
+ *   ORACLE pass (read-only contract):
+ *     Drives `compute_task_fanin` (the same template the device orchestrator
+ *     uses in pto_orchestrator.cpp:submit_task) against `tm_oracle`. Emits
+ *     only PTO2TaskId values — the canonical set of producer IDs the runtime
+ *     would have wired. We never widen this template's emit signature: this
+ *     pass IS the contract, and any future change to `compute_task_fanin`
+ *     automatically refreshes the oracle.
+ *
+ *   ANNOT pass (this file's feature):
+ *     Inlines the same STEP A (creator retention) + STEP B (tensormap lookup)
+ *     against `tm_annot`, but the callback fires with the full
+ *     `PTO2TensorMapEntry&` + the consumer Tensor* + the arg index, so the
+ *     replay can record per-edge tensor metadata (producer/consumer
+ *     shape/offset, dtype, version).
+ *
+ * After both passes finish per record, we compare the producer-ID set the
+ * oracle emitted to the producer-ID set the annot pass emitted. They MUST
+ * match. If they diverge, deps.json is not written and the function returns
+ * non-zero — this is the "no shotgun modifications" guarantee: anyone who
+ * changes `compute_task_fanin` will trip this gate immediately and know to
+ * mirror the change in the annot pass.
+ *
+ * STEP 1 (explicit_deps) is emitted at the call site (per pto_dep_compute.h's
+ * "kept at call site" note); both passes run the same explicit-deps loop, so
+ * the comparison covers it too.
+ *
+ * STEP 4 (`register_task_outputs`) runs on BOTH tensor maps after both passes
+ * complete, keeping `tm_oracle` and `tm_annot` bit-equivalent for the next
+ * record's INOUT+COVERED `remove_entry` mutations.
+ *
+ * Pool sizing: replay never advances last_task_alive, so each tensor map's
+ * entry pool must accommodate every output write across the whole trace. We
+ * scan the record buffer once to count INOUT + OUTPUT_EXISTING slots and size
+ * the pool accordingly. Both maps get the same size.
+ */
+
+#include "dep_gen_replay.h"
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/dep_gen.h"
+#include "common/unified_log.h"
+#include "data_type.h"
+#include "pto_dep_compute.h"
+#include "pto_task_id.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+int32_t ceil_pow2(int32_t v) {
+    if (v <= 1) return 1;
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return v + 1;
+}
+
+// Count INOUT + OUTPUT_EXISTING slots across the record buffer —
+// register_task_outputs only inserts those, and skips entries with manual_dep
+// set. Counting both without inspecting manual_dep is a conservative upper
+// bound (manual_dep is rare; the small over-allocation pays for itself in
+// avoided pool exhaustion).
+int32_t count_outputs(const DepGenRecord *records, size_t n) {
+    int32_t total = 0;
+    for (size_t i = 0; i < n; i++) {
+        const DepGenRecord &r = records[i];
+        // Overflow chain slots are reinterpret_cast views with no tensor data;
+        // their `tensor_count` bytes are actually the overflow `dep_count` field,
+        // which would mislead the loop below if read as a tensor count.
+        if (r.flags & DEP_GEN_FLAG_OVERFLOW) continue;
+        for (uint16_t j = 0; j < r.tensor_count; j++) {
+            auto t = static_cast<TensorArgType>(r.arg_types[j]);
+            if (t == TensorArgType::INOUT || t == TensorArgType::OUTPUT_EXISTING) {
+                total++;
+            }
+        }
+    }
+    return total;
+}
+
+// ---------------------------------------------------------------------------
+// JSON output accumulators (in-memory tables that get serialized at the end)
+// ---------------------------------------------------------------------------
+
+// Edge categories — matches the three places a runtime fanin edge is born.
+enum class EdgeSource { EXPLICIT, CREATOR, TENSORMAP };
+
+const char *edge_source_str(EdgeSource s) {
+    switch (s) {
+    case EdgeSource::EXPLICIT:
+        return "explicit";
+    case EdgeSource::CREATOR:
+        return "creator";
+    case EdgeSource::TENSORMAP:
+        return "tensormap";
+    }
+    return "unknown";
+}
+
+const char *overlap_status_str(OverlapStatus s) {
+    switch (s) {
+    case OverlapStatus::COVERED:
+        return "covered";
+    case OverlapStatus::OTHER:
+        return "other";
+    case OverlapStatus::NO_OVERLAP:
+        return "no_overlap";
+    }
+    return "unknown";
+}
+
+// One annotated edge. consumer_* always populated. producer_* populated for
+// TENSORMAP source only — the explicit/creator emit paths don't have a
+// matched tensormap entry to copy from.
+//
+// Slice description follows the strided Tensor model: (start_offset, strides[])
+// in element units. Byte offset of element coords[] is
+//   (start_offset + Σ coords[i] · strides[i]) · dtype_bytes
+struct EdgeAnnot {
+    uint64_t pred;
+    uint64_t succ;
+    int32_t consumer_arg_idx;  // -1 for EXPLICIT (not tied to a tensor arg)
+    EdgeSource source;
+    OverlapStatus overlap;  // only meaningful for TENSORMAP
+    uint64_t tensor_id;     // 0 for EXPLICIT
+    // Consumer side (the Tensor the submitting task is reading).
+    uint8_t consumer_dtype;
+    uint32_t consumer_ndims;
+    uint32_t consumer_shape[MAX_TENSOR_DIMS];
+    uint64_t consumer_start_offset;  // 1D element offset
+    uint32_t consumer_strides[MAX_TENSOR_DIMS];
+    // Producer side (the slice the producer wrote, from the tensormap entry).
+    // Only populated when source == TENSORMAP.
+    uint32_t producer_ndims;
+    uint32_t producer_shape[MAX_TENSOR_DIMS];
+    uint64_t producer_start_offset;
+    uint32_t producer_strides[MAX_TENSOR_DIMS];
+};
+
+// One entry in the tensors[] table: the underlying storage, keyed by
+// (buffer_addr, version). buffer_numel is the storage element count;
+// per-edge fields describe the slice (start_offset + stride).
+struct TensorTableEntry {
+    uint64_t tensor_id;
+    uint64_t buffer_addr;
+    uint64_t buffer_numel;  // storage size in elements (= buffer.size / dtype_bytes)
+    int32_t version;
+    uint8_t dtype;
+};
+
+// One arg slot of a task, captured for the `tasks[].args[]` block so
+// downstream viewers can render per-task input / output compartments without
+// having to scan every edge. `has_tensor_info` is false only for OUTPUT slots:
+// the runtime hasn't materialized a Tensor yet at submit_task time, so the
+// captured blob is zeroed.
+struct TaskArgEntry {
+    int32_t idx;
+    TensorArgType arg_type;
+    bool has_tensor_info;
+    uint64_t tensor_id;
+    uint8_t dtype;
+    uint32_t ndims;
+    uint32_t shape[MAX_TENSOR_DIMS];
+    uint64_t start_offset;  // 1D element offset
+    uint32_t strides[MAX_TENSOR_DIMS];
+};
+
+struct TaskTableEntry {
+    uint64_t task_id;
+    bool in_manual_scope;
+    int32_t kernel_id[3];  // per-subslot {AIC, AIV0, AIV1}, -1 = inactive
+    std::vector<TaskArgEntry> args;
+};
+
+const char *arg_type_str(TensorArgType t) {
+    switch (t) {
+    case TensorArgType::INPUT:
+        return "INPUT";
+    case TensorArgType::OUTPUT:
+        return "OUTPUT";
+    case TensorArgType::INOUT:
+        return "INOUT";
+    case TensorArgType::OUTPUT_EXISTING:
+        return "OUTPUT_EXISTING";
+    }
+    return "UNKNOWN";
+}
+
+// FNV-1a 64-bit hash of (buffer_addr, version) — stable tensor identity
+// across runs (no time-dependent inputs).
+uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) {
+    constexpr uint64_t FNV_OFFSET = 0xcbf29ce484222325ULL;
+    constexpr uint64_t FNV_PRIME = 0x100000001b3ULL;
+    uint64_t h = FNV_OFFSET;
+    const uint8_t *p;
+    p = reinterpret_cast<const uint8_t *>(&buffer_addr);
+    for (size_t i = 0; i < sizeof(buffer_addr); i++) {
+        h ^= p[i];
+        h *= FNV_PRIME;
+    }
+    uint32_t v = static_cast<uint32_t>(version);
+    p = reinterpret_cast<const uint8_t *>(&v);
+    for (size_t i = 0; i < sizeof(v); i++) {
+        h ^= p[i];
+        h *= FNV_PRIME;
+    }
+    return h;
+}
+
+// Register a tensor in the tensors[] table on first sight of (addr,
+// version). buffer_numel describes the underlying storage size in elements;
+// per-edge fields describe the slice via (start_offset, strides[]). Subsequent
+// sightings of the same (addr, version) are no-ops.
+uint64_t register_tensor(
+    std::unordered_map<uint64_t, size_t> &index_by_id, std::vector<TensorTableEntry> &table, const Tensor &t
+) {
+    uint64_t id = make_tensor_id(t.buffer.addr, t.version);
+    auto it = index_by_id.find(id);
+    if (it != index_by_id.end()) {
+        return id;
+    }
+    TensorTableEntry e;
+    e.tensor_id = id;
+    e.buffer_addr = t.buffer.addr;
+    e.version = t.version;
+    e.dtype = static_cast<uint8_t>(t.dtype);
+    const uint64_t elem_size = get_element_size(t.dtype);
+    e.buffer_numel = (elem_size == 0) ? 0 : (t.buffer.size / elem_size);
+    index_by_id[id] = table.size();
+    table.push_back(e);
+    return id;
+}
+
+// Copy a Tensor's slice description (shape + start_offset + stride) into an
+// EdgeAnnot's consumer_* fields.
+void fill_consumer(EdgeAnnot &e, const Tensor &t) {
+    e.consumer_dtype = static_cast<uint8_t>(t.dtype);
+    e.consumer_ndims = t.ndims;
+    e.consumer_start_offset = t.start_offset;
+    for (uint32_t i = 0; i < t.ndims && i < MAX_TENSOR_DIMS; i++) {
+        e.consumer_shape[i] = t.shapes[i];
+        e.consumer_strides[i] = t.strides[i];
+    }
+}
+
+// Copy a PTO2TensorMapEntry's slice description into an EdgeAnnot's producer_*
+// fields. Only called from the TENSORMAP emit path.
+void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) {
+    e.producer_ndims = entry.ndims;
+    e.producer_start_offset = entry.start_offset;
+    for (uint32_t i = 0; i < entry.ndims && i < MAX_TENSOR_DIMS; i++) {
+        e.producer_shape[i] = entry.shapes[i];
+        e.producer_strides[i] = entry.strides[i];
+    }
+}
+
+// ---------------------------------------------------------------------------
+// JSON writer
+// ---------------------------------------------------------------------------
+
+void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) {
+    out << '[';
+    for (uint32_t i = 0; i < n; i++) {
+        if (i > 0) out << ',';
+        out << data[i];
+    }
+    out << ']';
+}
+
+bool write_deps_json(
+    const char *path, const std::vector<TaskTableEntry> &tasks, const std::vector<TensorTableEntry> &tensors,
+    const std::vector<EdgeAnnot> &edges
+) {
+    std::ofstream out(path, std::ios::out | std::ios::trunc);
+    if (!out) {
+        LOG_ERROR("dep_gen replay: failed to open '%s' for write", path);
+        return false;
+    }
+    // Strided tensor representation. tensors[].buffer_numel is the underlying
+    // storage element count; tasks[].args[] and edges[] carry per-slice
+    // geometry as (start_offset uint64, strides[] uint32 — runtime invariant
+    // forbids zero / negative strides, see runtime/tensor.h).
+    out << "{\"tasks\":[";
+    for (size_t i = 0; i < tasks.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &t = tasks[i];
+        // uint64 fields are quoted as strings — task_id/tensor_id/buffer_addr/
+        // pred/succ can exceed Number.MAX_SAFE_INTEGER (2^53-1), silently
+        // losing precision in JS-based JSON parsers. Python consumers already
+        // pass these through int(...) and don't care which form they receive.
+        out << "{\"task_id\":\"" << t.task_id << '"';
+        out << ",\"scope\":\"" << (t.in_manual_scope ? "manual" : "auto") << '"';
+        // Per-subslot kernel ids {AIC, AIV0, AIV1}; INVALID_KERNEL_ID = -1 for
+        // inactive subslots. Emitted as a plain int triple — downstream viewers
+        // (and the swimlane host post-processor) use it to resolve task_id →
+        // kernel without the AICore record carrying the field itself.
+        out << ",\"kernel_ids\":[" << t.kernel_id[0] << ',' << t.kernel_id[1] << ',' << t.kernel_id[2] << ']';
+        out << ",\"args\":[";
+        for (size_t a = 0; a < t.args.size(); a++) {
+            if (a > 0) out << ',';
+            const auto &arg = t.args[a];
+            out << "{\"idx\":" << arg.idx;
+            out << ",\"type\":\"" << arg_type_str(arg.arg_type) << '"';
+            if (arg.has_tensor_info) {
+                out << ",\"tensor_id\":\"" << arg.tensor_id << '"';
+                out << ",\"dtype\":\"" << get_dtype_name(static_cast<DataType>(arg.dtype)) << '"';
+                out << ",\"shape\":";
+                write_uint_array(out, arg.shape, arg.ndims);
+                out << ",\"start_offset\":\"" << arg.start_offset << '"';
+                out << ",\"strides\":";
+                write_uint_array(out, arg.strides, arg.ndims);
+            }
+            out << '}';
+        }
+        out << "]}";
+    }
+    out << ']';
+
+    out << ",\"tensors\":[";
+    for (size_t i = 0; i < tensors.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &t = tensors[i];
+        out << "{\"tensor_id\":\"" << t.tensor_id << '"';
+        out << ",\"buffer_addr\":\"" << t.buffer_addr << '"';
+        out << ",\"version\":" << t.version;
+        out << ",\"dtype\":\"" << get_dtype_name(static_cast<DataType>(t.dtype)) << '"';
+        out << ",\"buffer_numel\":\"" << t.buffer_numel << '"';
+        out << '}';
+    }
+    out << ']';
+
+    out << ",\"edges\":[";
+    for (size_t i = 0; i < edges.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &e = edges[i];
+        out << "{\"pred\":\"" << e.pred << "\",\"succ\":\"" << e.succ << '"';
+        out << ",\"arg\":" << e.consumer_arg_idx;
+        out << ",\"source\":\"" << edge_source_str(e.source) << '"';
+        if (e.source == EdgeSource::TENSORMAP) {
+            out << ",\"overlap\":\"" << overlap_status_str(e.overlap) << '"';
+        }
+        if (e.source != EdgeSource::EXPLICIT) {
+            out << ",\"tensor_id\":\"" << e.tensor_id << '"';
+            out << ",\"consumer_dtype\":\"" << get_dtype_name(static_cast<DataType>(e.consumer_dtype)) << '"';
+            out << ",\"consumer_shape\":";
+            write_uint_array(out, e.consumer_shape, e.consumer_ndims);
+            out << ",\"consumer_start_offset\":\"" << e.consumer_start_offset << '"';
+            out << ",\"consumer_strides\":";
+            write_uint_array(out, e.consumer_strides, e.consumer_ndims);
+        }
+        if (e.source == EdgeSource::TENSORMAP) {
+            out << ",\"producer_shape\":";
+            write_uint_array(out, e.producer_shape, e.producer_ndims);
+            out << ",\"producer_start_offset\":\"" << e.producer_start_offset << '"';
+            out << ",\"producer_strides\":";
+            write_uint_array(out, e.producer_strides, e.producer_ndims);
+        }
+        out << '}';
+    }
+    out << "]}\n";
+    return static_cast<bool>(out);
+}
+
+// ---------------------------------------------------------------------------
+// Annot pass — mirrors compute_task_fanin step-by-step against tm_annot.
+// Must stay bit-equivalent to pto_dep_compute.h::compute_task_fanin in terms
+// of which producer IDs are emitted (the differential check enforces this).
+// ---------------------------------------------------------------------------
+
+template <typename EmitTM, typename EmitCreator>
+void annot_pass(
+    const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, EmitCreator emit_creator,
+    EmitTM emit_tensormap
+) {
+    if (in_manual_scope) {
+        return;
+    }
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::OUTPUT) {
+            continue;
+        }
+        const Tensor *tensor = &inputs.tensors[i].ref();
+
+        // STEP A: creator retention.
+        PTO2TaskId owner = tensor->owner_task_id;
+        if (owner.is_valid()) {
+            emit_creator(owner, i, *tensor);
+        }
+
+        // STEP B: tensormap lookup (only INPUT/INOUT, skip manual_dep).
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
+            continue;
+        }
+        if (tensor->manual_dep) {
+            continue;
+        }
+
+        tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
+            emit_tensormap(entry.producer_task_id, i, *tensor, entry, overlap_status);
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
+                tensor_map.remove_entry(entry);
+            }
+            return true;
+        });
+    }
+}
+
+}  // namespace
+
+extern "C" int
+dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, const char *deps_json_path) {
+    if (deps_json_path == nullptr) {
+        LOG_ERROR("dep_gen replay: null deps_json_path");
+        return -1;
+    }
+    if (num_records > 0 && records == nullptr) {
+        LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records);
+        return -1;
+    }
+    LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records);
+
+    // Per-ring task window sizes — tensormap masks slot indices and requires
+    // each to be a power of two. Auto-size from the records themselves so each
+    // ring's window comfortably covers its observed max local_id (no slot
+    // aliasing during INOUT+COVERED remove_from_task). Same sizes feed both
+    // maps so they stay in lockstep.
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint32_t max_local[PTO2_MAX_RING_DEPTH] = {0};
+    for (size_t i = 0; i < num_records; i++) {
+        PTO2TaskId tid{records[i].task_id};
+        uint8_t ring = tid.ring();
+        uint32_t local = tid.local();
+        if (ring < PTO2_MAX_RING_DEPTH && local > max_local[ring]) {
+            max_local[ring] = local;
+        }
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t need = static_cast<int32_t>(max_local[r] + 1);
+        task_window_sizes[r] = ceil_pow2(need < 16 ? 16 : need);
+    }
+
+    int32_t output_count = count_outputs(records, num_records);
+    int32_t pool_size = output_count + (output_count / 10) + 64;
+    if (pool_size < PTO2_TENSORMAP_POOL_SIZE) {
+        pool_size = PTO2_TENSORMAP_POOL_SIZE;
+    }
+
+    PTO2TensorMap tm_oracle;
+    PTO2TensorMap tm_annot;
+    std::memset(&tm_oracle, 0, sizeof(tm_oracle));
+    std::memset(&tm_annot, 0, sizeof(tm_annot));
+
+    // Libc-backed arena (default ctor) that owns both replay tensormaps'
+    // storage. Released by the arena destructor when this function returns.
+    DeviceArena replay_arena;
+
+    auto oracle_layout =
+        PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
+    auto annot_layout =
+        PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
+    if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) ||
+        !tm_annot.init_data_from_layout(annot_layout, replay_arena)) {
+        LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size);
+        return -3;
+    }
+    // Replay tensormaps live entirely on host; only arena-internal pointer
+    // fields need wiring (no parent-orch back-reference exists anymore).
+    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena);
+    tm_annot.wire_arena_pointers(annot_layout, replay_arena);
+
+    // JSON output accumulators.
+    std::vector<TaskTableEntry> task_table;
+    std::vector<TensorTableEntry> tensor_table;
+    std::unordered_map<uint64_t, size_t> tensor_index;  // tensor_id → table idx
+    std::vector<EdgeAnnot> annot_edges;
+    annot_edges.reserve(num_records * 2);
+
+    TensorRef tref_buf[CORE_MAX_TENSOR_ARGS];
+    TensorArgType atype_buf[CORE_MAX_TENSOR_ARGS];
+
+    // Per-record dedup of producer IDs — must match runtime's
+    // PTO2FaninBuilder::append_fanin_or_fail semantics, which collapses STEP 1
+    // (explicit_deps) + STEP A (creator retention) + STEP B (tensormap lookup)
+    // into a single per-task fanin list. Both oracle and annot use this same
+    // semantics so the divergence check is meaningful.
+    std::unordered_set<uint64_t> oracle_preds;
+    std::unordered_set<uint64_t> annot_preds;
+
+    // Scratch buffer for assembling full dep lists across overflow chains.
+    // Declared outside the loop so it can be reused (clear() keeps capacity).
+    std::vector<uint64_t> full_deps_buf;
+
+    for (size_t rec_i = 0; rec_i < num_records; rec_i++) {
+        const DepGenRecord &rec = records[rec_i];
+
+        // Overflow chain records are consumed by the preceding base; skip
+        // them in the main scan so we don't double-process or read the
+        // overflow's reinterpreted bytes as tensor/dep info.
+        if (rec.flags & DEP_GEN_FLAG_OVERFLOW) continue;
+
+        PTO2TaskId task_id{rec.task_id};
+        bool in_manual_scope = (rec.flags & DEP_GEN_FLAG_IN_MANUAL_SCOPE) != 0;
+
+        oracle_preds.clear();
+        annot_preds.clear();
+
+        int32_t tc = static_cast<int32_t>(rec.tensor_count);
+        if (tc > CORE_MAX_TENSOR_ARGS) {
+            tc = CORE_MAX_TENSOR_ARGS;
+        }
+        for (int32_t i = 0; i < tc; i++) {
+            tref_buf[i] = reinterpret_cast<const Tensor *>(&rec.tensors[i][0]);
+            atype_buf[i] = static_cast<TensorArgType>(rec.arg_types[i]);
+        }
+
+        // Assemble the full dep list. Fast path: ≤ DEP_GEN_MAX_EXPLICIT_DEPS,
+        // no chain, point straight at rec.explicit_deps. Slow path: gather
+        // base + chain into full_deps_buf and point at the buffer.
+        //
+        // `explicit_dep_count` / `over->dep_count` originate from device
+        // shared memory and are bounded by the writer to the array sizes, but
+        // we clamp on read too so a corrupted record never drives an OOB read
+        // off the end of rec.explicit_deps[64] / over->deps[582].
+        const uint64_t *deps_data;
+        int32_t dc;
+        if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) {
+            full_deps_buf.clear();
+            uint16_t base_dc = rec.explicit_dep_count;
+            if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) {
+                LOG_ERROR(
+                    "dep_gen replay: clamping base explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                    base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id
+                );
+                base_dc = DEP_GEN_MAX_EXPLICIT_DEPS;
+            }
+            full_deps_buf.reserve(static_cast<size_t>(base_dc) + DEP_GEN_OVERFLOW_DEPS_PER_RECORD);
+            full_deps_buf.insert(full_deps_buf.end(), rec.explicit_deps, rec.explicit_deps + base_dc);
+            bool chain_complete = false;
+            for (size_t j = rec_i + 1; j < num_records; j++) {
+                const DepGenRecord &maybe = records[j];
+                if (!(maybe.flags & DEP_GEN_FLAG_OVERFLOW)) {
+                    LOG_ERROR(
+                        "dep_gen replay: unterminated overflow chain at rec_idx=%zu (task_id=%" PRIu64 ")", rec_i,
+                        rec.task_id
+                    );
+                    break;
+                }
+                if (maybe.task_id != rec.task_id) {
+                    LOG_ERROR(
+                        "dep_gen replay: orphan overflow at rec_idx=%zu (expected task_id=%" PRIu64 ", found %" PRIu64
+                        ")",
+                        j, rec.task_id, maybe.task_id
+                    );
+                    break;
+                }
+                const auto *over = reinterpret_cast<const DepGenOverflowRecord *>(&maybe);
+                uint16_t over_dc = over->dep_count;
+                if (over_dc > DEP_GEN_OVERFLOW_DEPS_PER_RECORD) {
+                    LOG_ERROR(
+                        "dep_gen replay: clamping overflow dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                        over_dc, DEP_GEN_OVERFLOW_DEPS_PER_RECORD, j, rec.task_id
+                    );
+                    over_dc = DEP_GEN_OVERFLOW_DEPS_PER_RECORD;
+                }
+                full_deps_buf.insert(full_deps_buf.end(), over->deps, over->deps + over_dc);
+                if (over->flags & DEP_GEN_FLAG_LAST_OVERFLOW) {
+                    chain_complete = true;
+                    break;
+                }
+            }
+            if (!chain_complete) {
+                LOG_ERROR(
+                    "dep_gen replay: chain for task_id=%" PRIu64 " missing LAST_OVERFLOW marker — "
+                    "using partial dep list (%zu deps)",
+                    rec.task_id, full_deps_buf.size()
+                );
+            }
+            deps_data = full_deps_buf.data();
+            dc = static_cast<int32_t>(full_deps_buf.size());
+        } else {
+            deps_data = rec.explicit_deps;
+            uint16_t base_dc = rec.explicit_dep_count;
+            if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) {
+                LOG_ERROR(
+                    "dep_gen replay: clamping no-chain explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                    base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id
+                );
+                base_dc = DEP_GEN_MAX_EXPLICIT_DEPS;
+            }
+            dc = static_cast<int32_t>(base_dc);
+        }
+
+        DepInputs inputs;
+        inputs.tensor_count = tc;
+        inputs.tensors = tref_buf;
+        inputs.arg_types = atype_buf;
+        inputs.explicit_dep_count = dc;
+        inputs.explicit_deps = reinterpret_cast<const PTO2TaskId *>(deps_data);
+
+        // Register tasks[] entry (with per-arg slot info) and any unseen
+        // tensors[] entries up-front. Tensors are registered from the
+        // consumer-side blob so raw_shapes / dtype are populated (the
+        // producer-side PTO2TensorMapEntry drops raw_shapes to fit in two
+        // cache lines).
+        TaskTableEntry task_entry;
+        task_entry.task_id = rec.task_id;
+        task_entry.in_manual_scope = in_manual_scope;
+        task_entry.kernel_id[0] = rec.kernel_id[0];
+        task_entry.kernel_id[1] = rec.kernel_id[1];
+        task_entry.kernel_id[2] = rec.kernel_id[2];
+        task_entry.args.reserve(tc);
+        for (int32_t i = 0; i < tc; i++) {
+            TaskArgEntry slot{};
+            slot.idx = i;
+            slot.arg_type = atype_buf[i];
+            if (atype_buf[i] == TensorArgType::OUTPUT) {
+                // OUTPUT blob is zero at submit time (writer has no Tensor
+                // yet); leave has_tensor_info=false. Viewers render this as
+                // a placeholder "alloc" output slot.
+                slot.has_tensor_info = false;
+            } else {
+                const Tensor &t = tref_buf[i].ref();
+                register_tensor(tensor_index, tensor_table, t);
+                slot.has_tensor_info = true;
+                slot.tensor_id = make_tensor_id(t.buffer.addr, t.version);
+                slot.dtype = static_cast<uint8_t>(t.dtype);
+                slot.ndims = t.ndims;
+                slot.start_offset = t.start_offset;
+                for (uint32_t d = 0; d < t.ndims && d < MAX_TENSOR_DIMS; d++) {
+                    slot.shape[d] = t.shapes[d];
+                    slot.strides[d] = t.strides[d];
+                }
+            }
+            task_entry.args.push_back(slot);
+        }
+        task_table.push_back(std::move(task_entry));
+
+        // ============ STEP 1 — explicit_deps (call-site emit) ============
+        // Same loop on both passes; they MUST produce identical sets here
+        // because they read the same record. Annot records explicit edges
+        // with consumer_arg_idx = -1 (not tied to any tensor arg). Reads
+        // from deps_data (base record's explicit_deps[] on fast path, the
+        // gathered base+chain buffer on overflow path).
+        for (int32_t i = 0; i < dc; i++) {
+            uint64_t pred_raw = deps_data[i];
+            if (oracle_preds.insert(pred_raw).second) {
+                // First time this pred is seen at runtime call site.
+            }
+            if (annot_preds.insert(pred_raw).second) {
+                EdgeAnnot e{};
+                e.pred = pred_raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = -1;
+                e.source = EdgeSource::EXPLICIT;
+                annot_edges.push_back(e);
+            }
+        }
+
+        // ============ ORACLE pass — drive compute_task_fanin ============
+        bool ok = compute_task_fanin(inputs, tm_oracle, in_manual_scope, [&](PTO2TaskId producer) -> bool {
+            oracle_preds.insert(producer.raw);
+            return true;
+        });
+        if (!ok) {
+            LOG_ERROR("dep_gen replay: compute_task_fanin returned fatal at task_id=%" PRIu64, rec.task_id);
+            tm_oracle.destroy();
+            tm_annot.destroy();
+            return -4;
+        }
+
+        // ============ ANNOT pass — inline mirror, full entry capture ============
+        annot_pass(
+            inputs, tm_annot, in_manual_scope,
+            // emit_creator(producer, arg_idx, consumer_tensor)
+            [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer) {
+                if (!annot_preds.insert(producer.raw).second) {
+                    return;  // already covered by an earlier emit on this record
+                }
+                EdgeAnnot e{};
+                e.pred = producer.raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = arg_idx;
+                e.source = EdgeSource::CREATOR;
+                e.tensor_id = make_tensor_id(consumer.buffer.addr, consumer.version);
+                fill_consumer(e, consumer);
+                annot_edges.push_back(e);
+            },
+            // emit_tensormap(producer, arg_idx, consumer_tensor, entry, status)
+            [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer, const PTO2TensorMapEntry &entry,
+                OverlapStatus status) {
+                // Per-(succ, arg_idx, producer_buffer_addr, producer_version)
+                // dedup gives us "the same producer slice fired twice for the
+                // same consumer arg" collapse — but two distinct slices from
+                // the same producer (different version), or two different
+                // producers, both yield their own edges. The producer-id-set
+                // comparison below uses annot_preds, which dedups by pred
+                // only, matching runtime PTO2FaninBuilder semantics.
+                annot_preds.insert(producer.raw);
+                EdgeAnnot e{};
+                e.pred = producer.raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = arg_idx;
+                e.source = EdgeSource::TENSORMAP;
+                e.overlap = status;
+                e.tensor_id = make_tensor_id(entry.buffer_addr, entry.version);
+                fill_consumer(e, consumer);
+                fill_producer(e, entry);
+                annot_edges.push_back(e);
+            }
+        );
+
+        // ============ Differential check ============
+        if (oracle_preds != annot_preds) {
+            LOG_ERROR(
+                "dep_gen replay: DIVERGENCE at task_id=%" PRIu64 " (rec_idx=%zu): oracle has %zu preds, annot has %zu",
+                rec.task_id, rec_i, oracle_preds.size(), annot_preds.size()
+            );
+            // Log the symmetric difference for debugging.
+            for (uint64_t p : oracle_preds) {
+                if (annot_preds.find(p) == annot_preds.end()) {
+                    LOG_ERROR("  only-in-oracle pred: %" PRIu64, p);
+                }
+            }
+            for (uint64_t p : annot_preds) {
+                if (oracle_preds.find(p) == oracle_preds.end()) {
+                    LOG_ERROR("  only-in-annot  pred: %" PRIu64, p);
+                }
+            }
+            tm_oracle.destroy();
+            tm_annot.destroy();
+            return -6;
+        }
+
+        // ============ STEP 4 — publish outputs on BOTH maps ============
+        register_task_outputs(inputs, task_id, tm_oracle, in_manual_scope);
+        register_task_outputs(inputs, task_id, tm_annot, in_manual_scope);
+    }
+
+    tm_oracle.destroy();
+    tm_annot.destroy();
+
+    if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) {
+        return -5;
+    }
+    LOG_INFO_V0(
+        "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(),
+        tensor_table.size(), annot_edges.size()
+    );
+    return 0;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h
new file mode 100644
index 000000000..49cc2331c
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file dep_gen_replay.h
+ * @brief Host-side replay of in-memory DepGenRecord stream → deps.json.
+ *
+ * Takes the records the host collector drained from the device ring buffer
+ * (``DepGenCollector::records()``) and runs them back through a host-resident
+ * PTO2TensorMap using the same ``compute_task_fanin`` / ``register_task_outputs``
+ * primitives the device orchestrator uses, emitting the full
+ * predecessor → successor edge list to deps.json.
+ *
+ * The records buffer is passed in directly — there is no intermediate
+ * ``submit_trace.bin`` on disk. The host already has the records once the
+ * device run completes, so going through the filesystem would just be
+ * extra I/O and an extra file in the output directory.
+ *
+ * deps.json is the sole source of truth for fanout: the L2 swimlane hot
+ * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task
+ * 1 KB GM store off the scheduler critical path). Replay sees every
+ * submit and reconstructs the complete dependency graph.
+ *
+ * Output format (deps.json, strided tensor representation):
+ *
+ *   {"tasks":   [{"task_id":<u64>, "scope":"auto|manual",
+ *                 "args":[{"idx":<i32>, "type":"<arg_type>",
+ *                          "tensor_id":<u64>, "dtype":"...", "shape":[...],
+ *                          "start_offset":<u64>, "strides":[...]}, ...]}, ...],
+ *    "tensors": [{"tensor_id":<u64>, "buffer_addr":<u64>, "version":<i32>,
+ *                 "dtype":"FLOAT32", "buffer_numel":<u64>}, ...],
+ *    "edges":   [{"pred":<u64>, "succ":<u64>, "arg":<i32>,
+ *                 "source":"explicit|creator|tensormap",
+ *                 "overlap":"covered|other" (tensormap only),
+ *                 "tensor_id":<u64> (non-explicit),
+ *                 "consumer_dtype":"...", "consumer_shape":[...],
+ *                 "consumer_start_offset":<u64>, "consumer_strides":[...],
+ *                 "producer_shape":[...] (tensormap),
+ *                 "producer_start_offset":<u64> (tensormap),
+ *                 "producer_strides":[...] (tensormap)},
+ *                ...]}
+ *
+ *   - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``).
+ *   - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``.
+ *   - ``buffer_numel`` is the underlying storage element count; tensor shapes
+ *     are carried per-arg / per-edge alongside ``start_offset`` + ``strides``.
+ *   - Distinct producers / arg indices / sources keep their own edges; per-record
+ *     deduplication of producer ids mirrors the runtime
+ *     ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of
+ *     ``(pred, succ)`` pairs is identical to what the runtime would have
+ *     recorded.
+ *
+ * Self-checking: the replay runs two parallel tensormap instances per record —
+ * an "oracle" map driven by the canonical ``compute_task_fanin`` template, and
+ * an "annotated" map driven by an inlined mirror that captures the per-edge
+ * tensor metadata. If the producer-id set on the two passes ever diverges,
+ * deps.json is NOT written and the function returns a non-zero error code.
+ * This is the guarantee against silent shotgun modifications: anyone who
+ * changes ``compute_task_fanin`` semantics has to mirror the change here too
+ * or the gate fires immediately.
+ *
+ * The replay is single-threaded and pure CPU: no device handle is required.
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Opaque forward decl — the canonical layout lives in common/dep_gen.h, but
+// replay's API only needs to take a pointer + count. Callers who construct
+// the buffer must include common/dep_gen.h themselves.
+struct DepGenRecord;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Replay an in-memory DepGenRecord stream and write deps.json.
+ *
+ * Per-ring task window sizes are auto-derived from the trace itself so each
+ * ring's window covers its observed max local_id without slot aliasing.
+ *
+ * @param records            Pointer to a contiguous DepGenRecord array
+ *                           (typically ``DepGenCollector::records().data()``).
+ * @param num_records        Number of records in the array.
+ * @param deps_json_path     Output path; truncated if it exists.
+ * @return 0 on success; negative on error (see source for codes).
+ */
+int dep_gen_replay_emit_deps_json(const struct DepGenRecord *records, size_t num_records, const char *deps_json_path);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp b/src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp
new file mode 100644
index 000000000..dfc5590c1
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "host/platform_compile_info.h"
+#include "host/runtime_compile_info.h"
+#include <string.h>
+
+extern "C" {
+
+ToolchainType get_incore_compiler(void) {
+    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_CCEC;
+    return TOOLCHAIN_HOST_GXX_15;
+}
+
+ToolchainType get_orchestration_compiler(void) {
+    // tensormap_and_ringbuffer: a2a3 needs aarch64 cross-compile (AICPU is aarch64)
+    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_AARCH64_GXX;
+    return TOOLCHAIN_HOST_GXX;
+}
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp b/src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp
new file mode 100644
index 000000000..b95411a6c
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Builder - rt2 Implementation (Device Orchestration)
+ *
+ * Provides init_runtime_impl and validate_runtime_impl functions for rt2 runtime.
+ * Supports device orchestration where AICPU thread 3 runs the orchestrator.
+ *
+ * init_runtime_impl:
+ *   - Converts host tensor pointers to device pointers (all inputs copied H2D;
+ *     only OUTPUT/INOUT tensors are copied back D2H)
+ *   - Copies orchestration SO to device memory
+ *   - Sets up runtime state for device orchestration
+ *
+ * validate_runtime_impl:
+ *   - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs
+ *     are skipped)
+ *   - Frees device memory
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include <cerrno>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <string>
+
+#include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
+#include "../runtime/pto_shared_memory.h"
+#include "../runtime/runtime.h"
+#include "../../../../common/task_interface/call_config.h"
+#include "utils/device_arena.h"
+#include "callable.h"
+#include "common/platform_config.h"
+#include "common/unified_log.h"
+#include "prepare_callable_common.h"
+
+static_assert(
+    RUNTIME_ENV_RING_COUNT == PTO2_MAX_RING_DEPTH, "RuntimeEnv ring count must match PTO2 runtime ring depth"
+);
+
+// Helper: return current time in milliseconds
+static int64_t _now_ms() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<int64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
+}
+
+static bool is_power_of_2_u64(uint64_t value) { return value != 0 && (value & (value - 1)) == 0; }
+
+template <typename T>
+static std::string format_ring_array(const T (&values)[PTO2_MAX_RING_DEPTH]) {
+    std::string out = "[";
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        if (r != 0) {
+            out += ", ";
+        }
+        out += std::to_string(values[r]);
+    }
+    out += "]";
+    return out;
+}
+
+static std::string trim_copy(const std::string &input) {
+    size_t begin = 0;
+    while (begin < input.size() && std::isspace(static_cast<unsigned char>(input[begin]))) {
+        ++begin;
+    }
+    size_t end = input.size();
+    while (end > begin && std::isspace(static_cast<unsigned char>(input[end - 1]))) {
+        --end;
+    }
+    return input.substr(begin, end - begin);
+}
+
+static bool parse_uint_token(
+    const char *name, const std::string &raw, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t *out
+) {
+    std::string token = trim_copy(raw);
+    if (token.empty()) {
+        LOG_WARN("%s has an empty value in '%s', ignored", name, raw.c_str());
+        return false;
+    }
+
+    if (token[0] == '-') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    char *endptr = nullptr;
+    errno = 0;
+    unsigned long long parsed = std::strtoull(token.c_str(), &endptr, 10);
+    if (errno == ERANGE || endptr == token.c_str() || *endptr != '\0') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    uint64_t val = static_cast<uint64_t>(parsed);
+
+    if (val < min_val || val > max_val) {
+        LOG_WARN(
+            "%s=%s invalid (must be in [%" PRIu64 ", %" PRIu64 "]), ignored", name, token.c_str(), min_val, max_val
+        );
+        return false;
+    }
+    if (require_power_of_2 && !is_power_of_2_u64(val)) {
+        LOG_WARN("%s=%s invalid (must be a power of 2), ignored", name, token.c_str());
+        return false;
+    }
+    *out = val;
+    return true;
+}
+
+static void apply_env_ring_values(
+    const char *name, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t out[PTO2_MAX_RING_DEPTH]
+) {
+    const char *env = std::getenv(name);
+    if (!env) return;
+
+    std::string text(env);
+    if (text.find(',') == std::string::npos) {
+        uint64_t value = 0;
+        if (!parse_uint_token(name, text, min_val, max_val, require_power_of_2, &value)) {
+            return;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            out[r] = value;
+        }
+        return;
+    }
+
+    uint64_t parsed[PTO2_MAX_RING_DEPTH]{};
+    size_t pos = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size_t comma = text.find(',', pos);
+        std::string token = text.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+        if (!parse_uint_token(name, token, min_val, max_val, require_power_of_2, &parsed[r])) {
+            return;
+        }
+        if (comma == std::string::npos) {
+            if (r != PTO2_MAX_RING_DEPTH - 1) {
+                LOG_WARN(
+                    "%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env,
+                    PTO2_MAX_RING_DEPTH
+                );
+                return;
+            }
+            pos = text.size();
+        } else {
+            pos = comma + 1;
+        }
+    }
+    if (pos < text.size() || (!text.empty() && text.back() == ',')) {
+        LOG_WARN("%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env, PTO2_MAX_RING_DEPTH);
+        return;
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        out[r] = parsed[r];
+    }
+}
+
+static bool resolve_ring_config(
+    uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool, const uint64_t *ring_task_windows,
+    const uint64_t *ring_heaps, const uint64_t *ring_dep_pools, uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH],
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH], int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    uint64_t dep_pool_values[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        eff_task_window_sizes[r] = PTO2_TASK_WINDOW_SIZE;
+        eff_heap_sizes[r] = PTO2_HEAP_SIZE;
+        dep_pool_values[r] = PTO2_DEP_LIST_POOL_SIZE;
+    }
+
+    apply_env_ring_values("PTO2_RING_TASK_WINDOW", 4, static_cast<uint64_t>(INT32_MAX), true, eff_task_window_sizes);
+    apply_env_ring_values("PTO2_RING_HEAP", 1024, std::numeric_limits<uint64_t>::max(), false, eff_heap_sizes);
+    apply_env_ring_values("PTO2_RING_DEP_POOL", 4, static_cast<uint64_t>(INT32_MAX), false, dep_pool_values);
+
+    if (ring_task_window != 0) {
+        if (ring_task_window < 4 || ring_task_window > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(ring_task_window)) {
+            LOG_ERROR(
+                "runtime_env.ring_task_window=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", ring_task_window
+            );
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            eff_task_window_sizes[r] = ring_task_window;
+        }
+    }
+    if (ring_heap != 0) {
+        if (ring_heap < 1024) {
+            LOG_ERROR("runtime_env.ring_heap=%" PRIu64 " must be >= 1024", ring_heap);
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            eff_heap_sizes[r] = ring_heap;
+        }
+    }
+    if (ring_dep_pool != 0) {
+        if (ring_dep_pool < 4 || ring_dep_pool > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("runtime_env.ring_dep_pool=%" PRIu64 " must be in [4, INT32_MAX]", ring_dep_pool);
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            dep_pool_values[r] = ring_dep_pool;
+        }
+    }
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (ring_task_windows != nullptr && ring_task_windows[r] != 0) {
+            eff_task_window_sizes[r] = ring_task_windows[r];
+        }
+        if (ring_heaps != nullptr && ring_heaps[r] != 0) {
+            eff_heap_sizes[r] = ring_heaps[r];
+        }
+        if (ring_dep_pools != nullptr && ring_dep_pools[r] != 0) {
+            dep_pool_values[r] = ring_dep_pools[r];
+        }
+
+        if (eff_task_window_sizes[r] < 4 || eff_task_window_sizes[r] > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(eff_task_window_sizes[r])) {
+            LOG_ERROR(
+                "ring_task_windows[%d]=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", r, eff_task_window_sizes[r]
+            );
+            return false;
+        }
+        if (eff_heap_sizes[r] < 1024) {
+            LOG_ERROR("ring_heaps[%d]=%" PRIu64 " must be >= 1024", r, eff_heap_sizes[r]);
+            return false;
+        }
+        if (dep_pool_values[r] < 4 || dep_pool_values[r] > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("ring_dep_pools[%d]=%" PRIu64 " must be in [4, INT32_MAX]", r, dep_pool_values[r]);
+            return false;
+        }
+        eff_dep_pool_capacities[r] = static_cast<int32_t>(dep_pool_values[r]);
+    }
+
+    return true;
+}
+
+static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
+    if (runtime == nullptr || host_header == nullptr) {
+        return 0;
+    }
+
+    void *sm_ptr = runtime->get_gm_sm_ptr();
+    if (sm_ptr == nullptr) {
+        return 0;
+    }
+
+    int hdr_rc = runtime->host_api.copy_from_device(host_header, sm_ptr, sizeof(PTO2SharedMemoryHeader));
+    if (hdr_rc != 0) {
+        LOG_WARN("Failed to copy PTO2 header from device");
+        return 0;
+    }
+
+    int32_t orch_error_code = host_header->orch_error_code.load(std::memory_order_relaxed);
+    int32_t sched_error_code = host_header->sched_error_code.load(std::memory_order_relaxed);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);
+}
+
+/**
+ * Stage the per-callable resources (kernel binaries + orchestration SO) into
+ * the supplied runtime so a subsequent bind_callable_to_runtime_impl can use
+ * them. This is the cacheable half of init_runtime_impl: nothing here depends
+ * on per-run argument values, so the prepare_callable / run_prepared split
+ * lets us run this once per callable_id and amortize across runs.
+ *
+ * @param runtime   Pointer to pre-constructed Runtime (host_api populated)
+ * @param callable  ChipCallable carrying the orch SO + child kernel binaries
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int
+prepare_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const void *), CallableArtifacts *out) {
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
+    if (upload_fn == nullptr || out == nullptr) {
+        LOG_ERROR("upload_fn or out is null");
+        return -1;
+    }
+    *out = CallableArtifacts{};
+    out->signature.assign(callable->signature_, callable->signature_ + callable->sig_count());
+
+    LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
+    if (upload_and_collect_child_addrs(callable, upload_fn, &out->kernel_addrs) != 0) {
+        LOG_ERROR("Failed to upload ChipCallable buffer");
+        return -1;
+    }
+    for (const ChildKernelAddr &c : out->kernel_addrs) {
+        if (c.func_id < 0 || c.func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d)", c.func_id, RUNTIME_MAX_FUNC_ID);
+            return -1;
+        }
+    }
+
+    const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
+    size_t orch_so_size = callable->binary_size();
+
+    if (orch_so_binary == nullptr || orch_so_size == 0) {
+        LOG_ERROR("Orchestration SO binary is required for device orchestration");
+        return -1;
+    }
+
+    out->orch_so_data = orch_so_binary;
+    out->orch_so_size = orch_so_size;
+    out->func_name = callable->func_name();
+    out->config_name = callable->config_name();
+    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
+    return 0;
+}
+
+/**
+ * Per-run binding: build device-side argument storage (tensor copy-out, GM
+ * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
+ * callable-side state (kernel binaries, orch SO bytes, func/config names)
+ * is already populated by prepare_callable_impl.
+ *
+ * Splitting this from prepare_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
+ *
+ * @param runtime    Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args  Separated tensor/scalar arguments for this run
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int bind_callable_to_runtime_impl(
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
+    int sig_count, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool,
+    const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools
+) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+    if (orch_args == nullptr) {
+        LOG_ERROR("orch_args pointer is null");
+        return -1;
+    }
+    // trb runs orchestration on the device — there is no host-side orch
+    // function pointer to invoke. The c_api signature accepts one for
+    // symmetry with hbg; assert the trb-side invariant here.
+    if (host_orch_func_ptr != nullptr) {
+        LOG_ERROR("bind_callable_to_runtime_impl: trb does not accept a host_orch_func_ptr");
+        return -1;
+    }
+
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
+
+    int64_t t_total_start = _now_ms();
+
+    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    if (!resolve_ring_config(
+            ring_task_window, ring_heap, ring_dep_pool, ring_task_windows, ring_heaps, ring_dep_pools,
+            eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities
+        )) {
+        return -1;
+    }
+    const std::string task_window_log = format_ring_array(eff_task_window_sizes);
+    const std::string heap_log = format_ring_array(eff_heap_sizes);
+    const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities);
+    LOG_INFO_V0(
+        "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
+        dep_pool_log.c_str()
+    );
+
+    // Build device args: copy from input, replace host tensor pointers with device pointers
+    ChipStorageTaskArgs device_args;
+
+    int64_t t_args_start = _now_ms();
+    for (int i = 0; i < tensor_count; i++) {
+        Tensor t = orch_args->tensor(i);
+
+        if (t.is_child_memory()) {
+            LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
+            device_args.add_tensor(t);
+            continue;
+        }
+
+        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.buffer.addr));
+        size_t size = static_cast<size_t>(t.nbytes());
+
+        void *dev_ptr = runtime->host_api.device_malloc(size);
+        if (dev_ptr == nullptr) {
+            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
+            return -1;
+        }
+
+        // Pure write-only OUTPUT buffers carry no meaningful host content, so
+        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
+        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
+        // rather than pooled-allocator garbage. INOUT (read-before-write)
+        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
+        // did not wire device_memset.
+        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
+        int rc;
+        if (is_pure_output && runtime->host_api.device_memset != nullptr) {
+            rc = runtime->host_api.device_memset(dev_ptr, 0, size);
+        } else {
+            rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
+        }
+        if (rc != 0) {
+            LOG_ERROR("Failed to stage tensor %d to device", i);
+            runtime->host_api.device_free(dev_ptr);
+            return -1;
+        }
+        // Read-only INPUT tensors are never written by the kernel, so there is
+        // no point copying them back D2H at the end. Index the signature
+        // by the orch tensor index `i` (child_memory tensors are skipped above
+        // but do not consume a separate signature slot — scalars follow the
+        // tensor entries). Anything not provably IN keeps the safe default of
+        // copying back.
+        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
+
+        t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
+        device_args.add_tensor(t);
+    }
+    for (int i = 0; i < scalar_count; i++) {
+        device_args.add_scalar(orch_args->scalar(i));
+    }
+    int64_t t_args_end = _now_ms();
+
+    // Read orchestrator-to-scheduler transition flag from environment
+    {
+        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
+        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
+            runtime->orch_to_sched = true;
+        }
+        LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
+    }
+
+    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena all live in a single backing allocation;
+    // setup_static_arena reserves the three regions and commits in one shot.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
+    uint64_t total_heap_size = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (eff_heap_sizes[r] > std::numeric_limits<uint64_t>::max() - total_heap_size) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return -1;
+        }
+        total_heap_size += eff_heap_sizes[r];
+    }
+    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes);
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
+    int64_t t_setup_start = _now_ms();
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
+        LOG_ERROR("Failed to setup pooled static arena");
+        return -1;
+    }
+    int64_t t_setup_end = _now_ms();
+
+    int64_t t_heap_start = _now_ms();
+    void *gm_heap = runtime->host_api.acquire_pooled_gm_heap();
+    int64_t t_heap_end = _now_ms();
+    if (gm_heap == nullptr) {
+        LOG_ERROR("Failed to acquire pooled GM heap");
+        return -1;
+    }
+    runtime->set_gm_heap(gm_heap);
+
+    int64_t t_sm_start = _now_ms();
+    void *sm_ptr = runtime->host_api.acquire_pooled_gm_sm();
+    int64_t t_sm_end = _now_ms();
+    if (sm_ptr == nullptr) {
+        LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
+        return -1;
+    }
+    runtime->set_gm_sm_ptr(sm_ptr);
+
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
+    // Set up device orchestration state
+    runtime->set_orch_args(device_args);
+
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
+    LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
+
+    int64_t t_total_end = _now_ms();
+    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
+    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
+    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
+    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
+    LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
+
+    return 0;
+}
+
+/**
+ * Validate runtime results and cleanup.
+ *
+ * This function:
+ * 1. Copies recorded tensors from device back to host
+ * 2. Frees device memory for recorded tensors
+ * 3. Clears tensor pair state
+ *
+ * @param runtime  Pointer to Runtime
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int validate_runtime_impl(Runtime *runtime) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+
+    int rc = 0;
+
+    LOG_INFO_V0("=== Copying Results Back to Host ===");
+
+    // Copy all recorded tensors from device back to host
+    TensorPair *tensor_pairs = runtime->tensor_pairs_.data();
+    int tensor_pair_count = static_cast<int>(runtime->tensor_pairs_.size());
+
+    LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count);
+
+    // PTO2 (device orchestration): graph output may be in packed buffer
+    uint64_t graph_out_ptr = 0;
+    uint64_t graph_out_size = 0;
+    bool skip_tensor_copy_back = false;
+    int32_t runtime_status = 0;
+    PTO2SharedMemoryHeader host_header;
+    memset(&host_header, 0, sizeof(host_header));
+
+    runtime_status = read_runtime_status(runtime, &host_header);
+    if (runtime_status != 0) {
+        int32_t orch_error_code = host_header.orch_error_code.load(std::memory_order_relaxed);
+        int32_t sched_error_code = host_header.sched_error_code.load(std::memory_order_relaxed);
+        LOG_ERROR(
+            "PTO2 runtime failed: orch_error_code=%d sched_error_code=%d runtime_status=%d", orch_error_code,
+            sched_error_code, runtime_status
+        );
+        skip_tensor_copy_back = true;
+    } else {
+        graph_out_ptr = host_header.graph_output_ptr;
+        graph_out_size = host_header.graph_output_size;
+        if (graph_out_ptr != 0) {
+            LOG_INFO_V0("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size);
+        }
+    }
+
+    if (skip_tensor_copy_back) {
+        LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status");
+    } else {
+        bool first_output_tensor = true;
+        for (int i = 0; i < tensor_pair_count; i++) {
+            const TensorPair &pair = tensor_pairs[i];
+
+            // Skip if device pointer is null
+            if (pair.dev_ptr == nullptr) {
+                LOG_WARN("Tensor %d has null device pointer, skipping", i);
+                continue;
+            }
+
+            // If host pointer is null, this is a device-only allocation (no copy-back)
+            if (pair.host_ptr == nullptr) {
+                LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i);
+                continue;
+            }
+
+            // Read-only INPUT tensors were uploaded H2D but the kernel never
+            // wrote them — copying them back (potentially ~GB) is pure waste.
+            // They are still device_free'd in the cleanup loop below.
+            if (!pair.needs_copy_back) {
+                LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i);
+                continue;
+            }
+
+            void *src_ptr = pair.dev_ptr;
+            size_t copy_size = pair.size;
+
+            // Use graph_output_ptr for the first output tensor if available
+            if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) {
+                src_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(graph_out_ptr));
+                copy_size = static_cast<size_t>(graph_out_size);
+                LOG_INFO_V0("Using packed output buffer for tensor %d", i);
+                first_output_tensor = false;
+            }
+
+            int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size);
+            if (copy_rc != 0) {
+                LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
+                rc = copy_rc;
+            } else {
+                LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, pair.size);
+            }
+        }
+    }
+
+    // Cleanup device tensors
+    LOG_INFO_V0("=== Cleaning Up ===");
+    for (int i = 0; i < tensor_pair_count; i++) {
+        if (tensor_pairs[i].dev_ptr != nullptr) {
+            runtime->host_api.device_free(tensor_pairs[i].dev_ptr);
+        }
+    }
+    LOG_INFO_V0("Freed %d device allocations", tensor_pair_count);
+
+    // Clear the per-run dispatch-table entries staged by prepare_callable_impl.
+    // The underlying chip-callable device buffer is pool-managed by
+    // DeviceRunner (keyed by content hash) and bulk-freed in
+    // DeviceRunner::finalize().
+    int kernel_count = runtime->get_registered_kernel_count();
+    for (int i = 0; i < kernel_count; i++) {
+        int func_id = runtime->get_registered_kernel_func_id(i);
+        runtime->set_function_bin_addr(func_id, 0);
+    }
+    if (kernel_count > 0) {
+        LOG_INFO_V0("Cleared %d kernel dispatch-table entries", kernel_count);
+    }
+    runtime->clear_registered_kernels();
+
+    // Clear tensor pairs
+    runtime->tensor_pairs_.clear();
+
+    LOG_INFO_V0("=== Finalize Complete ===");
+
+    if (rc == 0 && runtime_status != 0) {
+        rc = runtime_status;
+    }
+
+    return rc;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp b/src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp
new file mode 100644
index 000000000..c4878a1c2
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "common.h"
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
+
+#include <array>
+#include <cstring>
+#include <vector>
+#endif
+
+struct PTO2Runtime;
+
+// Unified-log error sink. Forward-declared here rather than pulled via
+// common/unified_log.h: that header lives under common/log/include, which is
+// not on the orchestration .so build's include path. The symbol resolves at
+// link time for the runtime targets, and at dlopen time for the orchestration
+// .so (against the executor's unified_log_device), so onboard diagnostics still
+// reach the CANN device log.
+extern "C" void unified_log_error(const char *func, const char *fmt, ...);
+
+namespace {
+// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
+// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
+// between execution rounds.  All orchestrator threads bind the same rt
+// value, so per-thread storage is unnecessary.
+PTO2Runtime *g_current_runtime = nullptr;
+}  // namespace
+
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
+    g_current_runtime = rt;
+}
+
+// Keep current_runtime local to this .so so orchestration helpers do not
+// accidentally bind to the AICPU binary's same-named symbol.
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
+
+/**
+ * Use addr2line to convert an address to file:line information.
+ * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
+ * If inlining is present, also returns the outer call chain via inline_chain.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+
+    FILE *pipe = popen(cmd, "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
+        return "";
+    }
+
+    // Split by lines
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size()) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r')
+            line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    // First line is the innermost actual code location; subsequent lines are outer inline callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+
+    return lines.front();
+}
+#endif
+
+/**
+ * Get current stack trace information (including file paths and line numbers).
+ * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
+ */
+std::string get_stacktrace(int skip_frames) {
+    (void)skip_frames;  // May be unused on non-Linux platforms
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;
+    void *buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char **symbols = backtrace_symbols(buffer, nframes);
+
+    if (symbols) {
+        result = "Stack trace:\n";
+        for (int i = skip_frames; i < nframes; i++) {
+            std::string frame_info;
+
+            void *addr = (void *)((char *)buffer[i] - 1);
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
+                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+
+                if (addr2line_result.empty()) {
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+
+            if (frame_info.empty()) {
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);
+    }
+#else
+    result = "(Stack trace is only available on Linux)\n";
+#endif
+    return result;
+}
+
+// AssertionError constructor
+static std::string build_assert_message(const char *condition, const char *file, int line) {
+    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
+    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
+    msg += get_stacktrace(3);
+    return msg;
+}
+
+AssertionError::AssertionError(const char *condition, const char *file, int line) :
+    std::runtime_error(build_assert_message(condition, file, line)),
+    condition_(condition),
+    file_(file),
+    line_(line) {}
+
+[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
+    // Use unified_log_error directly rather than the LOG_ERROR macro: that macro
+    // lives in pto_orchestration_api.h and expands to
+    // current_runtime()->ops->log_error, but the ops table's definition pulls in
+    // pto_types.h (Arg → __aicore__-only to_u64), which the AICore build of this
+    // TU cannot compile. unified_log_error reaches the same sink without that
+    // dependency.
+    unified_log_error(__FUNCTION__, "\n========================================");
+    unified_log_error(__FUNCTION__, "Assertion failed: %s", condition);
+    unified_log_error(__FUNCTION__, "Location: %s:%d", file, line);
+    unified_log_error(__FUNCTION__, "%s", get_stacktrace(2).c_str());
+    unified_log_error(__FUNCTION__, "========================================\n");
+
+    throw AssertionError(condition, file, line);
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h
new file mode 100644
index 000000000..ed2f03989
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Convenience layer over Arg: bundles a fixed-capacity dependency buffer with
+ * an Arg and exposes an incremental add_dep(...) API on top of the runtime
+ * primitive L0TaskArgs::set_dependencies(ptr, count).
+ *
+ * Layering:
+ *   - Primitive:   Arg + set_dependencies(ptr, count) in pto_types.h.
+ *                  No cap, caller owns the deps buffer.
+ *   - Convenience: L0TaskArgsWithDeps<N> in this header. Owns a stack-sized dep
+ *                  buffer of capacity N (default 16); provides add_dep().
+ *                  Submitted via the rt_submit_*_task overloads below, which
+ *                  forward the bundled deps into the underlying Arg.
+ *
+ * This file is auto-included at the bottom of pto_orchestration_api.h so
+ * orchestration sources see L0TaskArgsWithDeps after a single `#include
+ * "pto_orchestration_api.h"`. The split is purely organizational —
+ * orchestration code should not include this header directly. Code generated
+ * from pypto can ignore the convenience layer entirely and target Arg +
+ * set_dependencies(ptr, count) directly.
+ *
+ * L0TaskArgsWithDeps uses private inheritance from Arg so that set_dependencies and
+ * the explicit_dep* accessors are NOT reachable on a wrapper instance — users
+ * who pick the convenience layer cannot accidentally mix it with the
+ * primitive layer's dep API on the same object.
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "pto_orchestration_api.h"  // Arg, MixedKernels, rt_submit_* primitives
+
+template <size_t MAX_DEP_COUNT = 16>
+class L0TaskArgsWithDeps : private L0TaskArgs {
+public:
+    // Tensor / scalar setters — forward to Arg
+    using L0TaskArgs::add_inout;
+    using L0TaskArgs::add_input;
+    using L0TaskArgs::add_no_dep;
+    using L0TaskArgs::add_output;
+    using L0TaskArgs::add_scalar;
+    using L0TaskArgs::add_scalars;
+    using L0TaskArgs::add_scalars_i32;
+    using L0TaskArgs::copy_scalars_from;
+
+    // Error / status — forward to Arg
+    using L0TaskArgs::error_msg;
+    using L0TaskArgs::has_error;
+    using L0TaskArgs::launch_spec;
+    using L0TaskArgs::set_error;
+
+    // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep,
+    // explicit_deps_data — these are the primitive-layer dep API. Users of
+    // the convenience layer reach dependencies only through add_dep() below.
+
+    /**
+     * Append one or more dependencies to the bundled buffer. May be called
+     * multiple times; deps accumulate. Variadic accepts any non-zero number
+     * of PTO2TaskId arguments.
+     *
+     * Overflow (more than MAX_DEP_COUNT total) records an error on the
+     * underlying Arg; the error surfaces at submit time.
+     */
+    template <typename... Ids>
+    void add_dep(Ids... ids) {
+        static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required");
+        static_assert(
+            (std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId"
+        );
+        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) {
+            L0TaskArgs::set_error(
+                "L0TaskArgsWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)"
+            );
+            return;
+        }
+        ((deps_[count_++] = ids), ...);
+    }
+
+    /**
+     * Clear the bundled dep buffer and reset the underlying Arg.
+     * Use this to recycle an L0TaskArgsWithDeps across loop iterations.
+     */
+    void reset() {
+        L0TaskArgs::reset();
+        count_ = 0;
+    }
+
+    /**
+     * Submit-only hook: bind the bundled deps onto the underlying Arg and
+     * return it as Arg&. Called by the rt_submit_*_task overloads below;
+     * orchestration code does not invoke this directly.
+     *
+     * Idempotent: explicitly clears any prior dep binding before re-setting,
+     * so a wrapper can be re-finalized (e.g. resubmitted) without tripping
+     * the primitive layer's single-shot check.
+     */
+    L0TaskArgs &finalize_for_submit() {
+        L0TaskArgs::set_dependencies(nullptr, 0);
+        L0TaskArgs::set_dependencies(deps_, count_);
+        return *this;
+    }
+
+private:
+    PTO2TaskId deps_[MAX_DEP_COUNT];
+    uint32_t count_ = 0;
+};
+
+// =============================================================================
+// Submit overloads — accept L0TaskArgsWithDeps<N> transparently
+// =============================================================================
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_task(mixed_kernels, awd.finalize_for_submit());
+}
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_aic_task(kernel_id, awd.finalize_for_submit());
+}
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit());
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h
new file mode 100644
index 000000000..fa0fc9c8f
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Orchestration API - Slim header for orchestration .so files
+ *
+ * This header provides everything an orchestration source needs without
+ * pulling in runtime implementation headers.  The orchestration .so has
+ * zero link dependencies on runtime .cpp files; all runtime calls go
+ * through the PTO2RuntimeOps function-pointer table embedded in
+ * PTO2Runtime.
+ *
+ * Orchestration sources include ONLY this header:
+ *   #include "pto_orchestration_api.h"
+ *
+ * Runtime sources continue to use pto_runtime2.h (which defines the
+ * full PTO2Runtime struct with all internal fields).
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+// Type headers needed by orchestration
+#include "common.h"              // framework_bind_runtime / framework_current_runtime
+#include "pto_runtime2_types.h"  // PTO2_ERROR_*
+#include "pto_submit_types.h"    // MixedKernels, INVALID_KERNEL_ID, subtask slots
+#include "pto_types.h"           // Arg, TaskOutputTensors, TensorArgType
+#include "task_args.h"           // ChipStorageTaskArgs, Tensor
+#include "tensor.h"              // Tensor, TensorCreateInfo
+
+// =============================================================================
+// Tensor Factory Helpers
+// =============================================================================
+
+// make_tensor_external(...) — canonical factory for pre-allocated external
+// memory — is defined in the unified tensor.h (common), so host and runtime
+// build Tensors through the same controlled path.
+
+// =============================================================================
+// Ops Table and Opaque Runtime
+// =============================================================================
+
+/**
+ * Forward declaration — the orchestration sees PTO2Runtime as a partial
+ * struct whose first field is the ops pointer.  The full definition
+ * lives in pto_runtime2.h (used only by runtime .cpp files).
+ */
+typedef struct PTO2Runtime PTO2Runtime;
+
+/**
+ * Function-pointer table for runtime operations.
+ * Populated by the runtime; called by orchestration through inline wrappers.
+ */
+typedef struct PTO2RuntimeOps {
+    TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    void (*scope_begin)(PTO2Runtime *rt);
+    void (*scope_end)(PTO2Runtime *rt);
+    void (*orchestration_done)(PTO2Runtime *rt);
+    bool (*is_fatal)(PTO2Runtime *rt);
+    void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+    // Logging (populated by runtime, called by orchestration)
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+
+    // Cross-layer data access (orchestration reads/writes tensor values via runtime)
+    // Placed after logging to avoid shifting hot-path field offsets.
+    uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+    void (*set_tensor_data)(
+        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
+    );
+    TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
+    TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
+
+    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
+    // collector can log it. Always present to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
+} PTO2RuntimeOps;
+
+/**
+ * Partial PTO2Runtime definition for orchestration.
+ *
+ * Exposes the ops pointer (for runtime calls) and pending_scope_mode
+ * (read directly by inline scope wrappers).  The real struct (in
+ * pto_runtime2.h) has the same first fields, so accessing them through
+ * this definition is well-defined (C struct layout guarantee).
+ */
+struct PTO2Runtime {
+    const PTO2RuntimeOps *ops;
+    PTO2ScopeMode pending_scope_mode;
+};
+
+// =============================================================================
+// Inline Convenience Wrappers (call through ops table)
+// =============================================================================
+
+static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); }
+
+static inline TaskOutputTensors alloc_tensors(const L0TaskArgs &args) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    return rt->ops->alloc_tensors(rt, args);
+}
+
+static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    L0TaskArgs args;
+    for (uint32_t i = 0; i < count; i++) {
+        args.add_output(create_infos[i]);
+    }
+    if (args.has_error) {
+        rt->ops->report_fatal(
+            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+    return alloc_tensors(args);
+}
+
+template <typename... CIs>
+static inline TaskOutputTensors alloc_tensors(const CIs &...cis) {
+    static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo");
+    static_assert(
+        (std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...),
+        "alloc_tensors only accepts TensorCreateInfo arguments"
+    );
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    L0TaskArgs args;
+    (args.add_output(cis), ...);
+    if (args.has_error) {
+        rt->ops->report_fatal(
+            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+    return alloc_tensors(args);
+}
+
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    return rt->ops->submit_task(rt, mixed_kernels, args);
+}
+
+/**
+ * Convenience wrapper: submit an AIC-only task.
+ */
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const L0TaskArgs &args) {
+    MixedKernels mk;
+    mk.aic_kernel_id = kernel_id;
+    return rt_submit_task(mk, args);
+}
+
+/**
+ * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
+ */
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const L0TaskArgs &args) {
+    MixedKernels mk;
+    mk.aiv0_kernel_id = kernel_id;
+    return rt_submit_task(mk, args);
+}
+
+/**
+ * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task
+ * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any
+ * AICore kernel. The task still participates in the dependency graph: it
+ * waits on its fanin and notifies its fanout. Useful as a synchronization
+ * barrier or as a placeholder producer for tests / dep-graph wiring.
+ */
+static inline TaskOutputTensors rt_submit_dummy_task(const L0TaskArgs &args) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return TaskOutputTensors{};
+    }
+    return rt->ops->submit_dummy_task(rt, args);
+}
+
+static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return;
+    }
+    rt->pending_scope_mode = mode;
+    rt->ops->scope_begin(rt);
+}
+
+static inline void rt_scope_end() {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return;
+    }
+    rt->ops->scope_end(rt);
+}
+
+static inline void rt_orchestration_done() {
+    PTO2Runtime *rt = current_runtime();
+    rt->ops->orchestration_done(rt);
+}
+
+static inline bool rt_is_fatal() {
+    PTO2Runtime *rt = current_runtime();
+    return rt->ops->is_fatal(rt);
+}
+
+#define rt_report_fatal(code, fmt, ...)                                          \
+    do {                                                                         \
+        PTO2Runtime *_rt = current_runtime();                                    \
+        _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \
+    } while (0)
+
+// =============================================================================
+// Logging Macros for Orchestration (call through ops table)
+// =============================================================================
+
+#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
+#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
+#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
+
+// INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default.
+#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
+
+// =============================================================================
+// Cross-Layer Data Access
+// =============================================================================
+
+/**
+ * Read a value from a tensor at the given multi-dimensional indices.
+ *
+ * Default T = uint64_t preserves old behavior (raw bits).
+ * Specify T to get automatic type conversion:
+ *
+ *   uint64_t raw = get_tensor_data(tensor, 1, idx);       // old usage unchanged
+ *   float val = get_tensor_data<float>(tensor, 1, idx);   // typed read
+ *
+ * If the tensor has a producer in TensorMap, spin-waits until the producer
+ * task completes before reading. External tensors (make_tensor_external)
+ * are read immediately without waiting.
+ */
+template <typename T = uint64_t>
+static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return from_u64<T>(0);
+    }
+    return from_u64<T>(rt->ops->get_tensor_data(rt, tensor, ndims, indices));
+}
+
+/**
+ * Write a value to a tensor at the given multi-dimensional indices.
+ *
+ * Type is deduced from value argument; uint64_t by default:
+ *
+ *   set_tensor_data(tensor, 1, idx, raw_u64);     // old usage unchanged
+ *   set_tensor_data(tensor, 1, idx, 42.0f);       // typed write (T = float)
+ *
+ * If the tensor has a producer in TensorMap, spin-waits until the producer
+ * and all its consumers complete before writing (WAW + WAR safety).
+ * External tensors (make_tensor_external) with no TensorMap entry are
+ * written immediately without waiting.
+ *
+ * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers
+ * that used the tensor as INPUT. If a kernel reads this tensor as INPUT
+ * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data
+ * cannot detect the reader and may cause a data race.
+ *
+ * To ensure WAR safety for all access patterns, use add_inout() instead of
+ * add_input() for kernel parameters that may later be written via
+ * set_tensor_data. INOUT creates a TensorMap entry that enables automatic
+ * consumer tracking via fanout_refcount.
+ *
+ * The tensor must already have an allocated buffer (addr != 0).
+ * For runtime-created outputs, call this only on the Tensor returned by
+ * add_output(TensorCreateInfo) after submit returns.
+ */
+template <typename T = uint64_t>
+static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) {
+    PTO2Runtime *rt = current_runtime();
+    if (rt->ops->is_fatal(rt)) {
+        return;
+    }
+    rt->ops->set_tensor_data(rt, tensor, ndims, indices, to_u64(value));
+}
+
+// =============================================================================
+// C++ Scope Guards and Macros
+// =============================================================================
+
+/**
+ * RAII Scope Guard (calls through ops table)
+ */
+class PTO2ScopeGuard {
+public:
+    explicit PTO2ScopeGuard(
+        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
+    ) :
+        rt_(current_runtime()) {
+        if (!rt_->ops->is_fatal(rt_)) {
+            rt_->pending_scope_mode = mode;
+            if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);
+            rt_->ops->scope_begin(rt_);
+        }
+    }
+    ~PTO2ScopeGuard() {
+        if (!rt_->ops->is_fatal(rt_)) {
+            rt_->ops->scope_end(rt_);
+        }
+    }
+
+private:
+    PTO2Runtime *rt_;
+};
+
+#define _PTO2_CONCATENATE_IMPL(x, y) x##y
+#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y)
+
+#define PTO2_SCOPE_GUARD(...) \
+    [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__) { __VA_ARGS__ }
+
+/**
+ * Scoped block macro:
+ *   PTO2_SCOPE() {
+ *       rt_submit_task(...);
+ *   }
+ */
+#define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true)
+
+// =============================================================================
+// Orchestration Config
+// =============================================================================
+
+/**
+ * Configuration exported by orchestration .so via aicpu_orchestration_config().
+ * The executor reads these values to set up shared memory and runtime.
+ *
+ * This struct is defined identically in pto_runtime2.h (with an include
+ * guard) so the executor can use the same type without including this header.
+ */
+#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
+#define PTO2_ORCHESTRATION_CONFIG_DEFINED
+struct PTO2OrchestrationConfig {
+    int expected_arg_count;
+};
+#endif
+
+// Convenience layer (L0TaskArgsWithDeps<N> + matching rt_submit_*_task overloads).
+// Pulled in at the bottom so the wrapper sees L0TaskArgs, MixedKernels, and the
+// rt_submit_*_task primitives defined above. Orchestration sources include
+// only this single header to access both the primitive and convenience APIs.
+#include "pto_arg_with_deps.h"  // NOLINT(build/include_subdir)
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h
new file mode 100644
index 000000000..f914bfddf
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
+
+#include <atomic>
+#include <cstdint>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_constants.h"
+#include "pto_task_id.h"
+
+// AICPU-only MPSC ring used to convey deferred-completion observations from
+// FIN-handling scheduler threads to the dispatch thread. Producers push under
+// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList::
+// busy) drains in seq order. Kernel-side code never touches this struct —
+// AICore writes go into DeferredCompletionSlab (see
+// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens
+// into messages here, and forwards.
+
+#define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u
+#define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)
+
+static_assert(
+    (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0,
+    "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two"
+);
+
+// Mailbox message discriminator. CONDITION carries one deferred-completion
+// observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE
+// carries the slot_state pointer in `addr` so the consumer can finalize the
+// AsyncWaitEntry.slot_state binding for tasks whose conditions arrived
+// before the FIN thread saw task_complete. New kinds may be added in future
+// without growing the message — the `_pad[5]` slack is reserved for
+// kind-specific payload extension.
+#define MSG_KIND_CONDITION 0u
+#define MSG_KIND_TASK_NORMAL_DONE 1u
+
+struct AICoreCompletionMailboxMessage {
+    // Per-slot ready flag. Producer publishes `tail+1` after filling the rest
+    // of the slot with a release store; consumer waits for the matching seq
+    // value with an acquire load. The release-acquire pair publishes all
+    // other fields below as a side effect, so they stay plain.
+    std::atomic<uint64_t> seq;
+    PTO2TaskId task_token;
+    // CONDITION: completion observation addr (counter / SDMA event record).
+    // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer
+    //   so it can finalize the AsyncWaitEntry.slot_state binding.
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint32_t kind;
+    uint32_t _pad[5];
+};
+
+static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift");
+static_assert(
+    sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+    "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold"
+);
+static_assert(
+    std::atomic<uint64_t>::is_always_lock_free,
+    "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target"
+);
+
+// POD view of a drained message. `seq` is the ring's publication flag, not
+// payload, so try_pop copies out only the fields below (and seq is not even
+// copyable — it is a std::atomic).
+struct AICoreCompletionMsgView {
+    PTO2TaskId task_token{PTO2TaskId::invalid()};
+    uint64_t addr{0};
+    uint32_t expected_value{0};
+    uint32_t engine{0};
+    int32_t completion_type{0};
+    uint32_t kind{0};
+};
+
+struct AICoreCompletionMailbox {
+    // head and tail live on their own cache lines so producer CAS contention
+    // on head can't false-share with the consumer's tail updates.
+    alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> head;
+    uint8_t _head_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)];
+    alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> tail;
+    uint8_t _tail_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)];
+    alignas(PTO2_ALIGN_SIZE) AICoreCompletionMailboxMessage entries[AICORE_COMPLETION_MAILBOX_CAPACITY];
+
+    // Cheap, lock-free pending hint. Callers may invoke this outside the
+    // consumer lock; a stale answer only over/under-triggers a drain attempt.
+    bool has_pending() { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); }
+
+    // MPSC push for a CONDITION message. Returns false when the ring is full
+    // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry.
+    // Lock-free: CAS the shared head to claim a slot, write the fields, then
+    // release-store seq so the single consumer observes the publication.
+    //
+    // The head CAS is relaxed: head is a pure ticket counter and carries no
+    // data to the consumer — publication is solely the seq release-store, and
+    // slot-reuse safety rests on the acquire load of tail. The relaxed failure
+    // order is likewise sufficient since a lost CAS just re-reads head and
+    // retries. compare_exchange_weak is used because this loop already re-reads
+    // head and re-checks fullness, so masking LL/SC spurious failures (what
+    // _strong adds on aarch64) would only be a redundant inner retry.
+    //
+    // Safe to call concurrently from any number of producers; structurally
+    // independent of the AsyncWaitList::busy lock.
+    bool try_push_condition(
+        PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type
+    ) {
+        while (true) {
+            uint64_t h = head.load(std::memory_order_relaxed);
+            uint64_t t = tail.load(std::memory_order_acquire);
+            if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
+            uint64_t new_head = h + 1;
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
+                slot->task_token.raw = task_token.raw;
+                slot->addr = addr;
+                slot->expected_value = expected_value;
+                slot->engine = engine;
+                slot->completion_type = completion_type;
+                slot->kind = MSG_KIND_CONDITION;
+                slot->seq.store(new_head, std::memory_order_release);
+                return true;
+            }
+            // CAS lost: another producer claimed the slot, retry with refreshed head.
+        }
+    }
+
+    // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState
+    // pointer in the `addr` field so the consumer can finish binding the
+    // AsyncWaitEntry.slot_state without going back to the FIN-handling thread.
+    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) {
+        while (true) {
+            uint64_t h = head.load(std::memory_order_relaxed);
+            uint64_t t = tail.load(std::memory_order_acquire);
+            if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
+            uint64_t new_head = h + 1;
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
+                slot->task_token.raw = task_token.raw;
+                slot->addr = slot_state_addr;
+                slot->expected_value = 0;
+                slot->engine = 0;
+                slot->completion_type = 0;
+                slot->kind = MSG_KIND_TASK_NORMAL_DONE;
+                slot->seq.store(new_head, std::memory_order_release);
+                return true;
+            }
+        }
+    }
+
+    // Single-consumer transport-level dequeue (caller holds the consumer lock).
+    // Returns false at the first not-yet-published slot (gap) or when empty;
+    // otherwise copies the next message in tail order into `out`, advances
+    // tail, and returns true. tail is consumer-only-written (relaxed read);
+    // head bounds the scan (relaxed); the seq acquire is the real publication
+    // gate; the tail release publishes "slot free" to reusing producers.
+    bool try_pop(AICoreCompletionMsgView &out) {
+        uint64_t t = tail.load(std::memory_order_relaxed);
+        uint64_t h = head.load(std::memory_order_relaxed);
+        if (t >= h) return false;
+        AICoreCompletionMailboxMessage *slot = &entries[t & AICORE_COMPLETION_MAILBOX_MASK];
+        if (slot->seq.load(std::memory_order_acquire) != t + 1) return false;
+        out.task_token.raw = slot->task_token.raw;
+        out.addr = slot->addr;
+        out.expected_value = slot->expected_value;
+        out.engine = slot->engine;
+        out.completion_type = slot->completion_type;
+        out.kind = slot->kind;
+        tail.store(t + 1, std::memory_order_release);
+        return true;
+    }
+};
+
+static_assert(
+    sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned"
+);
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h
new file mode 100644
index 000000000..24c04c09e
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
+
+#include <stdint.h>
+
+#include "pto_constants.h"
+
+// Types shared across the AICore↔AICPU boundary.
+//
+// This header is reachable from AICore-side translation units (via
+// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h)
+// and must stay parseable by every AICore toolchain configuration: no
+// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct.
+//
+// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in
+// aicore_completion_mailbox.h, which is AICPU-only.
+
+inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
+
+#define COMPLETION_ENGINE_SDMA 0u
+#define COMPLETION_ENGINE_ROCE 1u
+#define COMPLETION_ENGINE_URMA 2u
+#define COMPLETION_ENGINE_CCU 3u
+
+#define COMPLETION_TYPE_COUNTER 0
+#define COMPLETION_TYPE_SDMA_EVENT_RECORD 1
+
+// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch
+// area that AICore writes into to record "this completion has to be observed
+// before the task can retire." The FIN-handling scheduler thread reads the
+// slab, flattens entries into AICoreCompletionMailbox messages, and forwards
+// them to the dispatch thread. `volatile` here is load-bearing: writers live
+// on AICore and readers on AICPU, so the qualifier is the correct way to
+// pin the compiler against caching / reordering on either side.
+struct DeferredCompletionEntry {
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint32_t _pad;
+};
+
+static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift");
+
+struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab {
+    volatile uint32_t count;
+    volatile int32_t error_code;
+    DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK];
+};
+
+static_assert(
+    sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0,
+    "DeferredCompletionSlab size must preserve array element cache-line boundaries"
+);
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h
new file mode 100644
index 000000000..5e596e17b
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_KERNEL_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_KERNEL_H_
+
+#include <stdint.h>
+
+#include <pto/comm/async_common/async_event_impl.hpp>
+#include <pto/npu/comm/async/sdma/sdma_async_intrin.hpp>
+
+#include "pto_async_kernel_api.h"
+#include "aicore_completion_mailbox_types.h"
+#include "pto_runtime_status.h"
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+#ifndef __gm__
+#define __gm__
+#endif
+
+// Re-exposed PTO-ISA constant so examples / callers don't need to include
+// <pto/npu/comm/async/sdma/sdma_types.hpp> just to spell their scratch tile.
+inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE;
+
+enum class SdmaOp : uint8_t {
+    TGET = 0,
+    TPUT = 1,
+};
+
+// SdmaRequestDescriptor bundles everything send_request_entry needs to drive
+// one SDMA transfer + completion registration. It is a template because the
+// destination / source / scratch types carry tensor shape & stride at compile
+// time; the SdmaTget() / SdmaTput() helpers below let callers skip the
+// template arguments.
+//
+// sync_id selects which event-record slot inside the workspace the engine
+// writes into. Concurrent dispatches must use distinct sync_ids; today every
+// caller submits one request per kernel invocation so passing 0 is safe.
+// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2)
+// will fold sync_id allocation into the adapter.
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+struct SdmaRequestDescriptor {
+    SdmaOp op;
+    DstTensor dst;
+    SrcTensor src;
+    ScratchTileT scratch;
+    __gm__ uint8_t *workspace;
+    uint32_t sync_id;
+};
+
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(
+    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
+    uint32_t sync_id = 0
+) {
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst,       src,
+                                                                     scratch,      workspace, sync_id};
+}
+
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(
+    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
+    uint32_t sync_id = 0
+) {
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst,       src,
+                                                                     scratch,      workspace, sync_id};
+}
+
+namespace pto2::detail {
+
+inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) {
+    CompletionToken token{
+        reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0
+    };
+    (void)register_completion_condition(ctx, token);
+}
+
+template <typename PtoAsyncEvent, typename PtoAsyncSession>
+inline __aicore__ void
+register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+        (void)event.Wait(session);
+        return;
+    }
+    if (event.handle == 0) {
+        return;
+    }
+
+    const uint32_t engine = static_cast<uint32_t>(event.engine);
+    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA)) {
+        defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return;
+    }
+
+    ::pto::comm::sdma::detail::UbTmpBuf tmp_buf;
+    uint32_t sync_id = 0;
+    __gm__ uint8_t *recv_workspace = nullptr;
+    uint32_t queue_num = 0;
+    if (!::pto::comm::sdma::detail::PrepareEventCheck(
+            session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num
+        )) {
+        defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return;
+    }
+    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) {
+        register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
+    }
+}
+
+}  // namespace pto2::detail
+
+// SDMA overload of the runtime's send_request_entry. Submits the descriptor
+// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the
+// AsyncCtx deferred-wait slab and flushes. Returns false on submit/session
+// failure (also records the error in ctx.completion_error_code).
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ bool
+send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc) {
+    pto::comm::AsyncSession session;
+    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) {
+        pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return false;
+    }
+
+    pto::comm::AsyncEvent event;
+    if (desc.op == SdmaOp::TGET) {
+        event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
+    } else {
+        event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
+    }
+    pto2::detail::register_pto_async_event(ctx, event, session);
+    pto2::detail::defer_flush(ctx);
+    return true;
+}
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_KERNEL_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h
new file mode 100644
index 000000000..107fab62d
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_SCHEDULER_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_SCHEDULER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "aicpu/platform_regs.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_completion_token.h"
+#include "pto_runtime_status.h"
+
+// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only
+// allowed holder of this ABI knowledge; the generic scheduler dispatches into
+// the helpers below through the completion ops table.
+struct SdmaEventRecord {
+    uint32_t flag;
+    uint32_t sq_tail;
+    uint64_t channel_info;
+};
+
+static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift");
+static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift");
+
+inline uintptr_t sdma_completion_cache_line(const volatile void *addr) {
+    return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+}
+
+inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) {
+    if (record_addr == 0) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    volatile SdmaEventRecord *record =
+        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
+    uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE);
+    return {flag != 0 ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
+}
+
+inline void retire_sdma_event_record(uint64_t record_addr) {
+    if (record_addr == 0) return;
+    volatile SdmaEventRecord *record =
+        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
+    uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE);
+    uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE);
+
+    volatile uint64_t *record_head = reinterpret_cast<volatile uint64_t *>(record);
+    __atomic_store_n(record_head, 0ULL, __ATOMIC_RELEASE);
+    cache_flush_range(const_cast<const void *>(reinterpret_cast<volatile void *>(record_head)), sizeof(uint64_t));
+
+    if (channel_info_addr == 0) return;
+    uint64_t packed = (static_cast<uint64_t>(completed_tail) << 32) | static_cast<uint64_t>(completed_tail);
+    volatile uint64_t *channel_info = reinterpret_cast<volatile uint64_t *>(static_cast<uintptr_t>(channel_info_addr));
+    __atomic_store_n(channel_info, packed, __ATOMIC_RELEASE);
+    cache_flush_range(const_cast<const void *>(reinterpret_cast<volatile void *>(channel_info)), sizeof(uint64_t));
+}
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_SCHEDULER_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/common.h b/src/a5/runtime/fully_distributed_within_core/runtime/common.h
new file mode 100644
index 000000000..9dcf438ed
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/common.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Assertion macros (always_assert / debug_assert), AssertionError, and the
+// MAYBE_UNINITIALIZED diagnostics live in the shared header so the unified
+// Tensor (src/common/task_interface/tensor.h) can use them without depending
+// on this runtime-specific header. assert_impl / get_stacktrace are defined in
+// orchestration/common.cpp for runtime targets.
+#include "assert_compat.h"
+
+// Framework-internal TLS bridge. The executor binds the current thread's
+// runtime before invoking the orchestration entry, so orchestration helpers can
+// fetch the current PTO2Runtime without explicit parameter threading. Declared
+// here (rather than in pto_orchestration_api.h) so framework TUs the AICore
+// build also compiles — notably orchestration/common.cpp — see these symbols
+// without pulling in pto_types.h, whose Arg::add_scalar → to_u64 path is
+// __aicore__-only and would break the ccec build.
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct PTO2Runtime;
+PTO2Runtime *framework_current_runtime(void);
+void framework_bind_runtime(PTO2Runtime *rt);
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h
new file mode 100644
index 000000000..cae275625
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file pto2_dispatch_payload.h
+ * @brief Per-core dispatch payload for AICore kernel execution
+ *
+ * PTO2DispatchPayload holds the kernel function address, a per-core args[]
+ * array, and embedded SPMD context (LocalContext + GlobalContext).  AICPU
+ * maintains a static array of these (one per core).
+ *
+ * GlobalContext (sub_block_id) is initialized once at runtime startup via
+ * init_global_context() and never modified afterwards.
+ *
+ * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload()
+ * before each dispatch.  Both context struct pointers are written into the
+ * args[] suffix on every dispatch (since args[] is rebuilt entirely each time).
+ *
+ * AICore caches a pointer to its per-core slot at startup and reads from
+ * it on each dispatch.  The struct is cache-line aligned to avoid false
+ * sharing across concurrently dispatched cores.
+ *
+ * The DATA_MAIN_BASE register protocol is unchanged from the base runtime:
+ * a monotonically increasing reg_task_id signals new work to AICore.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "arg_direction.h"
+#include "intrinsic.h"
+
+/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */
+#ifndef PTO2_DISPATCH_MAX_ARGS
+#define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT)
+#endif
+
+#ifndef PTO2_ALIGN_UP
+#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1))
+#endif
+
+// Verify hardcoded indices in intrinsic.h match the computed values.
+static_assert(
+    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h"
+);
+static_assert(
+    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX,
+    "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h"
+);
+
+/**
+ * Per-core dispatch payload: function address + args[] + SPMD context.
+ *
+ * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER].
+ * AICore caches a pointer to its per-core slot at startup (via Handshake.task)
+ * and reads from it on each dispatch.
+ *
+ * The struct is cache-line aligned to prevent false sharing across
+ * concurrently dispatched cores.
+ */
+struct alignas(64) PTO2DispatchPayload {
+    uint64_t function_bin_addr;            /**< Kernel entry address in GM (set by Scheduler) */
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */
+
+    /** Per-dispatch context: block_idx and block_num.
+     *  Written by build_payload() before each dispatch.
+     *  args[SPMD_LOCAL_CONTEXT_INDEX] points here. */
+    LocalContext local_context;
+
+    /** Per-core global context: sub_block_id (AIV lane identity).
+     *  Initialized once by init_global_context() at runtime startup.
+     *  args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */
+    GlobalContext global_context;
+
+    uint8_t reserved_payload_abi_pad[8];
+
+    static_assert(sizeof(args[0]) == 8);
+    static_assert(
+        PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) ==
+        (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0])
+    );
+};
+
+static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift");
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h
new file mode 100644
index 000000000..cf6eb4790
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PTO_ASYNC_KERNEL_API_H
+#define PTO_ASYNC_KERNEL_API_H
+
+#include <stdint.h>
+
+#include <pto/comm/comm_types.hpp>
+#include <pto/comm/pto_comm_inst.hpp>
+
+#include "intrinsic.h"
+#include "aicore_completion_mailbox_types.h"
+#include "pto_completion_token.h"
+#include "pto_runtime_status.h"
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+#ifndef __gm__
+#define __gm__
+#endif
+
+// Public surface: get_async_ctx, async_ctx_is_deferred,
+// register_completion_condition, send_notification,
+// save_expected_notification_counter. Everything else lives in
+// pto2::detail and is reserved for backend adapters / internal use.
+namespace pto2::detail {
+
+inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
+    if (ctx.completion_count == nullptr) return;
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    dcci((__gm__ int32_t *)line, SINGLE_CACHE_LINE);
+#else
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
+inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) {
+    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) {
+        *ctx.completion_error_code = error_code;
+    }
+}
+
+inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) {
+    if (addr == nullptr || size_bytes == 0) return;
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    uintptr_t end =
+        (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) {
+        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
+    }
+#else
+    (void)addr;
+    (void)size_bytes;
+#endif
+}
+
+inline __aicore__ void defer_flush(AsyncCtx &ctx) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return;
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uint32_t count = *ctx.completion_count;
+    if (count > ctx.completion_capacity) {
+        count = ctx.completion_capacity;
+    }
+    uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count));
+    if (ctx.completion_error_code != nullptr) {
+        flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
+    }
+    if (ctx.completion_entries != nullptr) {
+        flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
+    }
+    defer_flush_range(ctx.completion_count, flush_bytes);
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+    pipe_barrier(PIPE_ALL);
+#else
+    (void)ctx;
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
+}  // namespace pto2::detail
+
+inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
+    __gm__ LocalContext *lc =
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
+    AsyncCtx ctx{};
+    ctx.completion_count = lc->async_ctx.completion_count;
+    ctx.completion_error_code = lc->async_ctx.completion_error_code;
+    ctx.completion_entries = lc->async_ctx.completion_entries;
+    ctx.completion_capacity = lc->async_ctx.completion_capacity;
+    ctx.task_token.raw = lc->async_ctx.task_token.raw;
+    pto2::detail::defer_load_slab(ctx);
+    return ctx;
+}
+
+inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); }
+
+// Canonical writer: backend submit handlers build a CompletionToken and pass
+// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and
+// bumps completion_count. Returns false on overflow (also stores
+// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is
+// not currently a deferred context.
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+        return false;
+    }
+
+    uint32_t idx = *ctx.completion_count;
+    if (idx >= ctx.completion_capacity) {
+        if (ctx.completion_error_code != nullptr) {
+            *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+        }
+        return false;
+    }
+
+    volatile __gm__ DeferredCompletionEntry *slot = &ctx.completion_entries[idx];
+    slot->addr = token.addr;
+    slot->expected_value = token.expected_value;
+    slot->engine = token.engine;
+    slot->completion_type = token.completion_type;
+    slot->_pad = 0;
+    *ctx.completion_count = idx + 1;
+    return true;
+}
+
+inline __aicore__ void
+send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) {
+    __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr));
+    pto::comm::Signal signal(counter);
+    pto::comm::TNOTIFY(signal, value, notify_op);
+}
+
+inline __aicore__ void
+save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) {
+    CompletionToken token{
+        reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0
+    };
+    (void)register_completion_condition(ctx, token);
+    pto2::detail::defer_flush(ctx);
+}
+
+#endif  // PTO_ASYNC_KERNEL_API_H
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h
new file mode 100644
index 000000000..65608ad2f
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PTO_ASYNC_WAIT_H
+#define PTO_ASYNC_WAIT_H
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+
+#include "aicpu/platform_regs.h"
+#include "backend/sdma/sdma_completion_scheduler.h"
+#include "intrinsic.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_completion_token.h"
+#include "pto_runtime2_types.h"
+
+struct PTO2SchedulerState;
+struct PTO2LocalReadyBuffer;
+struct CompletionStats;
+
+inline constexpr int32_t MAX_ASYNC_WAITS = 64;
+
+// The mailbox transport (has_pending / try_push_condition /
+// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member
+// functions in aicore_completion_mailbox.h. This file only holds the
+// application layer: translating drained messages into wait-list state.
+
+inline uintptr_t mailbox_cache_line(const volatile void *addr) {
+    return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+}
+
+struct CompletionCondition;
+
+using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &);
+using CompletionRetireFn = void (*)(CompletionCondition &);
+
+struct CompletionBackendOps {
+    CompletionPollFn poll;
+    CompletionRetireFn retire;
+};
+
+struct CompletionCondition {
+    AsyncEngine engine{ASYNC_ENGINE_SDMA};
+    int32_t completion_type{COMPLETION_TYPE_COUNTER};
+    bool satisfied{false};
+    bool retired{false};
+    volatile uint32_t *counter_addr{nullptr};
+    uint64_t addr{0};
+    uint32_t expected_value{0};
+
+    CompletionPollResult test() const;
+    void retire();
+};
+
+// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in
+// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin
+// glue mapping CompletionCondition.addr into the backend's raw-addr helpers.
+inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) {
+    if (cond.counter_addr == nullptr) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    return {
+        *cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING,
+        PTO2_ERROR_NONE
+    };
+}
+
+inline void counter_retire_op(CompletionCondition & /*cond*/) {}
+
+inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) {
+    return poll_sdma_event_record(cond.addr);
+}
+
+inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); }
+
+inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) {
+    static const CompletionBackendOps kOps[] = {
+        {counter_poll_op, counter_retire_op},                      // COMPLETION_TYPE_COUNTER = 0
+        {sdma_event_record_poll_op, sdma_event_record_retire_op},  // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1
+    };
+    constexpr int kOpsCount = static_cast<int>(sizeof(kOps) / sizeof(kOps[0]));
+    if (completion_type < 0 || completion_type >= kOpsCount) return nullptr;
+    return &kOps[completion_type];
+}
+
+inline CompletionPollResult CompletionCondition::test() const {
+    if (satisfied) {
+        return {CompletionPollState::READY, PTO2_ERROR_NONE};
+    }
+    const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
+    if (ops == nullptr || ops->poll == nullptr) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    return ops->poll(*this);
+}
+
+inline void CompletionCondition::retire() {
+    if (retired) return;
+    const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
+    if (ops != nullptr && ops->retire != nullptr) {
+        ops->retire(*this);
+    }
+    retired = true;
+}
+
+struct AsyncWaitEntry {
+    PTO2TaskSlotState *slot_state{nullptr};
+    PTO2TaskId task_token{PTO2TaskId::invalid()};
+    CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK];
+    int32_t condition_count{0};
+    int32_t waiting_completion_count{0};
+    bool normal_done{false};
+};
+
+struct AsyncPollResult {
+    int32_t completed{0};
+    int32_t error_code{PTO2_ERROR_NONE};
+    PTO2TaskSlotState *failed_slot_state{nullptr};
+};
+
+inline const char *async_engine_name(AsyncEngine engine) {
+    switch (engine) {
+    case ASYNC_ENGINE_SDMA:
+        return "SDMA";
+    case ASYNC_ENGINE_ROCE:
+        return "ROCE";
+    case ASYNC_ENGINE_URMA:
+        return "URMA";
+    case ASYNC_ENGINE_CCU:
+        return "CCU";
+    default:
+        return "UNKNOWN";
+    }
+}
+
+struct AsyncWaitList {
+    std::atomic<int32_t> busy{0};
+    AsyncWaitEntry entries[MAX_ASYNC_WAITS];
+    int32_t count{0};
+    // Diagnostic: counts every FIN-side try_push that hit a full mailbox.
+    // Expected to stay zero on real workloads (ring is 4096 entries); a
+    // non-zero value means consumers are too slow or the ring is undersized.
+    // Read by scheduler shutdown / l2 perf summary; not on the hot path.
+    std::atomic<uint64_t> mpsc_skipped_count{0};
+
+    bool try_lock() {
+        int32_t expected = 0;
+        return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed);
+    }
+
+    void unlock() { busy.store(0, std::memory_order_release); }
+
+    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) {
+        for (int32_t i = 0; i < count; i++) {
+            if (entries[i].task_token == token) return &entries[i];
+        }
+        return nullptr;
+    }
+
+    // Captures the side-channel a scheduler-aware drain needs to complete
+    // NotDeferred tasks inline (without storing a transient entry in
+    // entries[]).
+    struct DrainCompletionSink {
+        PTO2SchedulerState *sched{nullptr};
+        PTO2LocalReadyBuffer *local_bufs{nullptr};
+        PTO2TaskSlotState **deferred_release_slot_states{nullptr};
+        int32_t *deferred_release_count{nullptr};
+        int32_t deferred_release_capacity{0};
+        int32_t inline_completed{0};
+#if PTO2_SCHED_PROFILING
+        int32_t thread_idx{0};
+#endif
+
+        bool can_inline_complete() const { return sched != nullptr; }
+    };
+
+    // Inline-complete a NotDeferred task during drain. Returns false on
+    // deferred_release_slot_states overflow.
+    bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
+
+    // Single-consumer drain: pop each published message in tail order and
+    // translate it into wait-list state. An empty sink (sched == nullptr) just
+    // materializes entries; a sched-aware sink additionally inline-completes
+    // lonely NotDeferred NORMAL_DONEs without ever growing entries[].
+    int32_t drain_aicore_completion_mailbox_locked(
+        AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code
+    ) {
+        error_code = PTO2_ERROR_NONE;
+        if (aicore_mailbox == nullptr) return 0;
+
+        int32_t drained = 0;
+        AICoreCompletionMsgView msg;
+        // try_pop is the transport layer (seq-gated, in-order dequeue); this
+        // loop is the application layer (translate each message into wait-list
+        // state). try_pop returns false at the first gap or when empty.
+        while (aicore_mailbox->try_pop(msg)) {
+            drained++;
+            if (msg.kind == MSG_KIND_CONDITION) {
+                AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
+                if (entry == nullptr) {
+                    // First message for this task — materialize the entry here.
+                    // slot_state stays null until the matching TASK_NORMAL_DONE
+                    // sentinel arrives.
+                    if (count >= MAX_ASYNC_WAITS) {
+                        error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+                        return drained;
+                    }
+                    entry = &entries[count++];
+                    entry->task_token = msg.task_token;
+                    entry->slot_state = nullptr;
+                    entry->condition_count = 0;
+                    entry->waiting_completion_count = 0;
+                    entry->normal_done = false;
+                }
+                if (!append_condition_locked(
+                        *entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type,
+                        error_code
+                    )) {
+                    return drained;
+                }
+            } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) {
+                PTO2TaskSlotState *slot_state_ptr =
+                    reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
+                AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
+                if (entry == nullptr) {
+                    // Producers strictly order: all CONDITIONs for token T are
+                    // pushed before the matching NORMAL_DONE (the acq_rel on
+                    // on_subtask_complete enforces this across producers). So
+                    // observing NORMAL_DONE first => the task registered no
+                    // conditions => NotDeferred. Complete it inline when the
+                    // sink allows; otherwise fall back to the entry-store path.
+                    if (sink.can_inline_complete()) {
+                        (void)try_inline_complete_locked(sink, *slot_state_ptr);
+                        continue;
+                    }
+                    if (count >= MAX_ASYNC_WAITS) {
+                        error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+                        return drained;
+                    }
+                    entry = &entries[count++];
+                    entry->task_token = msg.task_token;
+                    entry->slot_state = slot_state_ptr;
+                    entry->condition_count = 0;
+                    entry->waiting_completion_count = 0;
+                    entry->normal_done = true;
+                } else {
+                    if (entry->slot_state == nullptr) {
+                        entry->slot_state = slot_state_ptr;
+                    }
+                    entry->normal_done = true;
+                }
+            } else {
+                error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
+                return drained;
+            }
+        }
+        return drained;
+    }
+
+    bool append_condition_locked(
+        AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type,
+        int32_t &error_code
+    ) {
+        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) {
+            error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
+            return false;
+        }
+        CompletionCondition &cond = entry.conditions[entry.condition_count++];
+        cond.engine = engine;
+        cond.completion_type = completion_type;
+        cond.satisfied = false;
+        cond.retired = false;
+        cond.addr = addr;
+        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ?
+                                reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) :
+                                nullptr;
+        cond.expected_value = expected_value;
+        entry.waiting_completion_count++;
+        return true;
+    }
+
+    template <bool Profiling>
+    AsyncPollResult poll_and_complete(
+        AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+        PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count,
+        int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+        ,
+        int thread_idx
+#endif
+    );
+};
+
+#endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h
new file mode 100644
index 000000000..45cdb0b51
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
+
+#include <stdint.h>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_runtime_status.h"
+
+// CompletionToken is the runtime-internal POD that backend submit handlers
+// produce and the generic register_completion_condition() consumes. It is the
+// ABI contract for "this is one completion to wait on" — independent of which
+// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's
+// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by
+// completion_type.
+struct CompletionToken {
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint64_t backend_cookie;
+};
+
+enum class CompletionPollState : uint8_t {
+    PENDING = 0,
+    READY = 1,
+    FAILED = 2,
+};
+
+struct CompletionPollResult {
+    CompletionPollState state{CompletionPollState::PENDING};
+    int32_t error_code{PTO2_ERROR_NONE};
+};
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h
new file mode 100644
index 000000000..0707f53f9
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
+
+#define PTO2_ALIGN_SIZE 64             // Cache line alignment
+#define PTO2_PACKED_OUTPUT_ALIGN 1024  // Each output in packed buffer aligned to 1024B; gap is padding
+#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h
new file mode 100644
index 000000000..1f78a78e5
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file pto_dep_compute.h
+ * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay.
+ *
+ * Two header-only template entry points:
+ *
+ *   compute_task_fanin     — STEP 3 in submit_task: per-tensor creator retention (Step A)
+ *                            + tensormap.lookup for INPUT/INOUT (Step B). Calls back into
+ *                            user-supplied `emit` for each producer it identifies.
+ *
+ *   register_task_outputs  — STEP 4 in submit_task: tensormap.insert for INOUT and
+ *                            OUTPUT_EXISTING tensors. No callbacks.
+ *
+ * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its
+ * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the
+ * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would
+ * require two emit semantics or a marginal behavior change in transients — not worth
+ * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own.
+ *
+ * The Emit callback contract:
+ *   bool emit(PTO2TaskId producer);
+ *     - return true to continue (whether or not the producer was actually recorded —
+ *       producer-not-alive / dedup-hit / etc. all return true silently)
+ *     - return false to signal fatal (e.g. fanin spill overflow); caller bails
+ *
+ * Performance: Emit is a template parameter, not std::function. Both runtime
+ * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge
+ * vector) instantiate at the call site and inline through. Do NOT replace with
+ * std::function — it would break the inlining and add ~5 ns/call to the orch hot path.
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
+
+#include <cstdint>
+
+#include "pto_task_id.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"  // TensorRef
+#include "tensor.h"
+
+/**
+ * View struct for inputs to compute_task_fanin / register_task_outputs.
+ *
+ * Both runtime and replay assemble one of these from their own data sources
+ * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All
+ * pointer arrays must remain valid for the duration of the call.
+ */
+struct DepInputs {
+    int32_t tensor_count;
+    const TensorRef *tensors;        // length = tensor_count (union; OUTPUT slots' .ptr is unused)
+    const TensorArgType *arg_types;  // length = tensor_count
+    int32_t explicit_dep_count;
+    const PTO2TaskId *explicit_deps;  // length = explicit_dep_count (validity checked by caller)
+};
+
+/**
+ * Compute fanin for a task being submitted (STEP 3: Step A creator retention +
+ * Step B tensormap modifier lookup).
+ *
+ * For each non-OUTPUT tensor:
+ *   - If owner_task_id is valid, emit(owner)
+ *   - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit
+ *     each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry).
+ *
+ * @return true on success (or producer-skipped-silently); false if emit signaled
+ *         fatal — caller should propagate (after any fatal bookkeeping done by emit).
+ */
+template <typename Emit>
+[[nodiscard]] inline bool
+compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) {
+    if (in_manual_scope) {
+        return true;
+    }
+
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::OUTPUT) {
+            // Runtime-created OUTPUT tensors are not looked up in the TensorMap since
+            // they have no dependencies.
+            continue;
+        }
+
+        const Tensor *tensor = &inputs.tensors[i].ref();
+
+        // Step A: creator retention — all existing tensors extend their creator lifetime.
+        PTO2TaskId owner = tensor->owner_task_id;
+        if (owner.is_valid()) {
+            if (!emit(owner)) {
+                return false;
+            }
+        }
+
+        // Step B: only INPUT/INOUT need modifier dependency lookup.
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
+            continue;
+        }
+        if (tensor->manual_dep) {
+            continue;
+        }
+
+        bool fatal = false;
+        tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
+            if (!emit(entry.producer_task_id)) {
+                fatal = true;
+                return false;  // stop iteration
+            }
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
+                tensor_map.remove_entry(entry);
+            }
+            return true;
+        });
+        if (fatal) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Register a task's outputs in the tensormap (STEP 4 in submit_task).
+ *
+ * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the
+ * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer.
+ *
+ * No-op when in_manual_scope.
+ */
+inline void
+register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) {
+    if (in_manual_scope) {
+        return;
+    }
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
+            const Tensor *tensor = &inputs.tensors[i].ref();
+            if (!tensor->manual_dep) {
+                tensor_map.insert(*tensor, task_id);
+            }
+        }
+    }
+}
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp
new file mode 100644
index 000000000..09e0f35a5
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Orchestrator Implementation
+ *
+ * Implements orchestrator state management, scope handling, and task submission.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_orchestrator.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/dep_gen.h"
+#include "common/unified_log.h"
+#include "pto_dep_compute.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"
+#include "tensor.h"
+
+#if PTO2_PROFILING
+#include "aicpu/tensor_dump_aicpu.h"
+#endif
+
+// Verify the captured Tensor blob size in DepGenRecord matches the runtime
+// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
+// including runtime/tensor.h, so this check lives at the orch callsite.
+static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)");
+// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime
+// imposes no hard cap on explicit dep count. If a submit exceeds this cap,
+// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is
+// unaffected, only the captured replay record is truncated.
+
+// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in
+// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay)
+// link these no-op stubs so the runtime translation unit is self-contained.
+// Visibility is hidden so the HOST .so doesn't export them into the global
+// dynamic symbol table where they'd shadow the AICPU .so's strong symbols
+// (same pattern as get_sys_cnt_aicpu / l2_perf_aicpu_record_orch_phase below).
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }
+__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(
+    uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, const int32_t[3]
+) {}
+
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+
+// Scope_stats enable gate, queried via the same predicate idiom as
+// is_dep_gen_enabled. The AICPU collector links the strong definition; host
+// builds fall back to this weak `false`. Gating here still skips the
+// cross-agent occupancy reads that feed the sample when scope_stats is disabled.
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
+
+// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each
+// wrap. Strong definition lives in the AICPU collector; host builds fall back to
+// this weak no-op so the runtime translation unit stays self-contained.
+extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
+#endif
+
+// =============================================================================
+// Orchestrator Profiling (compile-time toggle)
+// =============================================================================
+#if PTO2_ORCH_PROFILING
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+// Weak fallback for builds that don't link device_time.cpp (e.g. host).
+// The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
+//
+// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from
+// exporting this weak fallback into the global dynamic symbol table via
+// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry
+// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's
+// weak definition first (already in global table) and uses it — returning 0.
+// With hidden visibility, the HOST .so does not export this symbol globally,
+// so the AICPU .so's PLT resolves to its own strong definition from
+// device_time.cpp.
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
+// The strong symbol from the AICPU build wins when profiling is available.
+// Also hidden to prevent HOST .so from polluting the global symbol table.
+__attribute__((weak, visibility("hidden"))) void
+l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
+// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
+static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
+static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
+static uint64_t g_orch_args_cycle = 0;       // param copy
+static uint64_t g_orch_lookup_cycle = 0;     // tensormap lookup + dep building
+static uint64_t g_orch_insert_cycle = 0;     // tensormap insert
+static uint64_t g_orch_fanin_cycle = 0;      // fanin list + early-return check
+static uint64_t g_orch_scope_end_cycle = 0;  // scope_end overhead
+static int64_t g_orch_submit_count = 0;
+static uint32_t g_orch_submit_idx = 0;
+uint64_t g_orch_alloc_wait_cycle = 0;
+uint64_t g_orch_fanin_wait_cycle = 0;
+uint64_t g_orch_alloc_atomic_count = 0;
+uint64_t g_orch_args_atomic_count = 0;
+uint64_t g_orch_scope_end_atomic_count = 0;
+// Cycle accumulation feeds the per-sub-step `g_orch_*_cycle` cumulatives
+// printed in the cold-path log. Per-sub-step swim-lane phase records were
+// dropped; the per-submit envelope record (CYCLE_COUNT_ORCH_SUBMIT_RECORD)
+// is the only swim-lane emit on the orch path.
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                                       \
+    uint64_t _submit_start_ts = _t0
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
+    do {                                                                                          \
+        if (_prof_active) {                                                                       \
+            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
+        }                                                                                         \
+    } while (0)
+#elif PTO2_PROFILING
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+__attribute__((weak, visibility("hidden"))) void
+l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
+// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
+static uint32_t g_orch_submit_idx = 0;
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
+    uint64_t _submit_start_ts = _t0
+#define CYCLE_COUNT_LAP(acc) \
+    do {                     \
+    } while (0)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
+    do {                                                                                          \
+        if (_prof_active) {                                                                       \
+            _t1 = get_sys_cnt_aicpu();                                                            \
+            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
+        }                                                                                         \
+    } while (0)
+#else
+#define CYCLE_COUNT_START()
+#define CYCLE_COUNT_LAP(acc)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)
+#endif
+
+static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) {
+    always_assert(orch != nullptr);
+    orch->fatal = true;
+    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) {
+        return PTO2_ERROR_NONE;
+    }
+
+    int32_t expected = PTO2_ERROR_NONE;
+    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
+    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
+        return error_code;
+    }
+    return expected;
+}
+
+static void
+orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
+    int32_t latched_code = orch_mark_fatal(orch, error_code);
+#if PTO2_PROFILING
+    // Flush the active scope's peaks before the FATAL line so the diagnostic
+    // context lands adjacent in the log. Latched internally — safe to call
+    // from every cascaded report_fatal.
+    scope_stats_on_fatal();
+#endif
+
+    if (fmt == nullptr || fmt[0] == '\0') {
+        if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
+            unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
+        } else {
+            unified_log_error(func, "FATAL(code=%d)", error_code);
+        }
+        return;
+    }
+
+    char message[1024];
+    vsnprintf(message, sizeof(message), fmt, args);
+    if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
+        unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message);
+        return;
+    }
+    unified_log_error(func, "FATAL(code=%d): %s", error_code, message);
+}
+
+void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) {
+    auto *orch = this;
+    va_list args;
+    va_start(args, fmt);
+    orch_report_fatal_v(orch, error_code, func, fmt, args);
+    va_end(args);
+}
+
+static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) {
+    uint32_t next = orch->fanin_seen_current_epoch + 1;
+    if (next == 0) {
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            memset(
+                orch->fanin_seen_epoch[r], 0,
+                static_cast<size_t>(orch->sm_header->rings[r].task_window_size) * sizeof(uint32_t)
+            );
+        }
+        next = 1;
+    }
+    orch->fanin_seen_current_epoch = next;
+    return next;
+}
+
+struct PTO2FaninBuilder {
+    PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) :
+        count(0),
+        spill_start(0),
+        orch(orch),
+        seen_epoch(seen_epoch),
+        spill_pool(spill_pool) {}
+    int32_t count{0};
+    int32_t spill_start{0};
+    PTO2OrchestratorState *orch{nullptr};
+    uint32_t seen_epoch{0};
+    PTO2FaninPool &spill_pool;
+    PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP];
+
+    template <typename Fn>
+    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const {
+        return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast<Fn &&>(fn));
+    }
+
+    bool mark_seen(uint8_t prod_ring, int32_t prod_slot) {
+        if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) {
+            return false;
+        }
+        uint32_t *seen = orch->fanin_seen_epoch[prod_ring];
+        uint32_t slot = static_cast<uint32_t>(prod_slot);
+        if (seen[slot] == seen_epoch) {
+            return true;
+        }
+        seen[slot] = seen_epoch;
+        return false;
+    }
+};
+
+static bool append_fanin_or_fail(
+    PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state,
+    PTO2FaninBuilder *fanin_builder, uint8_t ring_id
+) {
+    if (fanin_builder->mark_seen(prod_ring, prod_slot)) {
+        return true;
+    }
+
+    if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) {
+        fanin_builder->inline_slots[fanin_builder->count++] = prod_state;
+        return true;
+    }
+
+    PTO2FaninPool &fanin_pool = fanin_builder->spill_pool;
+    if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
+        return false;
+    }
+    int32_t spill_idx = fanin_pool.top;
+    PTO2FaninSpillEntry *entry = fanin_pool.alloc();
+    if (entry == nullptr) {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
+        return false;
+    }
+    if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) {
+        fanin_builder->spill_start = spill_idx;
+    }
+    entry->slot_state = prod_state;
+    fanin_builder->count++;
+    return true;
+}
+
+static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
+
+struct PTO2PreparedTask {
+    PTO2TaskId task_id = PTO2TaskId::invalid();
+    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
+    PTO2TaskDescriptor *task = nullptr;
+    PTO2TaskPayload *payload = nullptr;
+    PTO2TaskSlotState *slot_state = nullptr;
+};
+
+static PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args) {
+    PTO2OutputLayout layout;
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {
+            continue;
+        }
+        layout.offsets[i] = layout.total_output_size;
+        layout.buffer_sizes[i] =
+            PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+        layout.total_output_size += layout.buffer_sizes[i];
+    }
+    return layout;
+}
+
+static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) {
+    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
+
+    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
+    if (scope_task_count < allocator.window_size() - 1) {
+        return true;
+    }
+
+    int32_t active_count = allocator.active_count();
+
+    LOG_ERROR("========================================");
+    LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id);
+    LOG_ERROR("========================================");
+    LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size());
+    LOG_ERROR("  scope_depth:        %d", orch->scope_stack_top + 1);
+    LOG_ERROR("  ring_id:            %d", ring_id);
+    LOG_ERROR("  scope_task_count:   %d", scope_task_count);
+    LOG_ERROR("  active_tasks:       %d / %d", active_count, allocator.window_size());
+    LOG_ERROR("Root Cause:");
+    LOG_ERROR("  Tasks within a scope hold a fanout_count reference that is only");
+    LOG_ERROR("  released at scope_end. When scope task count >= window_size,");
+    LOG_ERROR("  no slots can be reclaimed -> deadlock.");
+    LOG_ERROR("Solution:");
+    LOG_ERROR("  1. Reduce tasks per scope (use batching/unroll)");
+    LOG_ERROR("  2. Increase task window (current: %d)", allocator.window_size());
+    LOG_ERROR("     Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
+    LOG_ERROR("     Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2>");
+    LOG_ERROR("  3. Split work across multiple scopes");
+    LOG_ERROR("========================================");
+    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
+    return false;
+}
+
+static void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count) {
+    for (int32_t i = 0; i < tensor_count; i++) {
+        __builtin_prefetch(&payload->tensors[i], 1, 3);
+        __builtin_prefetch(reinterpret_cast<char *>(&payload->tensors[i]) + 64, 1, 3);
+    }
+    for (int32_t i = 0; i < scalar_count; i += 8) {
+        __builtin_prefetch(&payload->scalars[i], 1, 3);
+    }
+    __builtin_prefetch(payload, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 64, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 128, 1, 3);
+}
+
+static bool prepare_task(
+    PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask,
+    PTO2PreparedTask *out
+) {
+    uint8_t ring_id = orch->current_ring_id();
+    auto &allocator = orch->rings[ring_id].task_allocator;
+
+    if (!check_scope_can_accept_task(orch, allocator, ring_id)) {
+        return false;
+    }
+
+    out->alloc_result = allocator.alloc(total_output_size);
+    if (out->alloc_result.failed()) {
+        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
+        return false;
+    }
+
+    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
+    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
+    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
+    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
+
+    prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
+
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init_data_from_layout() skip the
+    // O(window_size) bind loop. Both writes hit the same 64B slot_state
+    // cache line we're about to dirty below, so the extra cost is two
+    // stores on an already-hot line. Must precede the scheduler
+    // wiring.queue.push at the end of submit_task_common — that push is
+    // the first read of slot_state->task / slot_state->payload by another
+    // thread.
+    out->slot_state->bind_buffers(out->payload, out->task);
+
+    // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
+    //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
+    //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
+    // Fields immutable after RingSchedState::init_data_from_layout():
+    //   ring_id
+    // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
+    // observers); set to PENDING here when orchestrator actually reuses the slot.
+    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    int16_t block_num = args.launch_spec.core_num();
+    out->slot_state->total_required_subtasks =
+        static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
+    out->slot_state->logical_block_num = block_num;
+    out->slot_state->active_mask = active_mask;
+    // fanin_count is set by scheduler during wiring
+    scope_tasks_push(orch, out->slot_state);
+
+    return true;
+}
+
+// =============================================================================
+// Scope Management
+// =============================================================================
+
+static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
+    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
+        // scope_tasks lives in the per-Worker arena (single backing allocation),
+        // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP ==
+        // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot
+        // budget — hitting it means every ring is saturated, so no further push
+        // could succeed regardless of buffer growth.
+        orch->report_fatal(
+            PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__,
+            "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity
+        );
+        return;
+    }
+    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
+}
+
+void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
+    auto *orch = this;
+    if (orch->fatal) {
+        return;
+    }
+    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
+    if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
+        return;
+    }
+
+    bool already_in_manual_scope = orch->in_manual_scope();
+    ++orch->scope_stack_top;
+    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
+    if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
+        orch->manual_begin_depth = orch->scope_stack_top;
+    }
+#if PTO2_PROFILING
+    // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the
+    // collector call: when disabled we pay nothing. Sample the current ring's
+    // task/heap start-end and tensormap usage at the scope boundary.
+    if (is_scope_stats_enabled()) {
+        uint8_t ring_id = orch->current_ring_id();
+        auto &alloc = orch->rings[ring_id].task_allocator;
+        int32_t dep_pool_tail = 0;
+        int32_t dep_pool_top = 0;
+        if (orch->scheduler) {
+            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
+        }
+        scope_stats_begin(
+            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
+            dep_pool_top, orch->tensor_map.current_used()
+        );
+    }
+#endif
+}
+
+void PTO2OrchestratorState::end_scope() {
+    auto *orch = this;
+    if (orch->fatal) {
+        return;
+    }
+    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
+
+    // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks
+    // via scheduler->on_scope_end, so the end record reflects the scope's
+    // occupancy at close, not the residual after teardown.
+#if PTO2_PROFILING
+    // Gate via is_scope_stats_enabled() (see begin_scope). One collector call
+    // emits the end-boundary record and tears down bookkeeping.
+    if (is_scope_stats_enabled()) {
+        uint8_t ring_id = orch->current_ring_id();
+        auto &alloc = orch->rings[ring_id].task_allocator;
+        int32_t dep_pool_tail = 0;
+        int32_t dep_pool_top = 0;
+        if (orch->scheduler) {
+            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
+        }
+        scope_stats_end(
+            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
+            dep_pool_top, orch->tensor_map.current_used()
+        );
+    }
+#endif
+
+#if PTO2_ORCH_PROFILING
+    uint64_t _se0 = get_sys_cnt_aicpu();
+#endif
+
+    bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
+    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
+    int32_t count = orch->scope_tasks_size - begin;
+    if (ending_manual_scope) {
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    }
+
+    if (orch->scheduler && count > 0) {
+        orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
+    }
+
+    // Rewind the task buffer — these entries are no longer needed
+    orch->scope_tasks_size = begin;
+
+#if PTO2_ORCH_PROFILING
+    uint64_t _se1 = get_sys_cnt_aicpu();
+    g_orch_scope_end_cycle += (_se1 - _se0);
+#endif
+}
+
+// =============================================================================
+// Task Submission
+// =============================================================================
+
+// Shared body for submit_task / submit_dummy_task. Caller has already validated
+// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot
+// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin
+// computation (explicit_deps + auto), output registration, slot init, and pushes
+// to the scheduler wiring queue.
+static TaskOutputTensors submit_task_common(
+    PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id,
+    int32_t aiv0_kernel_id, int32_t aiv1_kernel_id
+) {
+    CYCLE_COUNT_START();
+    TaskOutputTensors result;
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) {
+        return result;
+    }
+    uint8_t ring_id = prepared.task_id.ring();
+    PTO2SchedulerState *sched = orch->scheduler;
+    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
+    PTO2TaskId task_id = prepared.task_id;
+    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+    result.set_task_id(task_id);
+
+    // dep_gen capture point: snapshot the orch submit_task inputs while the
+    // tensormap is still in its pre-lookup state for this task. Replay reads
+    // these records offline to reconstruct the complete dep graph — the sole
+    // source of truth for fanout now that the swimlane hot path no longer
+    // records it.
+    if (is_dep_gen_enabled()) {
+        const void *tensor_ptrs[MAX_TENSOR_ARGS];
+        // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record
+        // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow
+        // each tag here rather than letting the AICPU writer reinterpret a
+        // 4×-wider array as bytes — that path silently lost two of every three
+        // tags on little-endian and synthesized phantom self-edges in replay.
+        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
+        // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at
+        // MAX_TENSOR_ARGS: defensive against any future builder bypass /
+        // shared-memory bit-flip that could otherwise overrun the two
+        // MAX_TENSOR_ARGS-sized stack buffers above.
+        const int tc_raw = args.tensor_count();
+        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
+        for (int i = 0; i < tc; i++) {
+            // OUTPUT slots carry create_info (not yet a Tensor); skip them —
+            // they have no producer to look up and replay's per-tensor loop
+            // also skips OUTPUT.
+            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref();
+            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
+        }
+        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
+        dep_gen_aicpu_record_submit(
+            task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8,
+            static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()),
+            kernel_ids_capture
+        );
+    }
+
+    PTO2FaninBuilder fanin_builder(orch, orch->rings[ring_id].fanin_pool, next_fanin_seen_epoch(orch));
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    // === STEP 2: Sync TensorMap validity and optional cleanup ===
+    // Read current last_task_alive from shared memory for this ring
+    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
+
+    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
+
+    CYCLE_COUNT_LAP(g_orch_sync_cycle);
+
+    for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
+        PTO2TaskId dep_task_id = args.explicit_dep(i);
+        if (!dep_task_id.is_valid()) {
+            orch->report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids"
+            );
+            return result;
+        }
+        uint8_t dep_ring_id = dep_task_id.ring();
+        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_ring_id];
+        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
+        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (dep_local_task_id < dep_last_task_alive) {
+            continue;
+        }
+        int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id);
+        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot);
+        if (!append_fanin_or_fail(orch, dep_ring_id, dep_slot, producer_slot_state, &fanin_builder, ring_id)) {
+            return result;
+        }
+    }
+
+    // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) ===
+    DepInputs dep_inputs{
+        args.tensor_count(),       args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()),
+        args.explicit_deps_data(),
+    };
+
+    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
+        uint8_t prod_ring = producer_task_id.ring();
+        PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->rings[prod_ring];
+        int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast<int32_t>(producer_task_id.local()));
+        PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot);
+        return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, &fanin_builder, ring_id);
+    };
+
+    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) {
+        return result;
+    }
+
+    CYCLE_COUNT_LAP(g_orch_lookup_cycle);
+
+    // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
+    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
+
+    CYCLE_COUNT_LAP(g_orch_insert_cycle);
+
+    // === STEP 5: Batch-write to GM (single cache line burst) ===
+    // Deferred from allocation phase to avoid scattered GM writes that get
+    // evicted by TensorMap lookup/insert cache pressure.
+    __builtin_prefetch(&task, 1, 1);
+    task.task_id = task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    // Increment fanout_count on each producer (no lock — only orch writes this field).
+    // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count.
+    for_each_fanin_storage(
+        fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool,
+        [](PTO2TaskSlotState *producer) {
+            producer->fanout_count++;
+        }
+    );
+
+    int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP);
+    // Store fanin metadata in payload for scheduler to iterate
+    payload.fanin_actual_count = fanin_builder.count;
+    payload.fanin_spill_start = fanin_builder.spill_start;
+    payload.fanin_spill_pool = &fanin_builder.spill_pool;
+    for (int i = 0; i < inline_count; i++) {
+        payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i];
+    }
+
+    payload.init(args, result, prepared.alloc_result, layout);
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        if (args.scalar_count() > 0) {
+            set_dump_args_task_scalar_dtypes(
+                task_id.raw, static_cast<uint32_t>(args.scalar_count()), args.scalar_dtypes()
+            );
+        }
+        // Selective vs full dump is latched at dump_args_init from DumpDataHeader
+        // (host-decided before any dispatch), so it is race-free regardless of
+        // submission order. Here we only record each marked task's arg mask and
+        // metadata flags, which selective collection consults.
+        if (args.dump_arg_mask() != 0) {
+            set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask());
+        }
+    }
+#endif
+
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+#if PTO2_ORCH_PROFILING
+    g_orch_args_atomic_count += 2;  // fanout_lock.store + fanout_count.store
+#endif
+
+    // === STEP 6: push to wiring queue ===
+    // Deferred wiring: orchestrator only stores dependency metadata and increments
+    // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished)
+    // is handled asynchronously by scheduler thread 0 via the wiring queue.
+    // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness
+    while (!sched->wiring.queue.push(&cur_slot_state)) {
+        SPIN_WAIT_HINT();
+    }
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+    return result;
+}
+
+TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    auto *orch = this;
+
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg Detected!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("This is a bug in the orchestration code.");
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+    // === Validate submit inputs ===
+    ActiveMask active_mask = mixed_kernels.to_active_mask();
+    always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
+
+    int16_t block_num = args.launch_spec.core_num();
+    always_assert(block_num >= 1 && "block_num must be >= 1");
+
+    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
+    // it to the aiv0 slot.  This guarantees the dispatch path can always use
+    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
+    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
+    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
+    MixedKernels normalized = mixed_kernels;
+    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+    if (!has_aic && has_aiv1 && !has_aiv0) {
+        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+        active_mask = normalized.to_active_mask();
+    }
+
+    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+    if (block_num > 1 && args.launch_spec.require_sync_start()) {
+        // Deadlock check: block_num >= total available slots of the required type.
+        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
+        // For AIV:     limit is total_aiv_count.
+        PTO2ResourceShape shape = active_mask.to_shape();
+        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+        if (limit > 0 && block_num > limit) {
+            report_fatal(
+                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
+                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
+            );
+            return TaskOutputTensors{};
+        }
+        active_mask.set_sync_start();
+    }
+
+    return submit_task_common(
+        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
+    );
+}
+
+// Submit a dependency-only task: full dependency graph participation
+// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
+// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
+// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
+// shape as submit_task; scalars are permitted but never consumed.
+TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const L0TaskArgs &args) {
+    auto *orch = this;
+
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+
+    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+}
+
+TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const L0TaskArgs &args) {
+    auto *orch = this;
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    if (args.tensor_count() <= 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+        return TaskOutputTensors{};
+    }
+    if (args.scalar_count() != 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+        return TaskOutputTensors{};
+    }
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {
+            report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
+            );
+            return TaskOutputTensors{};
+        }
+    }
+
+    CYCLE_COUNT_START();
+
+    if (args.has_error) {
+        report_fatal(
+            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
+        return TaskOutputTensors{};
+    }
+
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    task.task_id = prepared.task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    TaskOutputTensors outputs;
+    outputs.set_task_id(prepared.task_id);
+    payload.init(args, outputs, prepared.alloc_result, layout);
+    payload.fanin_actual_count = 0;
+    payload.fanin_spill_start = 0;
+    payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+
+    if (prepared.slot_state != nullptr) {
+        // Hidden alloc tasks complete inline in the orchestrator before any
+        // consumer can exist, so they have no fanout to notify and no worker
+        // subtasks to retire. Running the full on_task_complete path
+        // would only pay unnecessary fanout_lock / traversal overhead here.
+        // The generic slot initialization done in prepare_task() is still
+        // required so scope_end can release the producer-side reference and
+        // drive the slot to CONSUMED, but worker dispatch fields are never
+        // observed for hidden alloc tasks.
+        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+    }
+    orch->inline_completed_tasks++;
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+
+    return outputs;
+}
+
+// =============================================================================
+// Flow Control
+// =============================================================================
+
+void PTO2OrchestratorState::mark_done() {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t total_tasks = orch->rings[r].task_allocator.active_count();
+        if (total_tasks > 0) {
+            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
+        }
+        auto &fanin_pool = orch->rings[r].fanin_pool;
+        if (fanin_pool.top > 1) {
+            LOG_INFO_V0(
+                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
+                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
+            );
+        }
+    }
+    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
+    orch->scope_tasks_size = 0;
+    orch->scope_stack_top = -1;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
+    g_orch_submit_idx = 0;
+#endif
+}
+
+#if PTO2_ORCH_PROFILING
+PTO2OrchProfilingData orchestrator_get_profiling() {
+    PTO2OrchProfilingData d;
+    d.sync_cycle = g_orch_sync_cycle;
+    d.alloc_cycle = g_orch_alloc_cycle;
+    d.args_cycle = g_orch_args_cycle;
+    d.lookup_cycle = g_orch_lookup_cycle;
+    d.insert_cycle = g_orch_insert_cycle;
+    d.fanin_cycle = g_orch_fanin_cycle;
+    d.scope_end_cycle = g_orch_scope_end_cycle;
+    d.submit_count = g_orch_submit_count;
+    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
+    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
+    d.alloc_atomic_count = g_orch_alloc_atomic_count;
+    d.args_atomic_count = g_orch_args_atomic_count;
+    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
+
+    // Reset
+    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
+    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
+    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
+    g_orch_submit_count = 0;
+    g_orch_submit_idx = 0;
+    g_orch_alloc_wait_cycle = 0;
+    g_orch_fanin_wait_cycle = 0;
+    g_orch_alloc_atomic_count = 0;
+    g_orch_args_atomic_count = 0;
+    g_orch_scope_end_atomic_count = 0;
+    return d;
+}
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h
new file mode 100644
index 000000000..a8ed3817f
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Orchestrator Interface
+ *
+ * The Orchestrator is responsible for:
+ * 1. Executing the orchestration function (Turing-complete control flow)
+ * 2. Allocating intermediate buffers from the heap
+ * 3. Submitting tasks via async InCore function calls
+ * 4. Building the dependency graph using TensorMap
+ * 5. Managing buffer scopes for lifecycle control
+ *
+ * The Orchestrator can run on either:
+ * - Host CPU (lower latency for complex control, easier debugging)
+ * - Device AI_CPU (lower latency for task submission)
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "common/l2_swimlane_profiling.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_submit_types.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"
+
+/**
+ * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds
+ * arena offsets for every sub-region the orchestrator owns (per-ring fanin
+ * pools, scope arrays, plus the nested PTO2TensorMap layout).
+ */
+struct PTO2OrchestratorLayout {
+    size_t off_fanin_pool[PTO2_MAX_RING_DEPTH];
+    size_t off_fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+    size_t off_scope_tasks;
+    size_t off_scope_begins;
+    PTO2TensorMapLayout tensor_map;
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    int32_t scope_tasks_cap;
+    uint64_t scope_stack_capacity;
+};
+
+// =============================================================================
+// Orchestrator State
+// =============================================================================
+
+/**
+ * Orchestrator state structure (private to Orchestrator)
+ *
+ * Contains all state needed for task graph construction and buffer management.
+ */
+struct PTO2OrchestratorState {
+    // === SHARED MEMORY ACCESS ===
+    PTO2SharedMemoryHeader *sm_header;
+
+    // === PER-RING RESOURCES ===
+    PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
+    uint32_t *fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+    uint32_t fanin_seen_current_epoch{1};
+
+    // === TENSOR MAP (Private) ===
+    PTO2TensorMap tensor_map;  // Producer lookup
+
+    // === SCOPE STACK (Private) ===
+    // Single contiguous buffer of task IDs, partitioned by scope level.
+    // scope_begins[i] is the index into scope_tasks where scope i starts.
+    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
+    PTO2TaskSlotState **scope_tasks;  // Flat buffer of taskSlotState (all scopes concatenated)
+    int32_t scope_tasks_size;         // Number of task IDs currently in the buffer
+    int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
+    int32_t *scope_begins;            // scope_begins[i] = start index of scope i in scope_tasks
+    int32_t scope_stack_top;          // Current top of stack (-1 = no scope open)
+    uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
+    int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
+
+    // === SCHEDULER REFERENCE ===
+    // Note: In simulated mode, orchestrator and scheduler share address space
+    // In real mode, they communicate via shared memory only
+    PTO2SchedulerState *scheduler;  // For simulated mode only
+
+    // Total core counts set once at executor init; used for submit-time deadlock detection.
+    int32_t total_cluster_count{0};  // AIC cores = MIX clusters
+    int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
+#if PTO2_PROFILING
+    // L2 swimlane_level copied from get_l2_swimlane_level().
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // === GM HEAP (for output buffers) ===
+    void *gm_heap_base;     // Base address of GM heap
+    uint64_t gm_heap_size;  // Total size of GM heap (all rings)
+
+    // === FATAL ERROR ===
+    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
+    // Cross-thread notification uses shared memory orch_error_code (atomic)
+    bool fatal;
+
+    // Hidden alloc tasks complete synchronously inside the orchestrator and
+    // therefore bypass the executor's normal worker-completion counter path.
+    // The executor adds this count into its completed_tasks_ progress counter
+    // after orchestration finishes so shutdown/profiling totals remain closed.
+    int64_t inline_completed_tasks{0};
+
+    // === STATISTICS ===
+#if PTO2_PROFILING
+    int64_t tasks_submitted;
+    int64_t buffers_allocated;
+    int64_t bytes_allocated;
+#endif
+
+    /**
+     * Get current ring index from scope depth.
+     * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+     */
+    uint8_t current_ring_id() const {
+        int32_t depth = scope_stack_top;
+        if (depth < 0) depth = 0;
+        return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1;
+    }
+
+    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
+
+    // === Cold-path API (defined in pto_orchestrator.cpp) ===
+
+    // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
+    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
+    // the nested tensor_map layout. Returned layout is consumed by
+    // init_data_from_layout.
+    static PTO2OrchestratorLayout reserve_layout(
+        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+    );
+    static PTO2OrchestratorLayout reserve_layout(
+        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
+    );
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
+        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
+    // Forget pointers; arena owns the backing buffers.
+    void destroy();
+    void set_scheduler(PTO2SchedulerState *scheduler);
+    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
+    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
+    void end_scope();
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    TaskOutputTensors submit_dummy_task(const L0TaskArgs &args);
+    TaskOutputTensors alloc_tensors(const L0TaskArgs &args);
+    void mark_done();
+};
+
+// =============================================================================
+// Orchestrator Profiling Data
+// =============================================================================
+
+#if PTO2_ORCH_PROFILING
+struct PTO2OrchProfilingData {
+    uint64_t sync_cycle;
+    uint64_t alloc_cycle;  // Combined task slot + heap allocation
+    uint64_t args_cycle;
+    uint64_t lookup_cycle;
+    uint64_t insert_cycle;
+    uint64_t fanin_cycle;
+    uint64_t scope_end_cycle;
+    int64_t submit_count;
+    // Wait time tracking for blocking phases
+    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
+    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
+    // Atomic operation counts per phase
+    uint64_t alloc_atomic_count;
+    uint64_t args_atomic_count;
+    uint64_t scope_end_atomic_count;
+};
+
+PTO2OrchProfilingData orchestrator_get_profiling();
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp
new file mode 100644
index 000000000..f6009dc57
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Implementation
+ *
+ * Implements DepListPool ring buffer for zero-overhead dependency management.
+ * TaskAllocator methods are defined inline in pto_ring_buffer.h.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_ring_buffer.h"
+#include <inttypes.h>
+#include <string.h>
+#include "common/unified_log.h"
+#include "scheduler/pto_scheduler.h"
+
+static void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code) {
+    if (error_code_ptr == nullptr) {
+        return;
+    }
+    int32_t expected = PTO2_ERROR_NONE;
+    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
+}
+
+// =============================================================================
+// Fanin Spill Pool Implementation
+// =============================================================================
+void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    if (sm_last_task_alive <= reclaim_task_cursor) return;
+
+    int32_t scan_end = sm_last_task_alive;
+    for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) {
+        PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id);
+        if (payload.fanin_spill_pool != this) {
+            continue;
+        }
+
+        int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
+        int32_t spill_edge_count = payload.fanin_actual_count - inline_count;
+        if (spill_edge_count > 0) {
+            advance_tail(payload.fanin_spill_start + spill_edge_count);
+        }
+    }
+    reclaim_task_cursor = scan_end;
+}
+
+bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;
+
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+
+        spin_count++;
+
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("  - Needed:        %d entries", needed);
+            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR("  - current_task:    %d", current);
+            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR("  last_task_alive is not advancing, so fanin spill pool tail");
+            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
+
+// =============================================================================
+// Dependency List Pool Implementation
+// =============================================================================
+void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
+        int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
+        if (mark > 0) {
+            advance_tail(mark);
+        }
+        last_reclaimed = sm_last_task_alive;
+    }
+}
+
+bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;
+
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+
+        spin_count++;
+
+        // Progress detection: reset spin counter if last_task_alive advances
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("  - Needed:        %d entries", needed);
+            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR("  - current_task:    %d", current);
+            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR("  last_task_alive is not advancing, so dep pool tail");
+            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h
new file mode 100644
index 000000000..b07435197
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Data Structures
+ *
+ * Implements ring buffer designs for zero-overhead memory management:
+ *
+ * 1. TaskAllocator - Unified task slot + output buffer allocation
+ *    - Combines task ring (slot allocation) and heap ring (output buffer allocation)
+ *    - Single spin-wait loop with unified back-pressure and deadlock detection
+ *    - O(1) bump allocation for both task slots and heap buffers
+ *
+ * 2. FaninPool - Fanin spill entry allocation
+ *    - Ring buffer for spilled fanin entries
+ *    - O(1) append allocation
+ *    - Implicit reclamation with task ring
+ *
+ * 3. DepListPool - Dependency list entry allocation
+ *    - Ring buffer for linked list entries
+ *    - O(1) prepend operation
+ *    - Implicit reclamation with task ring
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef PTO_RING_BUFFER_H
+#define PTO_RING_BUFFER_H
+
+#include <algorithm>
+#include <inttypes.h>
+#include <type_traits>
+
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "common/unified_log.h"
+
+#if PTO2_PROFILING
+// Heap-ring wrap reporting — the allocator is the only place each individual
+// wrap is observable, so it notifies the scope_stats collector here. Gated:
+// pays nothing (no include, no call) when profiling is compiled out.
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
+// Block notification interval (in spin counts)
+#define PTO2_BLOCK_NOTIFY_INTERVAL 10000
+// Alloc spin limit - after this, report deadlock and exit
+#define PTO2_ALLOC_SPIN_LIMIT 100000
+
+// Dep pool spin limit - if exceeded, dep pool capacity too small for workload
+#define PTO2_DEP_POOL_SPIN_LIMIT 100000
+
+// =============================================================================
+// Task Allocator (unified task slot + heap buffer allocation)
+// =============================================================================
+
+/**
+ * Unified task slot + heap buffer allocator.
+ *
+ * Since task and heap are always allocated together and the orchestrator is
+ * single-threaded, both pointers (task index, heap top) are tracked locally
+ * and published to shared memory via plain store — no fetch_add or CAS needed.
+ *
+ * The alloc() method checks both resources BEFORE committing to either,
+ * eliminating the need for rollback on partial failure.
+ */
+class PTO2TaskAllocator {
+public:
+    /**
+     * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
+     */
+    void init(
+        PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        int32_t initial_local_task_id = 0
+    ) {
+        descriptors_ = descriptors;
+        window_size_ = window_size;
+        window_mask_ = window_size - 1;
+        current_index_ptr_ = current_index_ptr;
+        last_alive_ptr_ = last_alive_ptr;
+        heap_base_ = heap_base;
+        heap_size_ = heap_size;
+        error_code_ptr_ = error_code_ptr;
+        local_task_id_ = initial_local_task_id;
+        heap_top_ = 0;
+        heap_tail_ = 0;
+        last_alive_seen_ = 0;
+    }
+
+    /**
+     * Allocate a task slot and its associated output buffer in one call.
+     *
+     * Both task index and heap top are maintained as local counters and
+     * published to shared memory only on success. Since the orchestrator is
+     * single-threaded, no CAS or fetch_add is needed — just check-then-commit.
+     *
+     * @param output_size  Total packed output size in bytes (0 = no heap needed)
+     * @return Allocation result; check failed() for errors
+     */
+    PTO2TaskAllocResult alloc(int32_t output_size) {
+        uint64_t aligned_size =
+            output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
+
+        int spin_count = 0;
+        int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        int32_t last_alive = prev_last_alive;
+        update_heap_tail(last_alive);
+        bool blocked_on_heap = false;
+#if PTO2_ORCH_PROFILING
+        uint64_t wait_start = 0;
+        bool waiting = false;
+#endif
+
+        while (true) {
+            // Check both resources; commit only if both available
+            if (local_task_id_ - last_alive + 1 < window_size_) {
+                void *heap_ptr = try_bump_heap(aligned_size);
+                if (heap_ptr) {
+                    int32_t task_id = commit_task();
+#if PTO2_ORCH_PROFILING
+                    record_wait(spin_count, wait_start, waiting);
+#endif
+                    return {task_id, task_id & window_mask_, heap_ptr, static_cast<char *>(heap_ptr) + aligned_size};
+                }
+                blocked_on_heap = true;
+            } else {
+                blocked_on_heap = false;
+            }
+
+            // Spin: wait for scheduler to advance last_task_alive
+            spin_count++;
+#if PTO2_ORCH_PROFILING
+            if (!waiting) {
+                wait_start = get_sys_cnt_aicpu();
+                waiting = true;
+            }
+#endif
+            last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+            update_heap_tail(last_alive);
+            if (last_alive > prev_last_alive) {
+                spin_count = 0;
+                prev_last_alive = last_alive;
+            } else {
+                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) {
+                    LOG_WARN(
+                        "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d",
+                        local_task_id_ - last_alive, window_size_, heap_top_, heap_size_,
+                        blocked_on_heap ? "heap" : "task", spin_count
+                    );
+                }
+                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT) {
+                    report_deadlock(output_size, blocked_on_heap);
+                    return {-1, -1, nullptr, nullptr};
+                }
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+
+    // =========================================================================
+    // State queries
+    // =========================================================================
+
+    int32_t active_count() const {
+        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        return local_task_id_ - last_alive;
+    }
+
+    // Task ring start/end: tail = oldest live task (last_task_alive), head =
+    // next task id to allocate. head - tail == active_count().
+    int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); }
+    int32_t task_head() const { return local_task_id_; }
+
+    int32_t window_size() const { return window_size_; }
+
+    uint64_t heap_available() const {
+        uint64_t tail = heap_tail_;
+        if (heap_top_ >= tail) {
+            uint64_t at_end = heap_size_ - heap_top_;
+            uint64_t at_begin = tail;
+            return at_end > at_begin ? at_end : at_begin;
+        }
+        return tail - heap_top_;
+    }
+
+    uint64_t heap_top() const { return heap_top_; }
+    // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is
+    // the end (next allocation). heap_top - heap_tail == heap_used_bytes().
+    uint64_t heap_tail() const { return heap_tail_; }
+    uint64_t heap_capacity() const { return heap_size_; }
+
+    uint64_t heap_used_bytes() const {
+        if (heap_size_ == 0) return 0;
+        return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
+    }
+
+private:
+    // --- Task Ring ---
+    PTO2TaskDescriptor *descriptors_ = nullptr;
+    int32_t window_size_ = 0;
+    int32_t window_mask_ = 0;
+    std::atomic<int32_t> *current_index_ptr_ = nullptr;
+    std::atomic<int32_t> *last_alive_ptr_ = nullptr;
+
+    // --- Heap ---
+    void *heap_base_ = nullptr;
+    uint64_t heap_size_ = 0;
+
+    // --- Local state (single-writer, no atomics needed) ---
+    int32_t local_task_id_ = 0;    // Next task ID to allocate
+    uint64_t heap_top_ = 0;        // Current heap allocation pointer
+    uint64_t heap_tail_ = 0;       // Heap reclamation pointer (derived from consumed tasks)
+    int32_t last_alive_seen_ = 0;  // last_task_alive at last heap_tail derivation
+
+    // --- Shared ---
+    std::atomic<int32_t> *error_code_ptr_ = nullptr;
+
+    // =========================================================================
+    // Internal helpers
+    // =========================================================================
+
+    /**
+     * Commit a task slot: bump local counter and publish to shared memory.
+     * Must only be called after space check has passed.
+     */
+    int32_t commit_task() {
+        int32_t task_id = local_task_id_++;
+        current_index_ptr_->store(local_task_id_, std::memory_order_release);
+        return task_id;
+    }
+
+    /**
+     * Derive heap_tail_ from the last consumed task's packed_buffer_end.
+     *
+     * Every task has a valid packed_buffer_end (equal to packed_buffer_base
+     * for zero-size allocations), so the last consumed task always determines
+     * the correct heap_tail — no backward scan needed.
+     */
+    void update_heap_tail(int32_t last_alive) {
+        if (last_alive <= last_alive_seen_) return;
+        last_alive_seen_ = last_alive;
+
+        PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_];
+        uint64_t old_tail = heap_tail_;
+        heap_tail_ =
+            static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
+#if PTO2_PROFILING
+        // Reclaim pointer moves forward monotonically in ring order; a decrease
+        // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at
+        // most one wrap per call). Report it so scope_stats can unroll.
+        if (is_scope_stats_enabled() && heap_tail_ < old_tail) {
+            scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM);
+        }
+#else
+        (void)old_tail;
+#endif
+    }
+
+    /**
+     * Bump the heap pointer for the given allocation size.
+     * Returns the allocated pointer, or nullptr if insufficient space.
+     * When alloc_size == 0, returns current position without advancing.
+     */
+    void *try_bump_heap(uint64_t alloc_size) {
+        uint64_t top = heap_top_;
+        if (alloc_size == 0) {
+            return static_cast<char *>(heap_base_) + top;
+        }
+        uint64_t tail = heap_tail_;
+        void *result;
+
+        if (top >= tail) {
+            uint64_t space_at_end = heap_size_ - top;
+            if (space_at_end >= alloc_size) {
+                result = static_cast<char *>(heap_base_) + top;
+                heap_top_ = top + alloc_size;
+            } else if (tail > alloc_size) {
+                LOG_DEBUG(
+                    "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail,
+                    alloc_size
+                );
+                result = heap_base_;
+                heap_top_ = alloc_size;
+#if PTO2_PROFILING
+                // Allocation pointer just wrapped past heap_size_; report it so
+                // scope_stats can unroll the wrapping offset into a monotonic value.
+                // The collector attributes the wrap to the current scope's ring.
+                if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC);
+#endif
+            } else {
+                LOG_DEBUG(
+                    "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
+                    ", heap_size=%" PRIu64,
+                    top, tail, alloc_size, heap_size_
+                );
+                return nullptr;
+            }
+        } else {
+            if (tail - top > alloc_size) {
+                result = static_cast<char *>(heap_base_) + top;
+                heap_top_ = top + alloc_size;
+            } else {
+                LOG_DEBUG(
+                    "try_bump_heap failed (top<tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
+                    ", free_gap=%" PRIu64,
+                    top, tail, alloc_size, tail - top
+                );
+                return nullptr;
+            }
+        }
+
+        return result;
+    }
+
+#if PTO2_ORCH_PROFILING
+    void record_wait(int spin_count, uint64_t wait_start, bool waiting) {
+        if (waiting) {
+            extern uint64_t g_orch_alloc_wait_cycle;
+            g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start);
+        }
+        {
+            extern uint64_t g_orch_alloc_atomic_count;
+            g_orch_alloc_atomic_count += spin_count + 1;
+        }
+    }
+#endif
+
+    /**
+     * Report deadlock with targeted diagnostics.
+     */
+    void report_deadlock(int32_t requested_output_size, bool heap_blocked) {
+        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        int32_t active_tasks = local_task_id_ - last_alive;
+        uint64_t htail = heap_tail_;
+
+        LOG_ERROR("========================================");
+        if (heap_blocked) {
+            LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!");
+        } else {
+            LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!");
+        }
+        LOG_ERROR("========================================");
+        LOG_ERROR("No progress after %d spins.", PTO2_ALLOC_SPIN_LIMIT);
+        LOG_ERROR(
+            "  Task ring:  current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks,
+            window_size_, 100.0 * active_tasks / window_size_
+        );
+        LOG_ERROR(
+            "  Heap ring:  top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail,
+            heap_size_, heap_available()
+        );
+        if (heap_blocked) {
+            LOG_ERROR("  Requested:  %d bytes", requested_output_size);
+        }
+        LOG_ERROR("Diagnosis:");
+        LOG_ERROR("  last_task_alive is stuck at %d, meaning task %d", last_alive, last_alive);
+        LOG_ERROR("  cannot transition to CONSUMED. Possible causes:");
+        LOG_ERROR("  1. Task %d still executing (subtasks not complete)", last_alive);
+        LOG_ERROR("  2. Task %d fanout not fully released (downstream not done)", last_alive);
+        LOG_ERROR("  3. Scope reference not released (scope_end not called)");
+        LOG_ERROR("  4. Orchestrator blocked here -> can't call scope_end -> circular wait");
+        LOG_ERROR("Solution:");
+        if (heap_blocked) {
+            LOG_ERROR(
+                "  Increase heap size (current: %" PRIu64 ", recommended: %" PRIu64 ")", heap_size_, heap_size_ * 2
+            );
+            LOG_ERROR("  Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_HEAP=<bytes> (e.g. %" PRIu64 ")", heap_size_ * 2);
+        } else {
+            LOG_ERROR("  Increase task window size (current: %d, recommended: %d)", window_size_, active_tasks * 2);
+            LOG_ERROR("  Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2> (e.g. %d)", active_tasks * 2);
+        }
+        LOG_ERROR("========================================");
+        if (error_code_ptr_) {
+            int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
+            error_code_ptr_->store(code, std::memory_order_release);
+        }
+    }
+};
+
+// =============================================================================
+// Fanin Spill Pool
+// =============================================================================
+
+/**
+ * Fanin spill pool structure
+ *
+ * True ring buffer for allocating spilled fanin entries.
+ * Entries are reclaimed when their consumer tasks become CONSUMED.
+ *
+ * Linear counters (top, tail) grow monotonically; the physical index
+ * is obtained via modulo: base[linear_index % capacity].
+ */
+struct PTO2FaninPool {
+    PTO2FaninSpillEntry *base;       // Pool base address
+    int32_t capacity;                // Total number of entries
+    int32_t top;                     // Linear next-allocation counter (starts from 1)
+    int32_t tail;                    // Linear first-alive counter (entries before this are dead)
+    int32_t high_water;              // Peak concurrent usage (top - tail)
+    int32_t reclaim_task_cursor{0};  // Last task id scanned for reclaim on this pool
+
+    std::atomic<int32_t> *error_code_ptr = nullptr;
+
+    void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
+        base = in_base;
+        capacity = in_capacity;
+        top = 1;
+        tail = 1;
+        high_water = 0;
+        reclaim_task_cursor = 0;
+        base[0].slot_state = nullptr;
+        error_code_ptr = in_error_code_ptr;
+    }
+
+    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
+
+    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
+
+    PTO2FaninSpillEntry *alloc() {
+        int32_t used = top - tail;
+        if (used >= capacity) {
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Fanin Spill Pool Overflow!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity);
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
+            LOG_ERROR("========================================");
+            if (error_code_ptr) {
+                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
+            }
+            return nullptr;
+        }
+        int32_t idx = top % capacity;
+        top++;
+        used++;
+        if (used > high_water) high_water = used;
+        return &base[idx];
+    }
+
+    void advance_tail(int32_t new_tail) {
+        if (new_tail > tail) {
+            tail = new_tail;
+        }
+    }
+
+    int32_t used() const { return top - tail; }
+
+    int32_t available() const { return capacity - used(); }
+};
+
+template <typename Fn>
+using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
+
+template <typename Fn>
+using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
+
+template <typename InlineSlots, typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_storage(
+    InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn
+) {
+    using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
+    static_assert(
+        std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>,
+        "fanin callback must return void or bool"
+    );
+
+    if constexpr (std::is_void_v<FaninCallbackResult>) {
+        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
+        for (int32_t i = 0; i < inline_count; i++) {
+            fn(inline_slot_states[i]);
+        }
+
+        int32_t spill_count = fanin_count - inline_count;
+        if (spill_count <= 0) {
+            return;
+        }
+
+        int32_t start_idx = spill_start % spill_pool.capacity;
+        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
+        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
+        for (int32_t i = 0; i < first_count; i++) {
+            fn(first[i].slot_state);
+        }
+
+        int32_t second_count = spill_count - first_count;
+        for (int32_t i = 0; i < second_count; i++) {
+            fn(spill_pool.base[i].slot_state);
+        }
+        return;
+    } else {
+        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
+        for (int32_t i = 0; i < inline_count; i++) {
+            if (!fn(inline_slot_states[i])) {
+                return false;
+            }
+        }
+
+        int32_t spill_count = fanin_count - inline_count;
+        if (spill_count <= 0) {
+            return true;
+        }
+
+        int32_t start_idx = spill_start % spill_pool.capacity;
+        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
+        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
+        for (int32_t i = 0; i < first_count; i++) {
+            if (!fn(first[i].slot_state)) {
+                return false;
+            }
+        }
+
+        int32_t second_count = spill_count - first_count;
+        for (int32_t i = 0; i < second_count; i++) {
+            if (!fn(spill_pool.base[i].slot_state)) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
+template <typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) {
+    return for_each_fanin_storage(
+        payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start,
+        *payload.fanin_spill_pool, static_cast<Fn &&>(fn)
+    );
+}
+
+// =============================================================================
+// Dependency List Pool
+// =============================================================================
+
+/**
+ * Dependency list pool structure
+ *
+ * True ring buffer for allocating linked list entries.
+ * Entries are reclaimed when their producer tasks become CONSUMED,
+ * as tracked by the orchestrator via dep_pool_mark per task.
+ *
+ * Linear counters (top, tail) grow monotonically; the physical index
+ * is obtained via modulo: base[linear_index % capacity].
+ */
+struct PTO2DepListPool {
+    PTO2DepListEntry *base;     // Pool base address
+    int32_t capacity;           // Total number of entries
+    int32_t top;                // Linear next-allocation counter (starts from 1)
+    int32_t tail;               // Linear first-alive counter (entries before this are dead)
+    int32_t high_water;         // Peak concurrent usage (top - tail)
+    int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation
+
+    // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
+    std::atomic<int32_t> *error_code_ptr = nullptr;
+
+    /**
+     *
+     * Initialize dependency list pool
+     * @param base      Pool base address from shared memory
+     * @param capacity  Total number of entries
+     */
+    void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
+        base = in_base;
+        capacity = in_capacity;
+        top = 1;   // Start from 1, 0 means NULL/empty
+        tail = 1;  // Match initial top (no reclaimable entries yet)
+        high_water = 0;
+        last_reclaimed = 0;
+
+        // Initialize entry 0 as NULL marker
+        base[0].slot_state = nullptr;
+        base[0].next = nullptr;
+
+        error_code_ptr = in_error_code_ptr;
+    }
+
+    /**
+     * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
+     * Safe to call multiple times — only advances tail forward.
+     *
+     * @param ring             Ring header (for reading slot dep_pool_mark)
+     * @param sm_last_task_alive Current last_task_alive from shared memory
+     */
+    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
+
+    /**
+     * Ensure dep pool for a specific ring has at least `needed` entries available.
+     * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
+     */
+    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
+
+    /**
+     * Allocate a single entry from the pool (single-thread per pool instance)
+     *
+     * @return Pointer to allocated entry, or nullptr on fatal error
+     */
+    PTO2DepListEntry *alloc() {
+        int32_t used = top - tail;
+        if (used >= capacity) {
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Overflow!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity);
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
+            LOG_ERROR("========================================");
+            if (error_code_ptr) {
+                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
+            }
+            return nullptr;
+        }
+        int32_t idx = top % capacity;
+        top++;
+        used++;
+        if (used > high_water) high_water = used;
+        return &base[idx];
+    }
+
+    /**
+     * Advance the tail pointer, reclaiming dead entries.
+     * Called by the orchestrator based on last_task_alive advancement.
+     */
+    void advance_tail(int32_t new_tail) {
+        if (new_tail > tail) {
+            tail = new_tail;
+        }
+    }
+
+    /**
+     * Prepend a task ID to a dependency list
+     *
+     * O(1) operation: allocates new entry and links to current head.
+     *
+     * @param current_head  Current list head offset (0 = empty list)
+     * @param task_slot     Task slot to prepend
+     * @return New head offset
+     */
+    PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) {
+        PTO2DepListEntry *new_entry = alloc();
+        if (!new_entry) return nullptr;
+        new_entry->slot_state = slot_state;
+        new_entry->next = cur;
+        return new_entry;
+    }
+
+    int32_t used() const { return top - tail; }
+
+    int32_t available() const { return capacity - used(); }
+};
+
+// =============================================================================
+// Ring Set (per-depth aggregate)
+// =============================================================================
+
+/**
+ * Groups a TaskAllocator and DepPool into one per-depth unit.
+ * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth.
+ */
+struct PTO2RingSet {
+    PTO2TaskAllocator task_allocator;
+    PTO2FaninPool fanin_pool;
+};
+
+#endif  // PTO_RING_BUFFER_H
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp
new file mode 100644
index 000000000..263adec8d
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Main Implementation
+ *
+ * Implements the unified runtime API that combines orchestrator and scheduler.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_runtime2.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "aicpu/device_time.h"
+#include "common/unified_log.h"
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
+// Weak fallback for HOST .so builds (never called, but satisfies linker).
+// The AICPU build links the strong symbol from platform/.../device_time.cpp.
+// Hidden visibility prevents HOST .so from polluting global symbol table.
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+
+// =============================================================================
+// Orchestration Ops Table (function-pointer dispatch for orchestration .so)
+// =============================================================================
+
+static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    return rt->orchestrator.submit_task(mixed_kernels, args);
+}
+
+static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
+    return rt->orchestrator.alloc_tensors(args);
+}
+
+static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
+    return rt->orchestrator.submit_dummy_task(args);
+}
+
+void rt_scope_begin(PTO2Runtime *rt) {
+    PTO2ScopeMode mode = rt->pending_scope_mode;
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
+    rt->orchestrator.begin_scope(mode);
+}
+
+void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); }
+
+void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); }
+
+static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
+
+void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    if (fmt == nullptr || fmt[0] == '\0') {
+        rt->orchestrator.report_fatal(error_code, func, nullptr);
+    } else {
+        char message[1024];
+        vsnprintf(message, sizeof(message), fmt, args);
+        rt->orchestrator.report_fatal(error_code, func, "%s", message);
+    }
+    va_end(args);
+}
+
+// Wait for all producers of this tensor to be safe for data access.
+// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers).
+// For reads: wait until each producer COMPLETED (done writing).
+// For writes: also wait until all consumers done reading
+//   (fanout_refcount >= fanout_count - 1, excluding scope reference).
+// Uses cycle-based timeout (checked every 1024 spins).
+// Returns false on timeout (sets orch.fatal).
+MAYBE_UNINITIALIZED_BEGIN
+static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) {
+    PTO2TaskId owner = tensor.owner_task_id;
+    PTO2OrchestratorState &orch = rt->orchestrator;
+
+    // Segmented wait: collect up to kSegmentCap producer slots, then flush by
+    // spinning on each. When the segment fills, we wait for the accumulated
+    // batch before continuing to gather more. Dedup is per-segment only; a
+    // producer that appears in two segments is waited on twice, which is
+    // idempotent (task_state is monotonic) and only adds one atomic load on
+    // the second encounter.
+    constexpr int kSegmentCap = 64;
+    const PTO2TaskSlotState *seg[kSegmentCap];
+    int seg_count = 0;
+    bool signaled = false;
+    bool failed = false;
+
+    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
+                orch.report_fatal(
+                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
+                    "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed",
+                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
+                );
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = slot.task->task_id.local();
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1) {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
+                orch.report_fatal(
+                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
+                    "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done",
+                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
+                );
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto flush_segment = [&]() {
+        for (int i = 0; i < seg_count; i++) {
+            wait_one_producer(*seg[i]);
+            if (failed) return;
+            if (!wait_for_consumers) continue;
+            wait_one_consumers(*seg[i]);
+            if (failed) return;
+        }
+        seg_count = 0;
+    };
+
+    auto try_push = [&](const PTO2TaskSlotState &s) {
+        for (int j = 0; j < seg_count; j++) {
+            if (seg[j] == &s) return;  // per-segment dedup
+        }
+        if (seg_count == kSegmentCap) {
+            flush_segment();
+            if (failed) return;
+        }
+        seg[seg_count++] = &s;
+        if (!signaled) {
+            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
+            signaled = true;
+        }
+    };
+
+    auto do_wait = [&]() {
+        // Step A: creator retention — read owner directly from tensor metadata
+        if (owner.is_valid()) {
+            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
+            try_push(s);
+            if (failed) return;
+        }
+
+        // Step B: modifier writer lookup (OverlapMap), direct callback
+        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
+            PTO2TaskId pid = entry.producer_task_id;
+            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
+            try_push(s);
+            return !failed;
+        });
+        if (failed) return;
+        flush_segment();
+    };
+
+    do_wait();
+    if (signaled) {
+        orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
+    }
+    return !failed;
+}
+MAYBE_UNINITIALIZED_END
+
+uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+    if (tensor.buffer.addr == 0) {
+        unified_log_error(
+            __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). "
+                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+        );
+        return 0;
+    }
+
+    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) {
+        return 0;
+    }
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
+    uint64_t result = 0;
+    memcpy(&result, ptr, elem_size);
+    return result;
+}
+
+void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) {
+    if (tensor.buffer.addr == 0) {
+        unified_log_error(
+            __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). "
+                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+        );
+        return;
+    }
+
+    // Wait for producer + all consumers before writing (WAW + WAR safety)
+    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) {
+        return;
+    }
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(ptr, &value, elem_size);
+}
+
+// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
+// [ScopeStats] collector. The slot is always present in the struct to keep
+// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
+// .so's null-check skips it.
+#if PTO2_PROFILING
+static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
+#endif
+
+static const PTO2RuntimeOps s_runtime_ops = {
+    .submit_task = submit_task_impl,
+    .scope_begin = rt_scope_begin,
+    .scope_end = rt_scope_end,
+    .orchestration_done = rt_orchestration_done,
+    .is_fatal = is_fatal_impl,
+    .report_fatal = rt_report_fatal,
+    .log_error = unified_log_error,
+    .log_warn = unified_log_warn,
+    .log_debug = unified_log_debug,
+    .log_info_v = unified_log_info_v,
+    .get_tensor_data = get_tensor_data,
+    .set_tensor_data = set_tensor_data,
+    .alloc_tensors = alloc_tensors_impl,
+    .submit_dummy_task = submit_dummy_task_impl,
+#if PTO2_PROFILING
+    .scope_set_site = scope_set_site_impl,
+#else
+    .scope_set_site = nullptr,
+#endif
+};
+
+// =============================================================================
+// Runtime Lifecycle (AICPU-only fixup)
+// =============================================================================
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
+    rt->ops = &s_runtime_ops;
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
+}
+
+void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
+    if (rt) {
+        rt->mode = mode;
+    }
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h
new file mode 100644
index 000000000..db4af47ed
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Main Interface
+ *
+ * This is the main header for the PTO Runtime2 system.
+ * It provides a unified API for task graph construction and execution.
+ *
+ * Key Features:
+ * - Ring buffer based memory management (zero allocation overhead)
+ * - Lazy invalidation TensorMap for dependency discovery
+ * - Scope-based buffer lifecycle management
+ * - Per-task spinlocks for concurrent fanout updates
+ * - Orchestrator-Scheduler decoupling via shared memory
+ *
+ * Usage:
+ *   1. Create runtime: PTO2Runtime create methods
+ *   2. Build task graph in orchestration function:
+ *      - begin_scope() / end_scope()
+ *      - submit_task()
+ *   3. Mark orchestration complete: mark_done()
+ *   4. Destroy runtime
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+#include "pto_submit_types.h"
+#include "pto_shared_memory.h"
+#include "pto_ring_buffer.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_orchestrator.h"
+#include "aicore_completion_mailbox.h"
+
+// =============================================================================
+// Runtime Context
+// =============================================================================
+
+/**
+ * Runtime execution mode
+ */
+enum PTO2RuntimeMode {
+    PTO2_MODE_EXECUTE = 0,    // Execute tasks on workers
+    PTO2_MODE_SIMULATE = 1,   // Simulate task execution with cycle counting
+    PTO2_MODE_GRAPH_ONLY = 2  // Build graph only, no execution
+};
+
+/**
+ * Function-pointer ops table for runtime operations.
+ *
+ * The orchestration .so calls runtime functions through this table
+ * (via pto_orchestration_api.h inline wrappers), so it has zero link
+ * dependencies on runtime .cpp files.
+ */
+typedef struct PTO2Runtime PTO2Runtime;  // forward declare for ops signatures
+
+struct PTO2RuntimeOps {
+    TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    void (*scope_begin)(PTO2Runtime *rt);
+    void (*scope_end)(PTO2Runtime *rt);
+    void (*orchestration_done)(PTO2Runtime *rt);
+    bool (*is_fatal)(PTO2Runtime *rt);
+    void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+    // Logging (populated by runtime, called by orchestration)
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+
+    // Cross-layer data access (orchestration reads/writes tensor values via runtime)
+    // Placed after logging to avoid shifting hot-path field offsets.
+    uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+    void (*set_tensor_data)(
+        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
+    );
+    TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
+    TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
+
+    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
+    // collector. Always present to keep ops-table layout stable across
+    // PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
+};
+
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
+/**
+ * PTO Runtime2 context
+ *
+ * Contains all state for orchestration and scheduling.
+ * In simulated mode, runs in single process with shared address space.
+ */
+struct PTO2Runtime {
+    // Ops table (first field — used by orchestration .so via function pointers)
+    const PTO2RuntimeOps *ops;
+    PTO2ScopeMode pending_scope_mode;
+
+    // Components
+    PTO2SharedMemoryHandle *sm_handle;
+    PTO2OrchestratorState orchestrator;
+    PTO2SchedulerState scheduler;
+    AICoreCompletionMailbox *aicore_mailbox;
+
+    // GM Heap for output buffers
+    void *gm_heap;
+    uint64_t gm_heap_size;
+    bool gm_heap_owned;  // True if we allocated it
+
+    // Mode
+    PTO2RuntimeMode mode;
+
+    // Statistics
+    int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Populated on host by
+    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
+    // aicpu_executor.cpp.
+    PTO2RuntimeArenaLayout prebuilt_layout;
+};
+
+// =============================================================================
+// Runtime Lifecycle API
+// =============================================================================
+
+/**
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
+ *
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
+ *
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
+);
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+);
+
+/**
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so we never call arena.release()
+ * here — the destructor only forgets sub-structure pointers (idempotent
+ * cleanup).
+ */
+void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
+
+/**
+ * Set execution mode
+ */
+void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode);
+
+// =============================================================================
+// Orchestration API (called by orchestration function)
+// =============================================================================
+
+/**
+ * Begin a new scope
+ *
+ * All tasks submitted within this scope will have their lifetime
+ * bounded by the scope. When scope_end() is called, the scope
+ * releases its reference to all enclosed tasks.
+ */
+void rt_scope_begin(PTO2Runtime *rt);
+
+/**
+ * End current scope
+ *
+ * Releases scope reference for all tasks submitted since scope_begin().
+ * Tasks whose refcount reaches zero will have their buffers released.
+ */
+void rt_scope_end(PTO2Runtime *rt);
+
+/**
+ * Mark orchestration as complete
+ *
+ * Signals that no more tasks will be submitted.
+ */
+void rt_orchestration_done(PTO2Runtime *rt);
+
+/**
+ * Enter fatal state explicitly from orchestration.
+ */
+void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+/**
+ * Cross-layer data access: read a tensor value by waiting for its producer.
+ */
+uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+
+/**
+ * Cross-layer data access: write a value to a tensor at given indices.
+ * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap.
+ * See set_tensor_data in pto_orchestration_api.h for full documentation.
+ */
+void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
+
+/**
+ * Slim config struct exported by orchestration .so via aicpu_orchestration_config().
+ * Shared definition with pto_orchestration_api.h (same layout, guarded).
+ */
+#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
+#define PTO2_ORCHESTRATION_CONFIG_DEFINED
+struct PTO2OrchestrationConfig {
+    int expected_arg_count;
+};
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h
new file mode 100644
index 000000000..f2715982b
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Core Type Definitions
+ *
+ * This header defines all fundamental types used by the PTO Runtime2 system:
+ * - Configuration constants
+ * - Worker types and task states
+ * - Tensor regions and task parameters
+ * - Task descriptors with fanin/fanout tracking
+ * - Dependency list entries
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <atomic>
+
+#include "profiling_config.h"
+#include "pto_constants.h"
+#include "pto_runtime_status.h"
+#include "pto2_dispatch_payload.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_submit_types.h"
+#include "pto_task_id.h"
+#include "pto_types.h"
+
+// Spin-wait hint for AICPU threads.  On real hardware the AICPU has dedicated
+// ARM A55 cores — no OS yield is needed, so the hint is a no-op.  In simulation
+// all threads share host CPU cores, so we yield to prevent starvation.
+// This header is also compiled into the Host .so (for struct definitions only),
+// where the hint is never called — the fallback no-op keeps Host builds clean.
+#if __has_include("spin_hint.h")
+#include "spin_hint.h"
+#else
+#define SPIN_WAIT_HINT() ((void)0)
+#endif
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+#include "aicpu/device_time.h"
+#endif
+
+// =============================================================================
+// Configuration Constants
+// =============================================================================
+
+// Task management
+// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
+// Actual window size is passed at runtime to runtime_reserve_layout().
+// Use pto2_task_slot(sched, task_id) for slot calculation.
+#define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+
+// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
+// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+#define PTO2_MAX_RING_DEPTH 4
+
+// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH)
+#define PTO2_HEAP_SIZE (256 * 1024 * 1024)  // 256MB per ring (1GB total)
+#define PTO2_DEP_LIST_POOL_SIZE 16384       // Per-ring dependency list pool entries
+#define PTO2_TENSORMAP_POOL_SIZE (65536)    // TensorMap entry pool
+#define PTO2_TENSORMAP_NUM_BUCKETS 4096     // Power of 2 for fast hash (4096×8B=32KB fits L1)
+
+// Scope management
+#define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
+// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot
+// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot
+// is in flight, no more tasks can ever be pushed regardless of buffer size.
+// scope_tasks_push fatals on overflow rather than growing the arena-owned
+// buffer (which would be UB on the arena's malloc'd backing).
+#define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH)
+
+// Ready queue
+#define PTO2_READY_QUEUE_SIZE 65536  // Per-shape queue size
+
+// Wiring queue
+#define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size
+
+// Fanin storage
+#define PTO2_FANIN_INLINE_CAP 64
+
+// TensorMap cleanup interval
+#define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
+#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64   // Cleanup every N retired tasks
+
+// get_tensor_data/set_tensor_data spin wait timeout in cycles.
+// ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based).
+constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL;
+
+// =============================================================================
+// Task States
+// =============================================================================
+
+/**
+ * Task state enumeration
+ *
+ * State transitions:
+ *   PENDING -> COMPLETED -> CONSUMED
+ *
+ * The slot stays in PENDING from submit through "ready in queue" and "running
+ * on a worker"; readiness and running-vs-idle are derived from fanin_refcount
+ * and per-core running_slot_state respectively, not from task_state itself.
+ *
+ * Conditions:
+ *   PENDING->COMPLETED:   all subtasks finish (set by scheduler) or task is a
+ *                         hidden alloc completed inline by the orchestrator
+ *   COMPLETED->CONSUMED:  fanout_refcount == fanout_count && state == COMPLETED
+ */
+typedef enum {
+    PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
+    PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
+    PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
+} PTO2TaskState;
+
+/**
+ * Result of a unified task allocation.
+ */
+struct PTO2TaskAllocResult {
+    int32_t task_id;    // Absolute task ID (not wrapped)
+    int32_t slot;       // task_id & (window_size - 1)
+    void *packed_base;  // Heap allocation result (nullptr if failure)
+    void *packed_end;   // packed_base + aligned output_size
+
+    bool failed() const { return task_id < 0; }
+};
+
+struct PTO2OutputLayout {
+    uint64_t offsets[MAX_TENSOR_ARGS] = {};
+    uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {};
+    int32_t total_output_size = 0;
+};
+
+// =============================================================================
+// Dependency List Entry
+// =============================================================================
+
+/**
+ * Fanin spill entry
+ * Stored in the dedicated fanin spill ring buffer.
+ */
+struct PTO2TaskSlotState;  // Forward declaration
+struct PTO2FaninPool;      // Forward declaration
+struct PTO2FaninSpillEntry {
+    PTO2TaskSlotState *slot_state;
+};
+static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(uintptr_t));
+
+/**
+ * Dependency list entry (singly-linked list node)
+ * Stored in DepListPool ring buffer.
+ */
+struct PTO2DepListEntry {
+    PTO2TaskSlotState *slot_state;  // Consumer slot state (direct pointer)
+    PTO2DepListEntry *next;         // next entry
+};
+
+// =============================================================================
+// Task Descriptor
+// =============================================================================
+
+/**
+ * Task descriptor structure (shared memory)
+ *
+ * Stored in the TaskDescriptor ring buffer in shared memory.
+ * Contains static identification and buffer pointers only.
+ * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState.
+ *
+ * Fields set by Orchestrator at submission, read by Scheduler for dispatch.
+ */
+struct PTO2TaskDescriptor {
+    // Mixed-task identification (encodes ring_id in upper 32 bits)
+    PTO2TaskId task_id;  // raw: (ring_id << 32) | local_id
+
+    // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive)
+    int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT];
+
+    // Packed output buffer (all outputs packed into single contiguous buffer)
+    void *packed_buffer_base;  // Start of packed buffer in GM Heap
+    void *packed_buffer_end;   // End of packed buffer (for heap reclamation)
+};
+
+// =============================================================================
+// Per-Slot Scheduling State
+// =============================================================================
+
+/**
+ * Task payload data (cold path - only accessed during orchestration and dispatch)
+ *
+ * Layout: metadata + inline fanin packed in the first 9 cache lines, followed
+ * by bulk tensor and scalar data. Small fanins stay fully inline; larger
+ * fanins spill into a per-ring ring buffer slice.
+ */
+struct PTO2TaskPayload {
+    // === Cache lines 0-8 (576B) — metadata + inline fanin ===
+    int32_t tensor_count{0};
+    int32_t scalar_count{0};
+    int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
+    int32_t fanin_spill_start{0};   // Linear start index in fanin spill pool (0 = no spill)
+    PTO2FaninPool *fanin_spill_pool{nullptr};
+    PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
+    // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) ===
+    Tensor tensors[MAX_TENSOR_ARGS];
+    // === Cache lines 73-74 (128B) — scalars ===
+    uint64_t scalars[MAX_SCALAR_ARGS];
+
+    // Layout verification (size checks that don't need offsetof).
+    static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines");
+    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)");
+
+    /**
+     * Initialize payload: copy tensors, store scalars.
+     *
+     * For each param slot, the tensor source is determined by TensorArgType:
+     * - OUTPUT -> use materialized_outputs.output_ptr(out_idx++)
+     * - INPUT / INOUT -> use refs[i].tensor
+     *
+     * @param args                Task arguments (tensors + scalars)
+     * @param result  Materialized output tensors (from TensorCreateInfo path)
+     */
+    void init(
+        const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout
+    ) {
+        tensor_count = args.tensor_count();
+        scalar_count = args.scalar_count();
+
+        // int32_t out_idx = 0;
+        for (int32_t i = 0; i < args.tensor_count(); i++) {
+            if (args.tag(i) != TensorArgType::OUTPUT) {
+                tensors[i].copy(args.tensor(i).ref());
+            } else {
+                init_tensor_from_create_info(
+                    tensors[i], args.tensor(i).create_info(),
+                    reinterpret_cast<void *>(reinterpret_cast<char *>(alloc_result.packed_base) + layout.offsets[i]),
+                    layout.buffer_sizes[i]
+                );
+                tensors[i].owner_task_id = result.task_id();
+                result.materialize_output(tensors[i]);
+            }
+        }
+        // Round up to cache line boundary. Both arrays are 128B so no overrun.
+        // Eliminates branches; extra bytes within the same CL have zero additional cost.
+        memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64));
+    }
+};
+
+// PTO2TaskPayload layout verification (offsetof requires complete type).
+static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift");
+static_assert(
+    offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata"
+);
+static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)");
+static_assert(
+    offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor),
+    "scalars must immediately follow tensors"
+);
+static_assert(
+    sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t),
+    "PTO2TaskPayload size must stay on the baseline cache-line footprint"
+);
+
+/**
+ * Per-task slot scheduling state (scheduler-private, NOT in shared memory)
+ *
+ * Consolidates all hot-path scheduling fields into a single cache-friendly
+ * structure (32 bytes = half a cache line). Accessing any field of a task's
+ * slot state brings all related fields into the same cache line.
+ *
+ * Concurrency notes:
+ * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
+ * - fanin_count set once at submission, read-only after (hot path for ready check)
+ * - task_state, fanin_refcount, fanout_refcount updated atomically
+ */
+struct alignas(64) PTO2TaskSlotState {
+    // Fanout lock + list (accessed together under lock in on_task_complete)
+    std::atomic<int32_t> fanout_lock;  // Per-task spinlock (0=unlocked, 1=locked)
+    int32_t fanout_count;              // 1 (owning scope) + number of consumers
+
+    PTO2DepListEntry *fanout_head;  // Pointer to first fanout entry (nullptr = empty)
+
+    // Task state (completion, consumed check, ready check)
+    std::atomic<PTO2TaskState> task_state;  // PENDING/COMPLETED/CONSUMED
+
+    // Fanin (accessed together in release_fanin_and_check_ready)
+    std::atomic<int32_t> fanin_refcount;  // Dynamic: counts completed producers
+    int32_t fanin_count;                  // Number of producer dependencies (set once by wiring)
+
+    // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
+    std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
+
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
+    PTO2TaskPayload *payload;
+    PTO2TaskDescriptor *task;
+
+    // --- Set per-submit (depend on task inputs) ---
+    ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
+    uint8_t ring_id;         // Ring layer (immutable after init)
+    // Set by any subtask FIN that pushed deferred-completion CONDITIONs to
+    // the runtime mailbox; read by the last subtask FIN to decide MPSC vs
+    // inline completion. Mirrors a2a3; see that mirror for the full
+    // memory-order argument. Carved out of the padding byte between ring_id
+    // and dep_pool_mark to keep PTO2TaskSlotState at 64 bytes.
+    std::atomic<bool> any_subtask_deferred{false};
+    uint8_t _async_pad{0};
+    int32_t dep_pool_mark{0};  // Dep pool top after wiring (thread-0-only)
+
+    std::atomic<int16_t> completed_subtasks{0};  // Each core completion increments by 1
+    int16_t total_required_subtasks{0};          // = logical_block_num * popcount(active_mask)
+    int16_t logical_block_num{1};                // Total logical blocks (set by orchestrator)
+    int16_t next_block_idx{0};                   // Next block to dispatch (scheduler state)
+
+    /**
+     * Bind the slot-invariant ring id. Called once per slot during
+     * RingSchedState::init(); ring_id never changes across reuses.
+     */
+    void bind_ring(uint8_t rid) { ring_id = rid; }
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
+        payload = p;
+        task = t;
+    }
+
+    /**
+     * Reset dynamic scheduling fields for slot reuse.
+     * Called by advance_ring_pointers() after a slot transitions to CONSUMED
+     * and last_task_alive advances past it, but before sync_to_sm() publishes
+     * the new last_task_alive to the orchestrator.
+     *
+     * Skips payload, task, ring_id (immutable, bound once at init).
+     * Skips task_state: left as CONSUMED so that wait_for_tensor_ready()
+     * callers holding stale owner_task_id still observe a completed state.
+     * task_state is set to PENDING by the orchestrator when it reuses the slot.
+     */
+    void reset_for_reuse() {
+        fanout_lock.store(0, std::memory_order_relaxed);
+        fanout_count = 1;
+        fanout_head = nullptr;
+        fanin_refcount.store(0, std::memory_order_relaxed);
+        fanout_refcount.store(0, std::memory_order_relaxed);
+        completed_subtasks.store(0, std::memory_order_relaxed);
+        next_block_idx = 0;
+        any_subtask_deferred.store(false, std::memory_order_relaxed);
+    }
+
+    // === Per-task fanout spinlock ===
+    //
+    // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST
+    // be held whenever reading or writing fanout_head / fanout_count, because
+    // the orchestrator adds consumers concurrently with the scheduler
+    // traversing the list after task completion.
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+
+        for (;;) {
+            while (fanout_lock.load(std::memory_order_acquire) != 0) {
+                contended = true;
+                atomic_ops++;
+                SPIN_WAIT_HINT();
+            }
+            int32_t expected = 0;
+            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                atomic_ops++;
+                atomic_count += atomic_ops;
+                if (contended) {
+                    wait_cycle += (get_sys_cnt_aicpu() - t0);
+                }
+                return;
+            }
+            contended = true;
+            atomic_ops++;
+        }
+    }
+#endif
+
+    void lock_fanout() {
+        for (;;) {
+            while (fanout_lock.load(std::memory_order_acquire) != 0) {
+                SPIN_WAIT_HINT();
+            }
+            int32_t expected = 0;
+            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                return;
+            }
+        }
+    }
+
+    void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); }
+};
+
+static_assert(sizeof(PTO2TaskSlotState) == 64);
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h
new file mode 100644
index 000000000..cad5cec36
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Layout
+ *
+ * Defines the shared memory structure for Orchestrator-Scheduler communication.
+ *
+ * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1):
+ *   +---------------------------+
+ *   | SharedMemoryHeader        |  (per-ring flow control + sync)
+ *   +---------------------------+
+ *   | Ring 0: TaskDescriptor[]  |
+ *   | Ring 0: TaskPayload[]     |
+ *   | Ring 0: TaskSlotState[]   |
+ *   +---------------------------+
+ *   | Ring 1: TaskDescriptor[]  |
+ *   | Ring 1: TaskPayload[]     |
+ *   | Ring 1: TaskSlotState[]   |
+ *   +---------------------------+
+ *   | ...                       |
+ *   +---------------------------+
+ *
+ * Design principles:
+ * - Only data needed for Orchestrator<->Scheduler communication is here
+ * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory
+ * - Flow control via atomic counters/flags (no locks needed for single-word R/W)
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+
+// =============================================================================
+// Shared Memory Header
+// =============================================================================
+
+struct PTO2SharedMemoryHandle;
+
+/**
+ * Per-ring flow control state in shared memory.
+ * Written/read by Orchestrator and Scheduler for synchronization.
+ */
+struct alignas(64) PTO2RingFlowControl {
+    // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
+    alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
+
+    // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
+    alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
+
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
+    void init() {
+        current_task_index.store(0, std::memory_order_relaxed);
+        last_task_alive.store(0, std::memory_order_relaxed);
+    }
+
+    bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const;
+};
+
+static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)");
+
+/**
+ * Per-ring shared memory header section.
+ *
+ * Groups flow-control, layout info, and per-ring data pointers for a single ring.
+ * Pointers are host-side only (set by setup_pointers, invalid on device).
+ */
+struct alignas(64) PTO2SharedMemoryRingHeader {
+    PTO2RingFlowControl fc;
+
+    // Layout metadata (set once at init)
+    uint64_t task_window_size;
+    int32_t task_window_mask;
+    uint64_t heap_size;
+    uint64_t task_descriptors_offset;  // Offset from SM base, in bytes
+
+    // Per-ring data pointers (host-side, set by setup_pointers)
+    PTO2TaskDescriptor *task_descriptors;
+    PTO2TaskPayload *task_payloads;
+    PTO2TaskSlotState *slot_states;
+
+    int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; }
+
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; }
+
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) {
+        return task_descriptors[get_slot_by_task_id(local_id)];
+    }
+
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; }
+
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; }
+
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; }
+
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) {
+        return slot_states[get_slot_by_task_id(local_id)];
+    }
+};
+
+/**
+ * Shared memory header structure
+ *
+ * Contains per-ring flow control and global layout information.
+ */
+struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
+    // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) ===
+    PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
+
+    // === GLOBAL FIELDS ===
+    std::atomic<int32_t> orchestrator_done;  // Flag: orchestration complete
+
+    // Total shared memory size (for validation)
+    uint64_t total_size;
+
+    // Graph output for copy-back (set by orchestrator when using packed buffer)
+    // Host finalize copies from this address instead of dev_ptr when non-zero
+    std::atomic<uint64_t> graph_output_ptr;   // Address where final output was written (packed buffer)
+    std::atomic<uint64_t> graph_output_size;  // Size in bytes
+
+    // === ERROR REPORTING ===
+
+    // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host)
+    // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host.
+    std::atomic<int32_t> orch_error_code;
+
+    // Scheduler error state (Scheduler → Host, independent of orchestrator)
+    // Written by scheduler threads on timeout; read by orchestrator and host.
+    std::atomic<uint32_t> sched_error_bitmap;  // Bit X set = thread X had error
+    std::atomic<int32_t> sched_error_code;     // Last scheduler error code (last-writer-wins)
+    std::atomic<int32_t> sched_error_thread;   // Thread index of last error writer
+};
+
+static_assert(
+    (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096),
+    "PTO2SharedMemoryHeader should be reasonably sized"
+);
+
+// =============================================================================
+// Shared Memory Handle
+// =============================================================================
+
+/**
+ * Handle for shared memory lifecycle management (create/destroy).
+ * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly.
+ */
+struct PTO2SharedMemoryHandle {
+    void *sm_base;     // Base address of shared memory
+    uint64_t sm_size;  // Total size of shared memory
+
+    PTO2SharedMemoryHeader *header;
+
+    // Ownership flag
+    bool is_owner;  // True if this handle allocated the memory
+
+    // === Static helpers ===
+
+    static uint64_t calculate_size(uint64_t task_window_size);
+    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+
+    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
+    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
+    // arena is otherwise empty (the call performs the single commit). All
+    // memory is owned by the arena — caller must not call destroy().
+    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);
+
+    // === Instance methods ===
+
+    // In-place init for caller-provided wrapper storage (e.g. a region carved
+    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
+    // init_header. Returns false when `sm_size` is too small for the requested
+    // `task_window_size`.
+    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);
+    bool init_per_ring(
+        void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    void destroy();
+    void print_layout();
+    bool validate();
+
+private:
+    void init_header(uint64_t task_window_size, uint64_t heap_size);
+    void init_header_per_ring(
+        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+    );
+    void setup_pointers(uint64_t task_window_size);
+    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+};
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
+// own setup_pointers), so values are guaranteed consistent across sides.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
+        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
+// to compute ring `ring_id`'s task_descriptors device address. Accepts a
+// per-ring window-size array so the helper's signature mirrors
+// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
+// disagree with the SM layout when (hypothetically) ring sizes diverge.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(
+    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
+) noexcept {
+    assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
+    char *p = static_cast<char *>(sm_dev_base);
+    p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < ring_id; r++) {
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h
new file mode 100644
index 000000000..fa5a5df02
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Submit Types - Shared submit-contract definitions
+ *
+ * Header-only definitions shared by orchestration-facing and runtime-facing
+ * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h).
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+inline constexpr int32_t INVALID_KERNEL_ID = -1;
+
+/**
+ * Subtask slot count: AIC, AIV0, AIV1
+ */
+inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
+
+/**
+ * Subtask slot indices
+ */
+enum class PTO2SubtaskSlot : uint8_t {
+    AIC = 0,
+    AIV0 = 1,
+    AIV1 = 2,
+};
+
+/**
+ * Subtask mask bits (for ActiveMask)
+ */
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);         // 0x1
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);        // 0x2
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);        // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3);  // 0x8: all blocks must launch atomically
+
+/**
+ * Resource shape — classifies a MixedKernels into one of 3 scheduling buckets.
+ *
+ * Multi-subtask tasks (2+ active slots) are all scheduled as MIX. Dispatch
+ * chooses one cluster, then uses active_mask to decide which cores in that
+ * cluster must be placed together: all used cores idle -> running placement;
+ * all used cores already running with free pending slots -> pending placement;
+ * mixed used-core state is rejected and retried later.
+ *
+ * DUMMY is a synthetic shape for dep-only tasks (no AICore dispatch). Tasks
+ * with an empty core_mask route to a dedicated DUMMY ready queue and are
+ * completed inline by the scheduler dispatch loop, bypassing core allocation.
+ */
+enum class PTO2ResourceShape : uint8_t {
+    AIC = 0,    // Single AIC
+    AIV = 1,    // Single AIV
+    MIX = 2,    // Full cluster (dispatch uses active_mask)
+    DUMMY = 3,  // Dependency-only (no AICore dispatch)
+};
+
+// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not
+// allocate a per-shape ready_queue entry / local buffer — it lives in a
+// dedicated queue inside PTO2SchedulerState.
+inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
+
+/**
+ * Bitmask of active subtask slots + flags, sizeof == 1.
+ */
+class ActiveMask {
+public:
+    constexpr ActiveMask() = default;
+    constexpr explicit ActiveMask(uint8_t raw) :
+        raw_(raw) {}
+
+    uint8_t raw() const { return raw_; }
+
+    bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0; }
+
+    uint8_t core_mask() const { return raw_ & 0x07u; }
+
+    bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; }
+
+    PTO2ResourceShape to_shape() const {
+        uint8_t cmask = core_mask();
+        if (cmask == 0) return PTO2ResourceShape::DUMMY;
+        int bit_count = __builtin_popcount(cmask);
+        if (bit_count >= 2) return PTO2ResourceShape::MIX;
+        if (cmask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
+        return PTO2ResourceShape::AIV;
+    }
+
+    void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; }
+
+    bool operator==(ActiveMask other) const { return raw_ == other.raw_; }
+    bool operator!=(ActiveMask other) const { return raw_ != other.raw_; }
+
+    ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); }
+    ActiveMask &operator|=(ActiveMask other) {
+        raw_ |= other.raw_;
+        return *this;
+    }
+
+    ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); }
+
+    bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; }
+
+    explicit operator bool() const { return raw_ != 0; }
+
+private:
+    uint8_t raw_{0};
+};
+
+static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte");
+
+/**
+ * Mixed-task submit contract.
+ *
+ * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive).
+ * At least one slot must be valid.
+ */
+struct MixedKernels {
+    int32_t aic_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
+
+    ActiveMask to_active_mask() const {
+        uint8_t mask = 0;
+        if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
+        if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
+        if (aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1;
+        return ActiveMask(mask);
+    }
+};
+
+/**
+ * SPMD launch parameters carried inside Arg.
+ *
+ * Controls how many logical blocks (SPMD dimension) a single task
+ * is expanded into at dispatch time.  Each block receives a unique
+ * block_idx in [0, core_num) via the per-dispatch LocalContext.
+ */
+class PTO2LaunchSpec {
+public:
+    constexpr PTO2LaunchSpec() = default;
+
+    int16_t core_num() const { return core_num_; }
+    void set_core_num(int16_t n) { core_num_ = n; }
+
+    bool require_sync_start() const { return require_sync_start_; }
+    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+
+private:
+    int16_t core_num_{1};
+    bool require_sync_start_{false};
+};
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h
new file mode 100644
index 000000000..30017fadd
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - TensorMap Interface
+ *
+ * TensorMap provides producer lookup for dependency discovery:
+ * - Maps Tensor -> producer task ID
+ * - Used by pto_submit_task() to find dependencies
+ *
+ * Key design features:
+ * 1. Ring buffer pool for entries (no malloc/free)
+ * 2. Lazy invalidation (entries become stale when producer retires)
+ * 3. Per-task per-ring entry tracking for efficient cleanup
+ * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions
+ *
+ * Hash table with chaining:
+ * - buckets[] array of head offsets
+ * - Entries linked via next_in_bucket
+ * - Insert at head (newest first) for sorted chains
+ *
+ * CRITICAL: Hash only by base_ptr
+ * ==============================
+ * For overlap detection to work, ALL sub-regions of the same base tensor
+ * MUST be in the SAME hash bucket. This allows lookup to compare all
+ * potentially overlapping regions.
+ *
+ * Overlap detection: Two regions create a dependency if:
+ *   1. Same base_ptr (raw tensor pointer)
+ *   2. Byte ranges [offset, offset+size) intersect
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "common.h"
+#include "profiling_config.h"
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+#include "tensor.h"
+
+// Overlap geometry types. Relocated here from tensor.h: they are used only by
+// the runtime's overlap-detection / dependency machinery, not by the
+// wire/host-facing Tensor definition.
+enum class OverlapStatus {
+    NO_OVERLAP,
+    COVERED,
+    OTHER,
+};
+
+struct Segment {
+    uint64_t begin;
+    uint64_t end;
+
+    bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; }
+    bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; }
+};
+
+/**
+ * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
+ * region offsets returned by DeviceArena::reserve() so init_from_layout()
+ * can fetch the matching pointers after the arena is committed.
+ *
+ * All offsets are relative to the arena's base.
+ */
+struct PTO2TensorMapLayout {
+    size_t off_buckets;
+    size_t off_entry_pool;
+    size_t off_free_entry_list;
+    size_t off_task_entry_heads[PTO2_MAX_RING_DEPTH];
+    int32_t num_buckets;
+    int32_t pool_size;
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+};
+
+// TensorMap Lookup Profiling (must precede inline lookup/insert methods).
+#if PTO2_TENSORMAP_PROFILING
+extern uint64_t g_lookup_chain_total;
+extern uint64_t g_lookup_count;
+extern int32_t g_lookup_chain_max;
+extern uint64_t g_lookup_overlap_checks;
+extern uint64_t g_lookup_overlap_hits;
+extern uint64_t g_insert_count;
+#endif
+
+// =============================================================================
+// TensorMap Structure
+// =============================================================================
+
+/**
+ * TensorMap entry structure — cache-line optimized for lookup
+ *
+ * Cache line 1 (64B, lookup hot path) mirrors Tensor cache line 1 byte-for-byte
+ * from byte 16 onward, so that `memcpy(this, &tensor, 64)` populates everything
+ * we need for overlap checks. Bytes [0, 16) carry entry-only fields (hash
+ * bucket head + chain pointer) that overlap Tensor::buffer (addr in [0, 8) is
+ * the hash key, size in [8, 16) is unused by the entry — we repurpose it for
+ * `next_in_bucket`).
+ *
+ *   buffer_addr / next_in_bucket / producer_task_id   — chain traversal + match
+ *   start_offset                                       — overlap byte range begin
+ *   version, ndims, dtype, manual_dep, is_contiguous   — overlap fast path
+ *   shapes[5]                                          — overlap comparison (line 1)
+ *
+ * Cache line 2 (64B, slow-path / non-contiguous overlap):
+ *   prev_in_bucket / next_in_task / prev_in_task       — chain manipulation
+ *   bucket_index                                       — bookkeeping
+ *   extent_elem_cache                                  — overlap byte range end
+ *   strides[5]                                          — reserved for L2 overlap (PR-2)
+ *
+ * When both entry & probe are `is_contiguous && start_offset == 0`, the overlap
+ * check derives `extent_elem = prod(shapes)` from cache line 1 alone.
+ *
+ * Entry size: 128B (2 cache lines), matches Tensor.
+ */
+struct alignas(64) PTO2TensorMapEntry {
+    // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 ===
+    uint64_t buffer_addr;                // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
+    PTO2TensorMapEntry *next_in_bucket;  // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
+    PTO2TaskId producer_task_id;         // 8B [16,24):  mirrors Tensor::owner_task_id slot
+    uint64_t start_offset;               // 8B [24,32):  mirrors Tensor::start_offset (element offset)
+    int32_t version;                     // 4B [32,36):  mirrors Tensor::version
+    uint32_t ndims;                      // 4B [36,40):  mirrors Tensor::ndims
+    DataType dtype;                      // 1B [40,41):  mirrors Tensor::dtype
+    bool manual_dep;                     // 1B [41,42):  mirrors Tensor::manual_dep
+    bool is_contiguous;                  // 1B [42,43):  mirrors Tensor::is_contiguous
+    uint8_t __padding1__;                // 1B [43,44):  mirrors Tensor padding
+    uint32_t shapes[MAX_TENSOR_DIMS];    // 20B [44,64): mirrors Tensor::shapes
+
+    // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data ===
+    PTO2TensorMapEntry *prev_in_bucket;  // 8B [64, 72)
+    PTO2TensorMapEntry *next_in_task;    // 8B [72, 80)
+    PTO2TensorMapEntry *prev_in_task;    // 8B [80, 88)
+    int32_t bucket_index;                // 4B [88, 92): -1 when unlinked
+    uint32_t __padding2__;               // 4B [92, 96)
+    uint64_t extent_elem_cache;          // 8B [96,104): non-contiguous extent (mirrors Tensor)
+    uint32_t strides[MAX_TENSOR_DIMS];   // 20B [104,124): element strides, mirrors Tensor::strides
+    uint8_t __padding3__[4];             // 4B [124,128)
+
+    /**
+     * Copy overlap-relevant fields from a Tensor into this entry.
+     *
+     * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)),
+     * producer_task_id, start_offset, version, ndims, dtype, manual_dep,
+     * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in
+     * the source and gets written into next_in_bucket; that's harmless
+     * because link_entry() overwrites next_in_bucket immediately after.
+     *
+     * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when
+     * the source is canonically contiguous (is_contiguous && start_offset==0),
+     * so the producer Tensor's cache line 2 stays cold during insert. Only
+     * non-contiguous producers pay one extra line 2 read.
+     */
+    void copy_from_tensor(const Tensor &tensor) {
+        memcpy(this, &tensor, 64);
+        if (tensor.is_contiguous && tensor.start_offset == 0) {
+            uint64_t numel = 1;
+            for (uint32_t i = 0; i < tensor.ndims; i++)
+                numel *= tensor.shapes[i];
+            extent_elem_cache = numel;
+            uint32_t s = 1;
+            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--) {
+                strides[i] = s;
+                s *= tensor.shapes[i];
+            }
+        } else {
+            extent_elem_cache = tensor.extent_elem_cache;
+            for (uint32_t i = 0; i < tensor.ndims; i++) {
+                strides[i] = tensor.strides[i];
+            }
+        }
+    }
+
+    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) {
+        memcpy(this, &tensor_create_info, 64);
+        buffer_addr = addr;
+        // Create-info outputs are always contiguous with start_offset = 0;
+        // extent_elem = prod(shapes); stride is row-major.
+        uint64_t numel = 1;
+        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) {
+            numel *= tensor_create_info.shapes[i];
+        }
+        extent_elem_cache = numel;
+        uint32_t s = 1;
+        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--) {
+            strides[i] = s;
+            s *= tensor_create_info.shapes[i];
+        }
+    }
+
+    /**
+     * Effective element extent of this entry.
+     * Contiguous-aligned views compute it from shapes alone (line 1 hit only);
+     * non-contiguous views read the cached value from line 2.
+     */
+    uint64_t effective_extent_elem() const {
+        if (is_contiguous) {
+            uint64_t n = 1;
+            for (uint32_t i = 0; i < ndims; i++)
+                n *= shapes[i];
+            return n;
+        }
+        return extent_elem_cache;
+    }
+
+    /**
+     * Check overlap between input tensor and this entry (the producer output).
+     *
+     * Three-level cascade:
+     *   L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP.
+     *   L2 — O(ndims) hyper-rectangle precise check, eligible only when both
+     *        sides share the same canonical row-major axis layout (same
+     *        dtype/ndims/strides[], stride descends as integer multiples,
+     *        start_offset decomposes cleanly under the reference shape).
+     *        Yields NO_OVERLAP / COVERED / OTHER per-dim.
+     *   L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice
+     *        with step, etc): conservative OTHER. Exact enumeration via
+     *        contiguous-segment merge is scheduled for a follow-up.
+     *
+     * COVERED is returned when `input` completely contains `entry` per-dim
+     * — dep_compute uses this to retire the now-redundant entry.
+     */
+    OverlapStatus check_overlap(const Tensor &input) const {
+        debug_assert(input.buffer.addr == buffer_addr);
+        debug_assert(input.version >= version);
+        if (input.version > version) {
+            return OverlapStatus::OTHER;
+        }
+
+        // -------- L1: byte-range intersection (O(1) fast reject) --------
+        const uint64_t in_begin = input.start_offset;
+        const uint64_t in_end = input.start_offset + input.extent_elem();
+        const uint64_t ent_begin = start_offset;
+        const uint64_t ent_end = start_offset + effective_extent_elem();
+        Segment in_range_bytes{in_begin, in_end};
+        Segment ent_range_bytes{ent_begin, ent_end};
+        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) {
+            return OverlapStatus::NO_OVERLAP;
+        }
+
+        // -------- L2 prereqs: same axis layout? --------
+        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) {
+            return OverlapStatus::OTHER;
+        }
+        for (uint32_t i = 0; i < ndims; i++) {
+            if (input.strides[i] != strides[i]) return OverlapStatus::OTHER;
+        }
+        // strides[ndims-1] must be 1 and strides[i-1] must be an integer
+        // multiple of strides[i] for the row-major reference-shape derivation
+        // below to hold. This rejects slice-with-step (strides[d] != prev factor)
+        // and any view chain that scrambles the axis order. (strides is
+        // uint32_t with the > 0 invariant enforced at construction, so no
+        // sign check needed.)
+        if (strides[ndims - 1] != 1) return OverlapStatus::OTHER;
+        for (uint32_t i = 1; i < ndims; i++) {
+            if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER;
+        }
+
+        // Derive reference shape A from stride. By construction stride is
+        // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So
+        //   A[i] = strides[i-1] / strides[i]   for i >= 1
+        //   A[0] = (buffer.size / dtype_bytes) / strides[0]
+        // input.buffer.size is the storage size; entry shares the same buffer
+        // (debug-asserted by buffer.addr equality at the top), so we read it
+        // from input rather than mirroring buffer.size into the entry.
+        //
+        // Note on buffer padding: runtime allocators may over-allocate
+        // `buffer.size` (cache-line / 1024B alignment, ring-buffer slot
+        // rounding, etc). When that happens, `numel_storage` is larger than
+        // the true logical extent and `ref_shapes[0]` ends up generously over-
+        // sized. This is intentional: ref_shapes is only used as an *upper
+        // bound* in the in-bounds checks below; the actual overlap test (the
+        // per-dim line-segment intersection on the real start_offset /
+        // shapes / stride further down) is unaffected. A larger-than-truth
+        // ref_shapes[0] simply makes the bounds check more permissive — it
+        // can never cause a false NO_OVERLAP nor a false COVERED.
+        uint32_t ref_shapes[MAX_TENSOR_DIMS] = {};
+        for (uint32_t i = 1; i < ndims; i++) {
+            ref_shapes[i] = strides[i - 1] / strides[i];
+        }
+        const uint64_t elem_size = get_element_size(dtype);
+        if (elem_size == 0) return OverlapStatus::OTHER;
+        const uint64_t numel_storage = input.buffer.size / elem_size;
+        const uint32_t stride0 = strides[0];  // > 0 by Tensor invariant
+        if (numel_storage % stride0 != 0) return OverlapStatus::OTHER;
+        ref_shapes[0] = static_cast<uint32_t>(numel_storage / stride0);
+
+        // Decompose start_offset into row-major multi-dim offsets. By the same
+        // relation strides[i] = prod(ref_shapes[i+1..]) so dividing by strides[i]
+        // (no inner loop) yields each axis offset directly.
+        uint32_t in_offsets[MAX_TENSOR_DIMS] = {};
+        uint32_t ent_offsets[MAX_TENSOR_DIMS] = {};
+        uint64_t in_remain = input.start_offset;
+        uint64_t ent_remain = start_offset;
+        for (uint32_t i = 0; i < ndims; i++) {
+            const uint32_t s = strides[i];
+            in_offsets[i] = static_cast<uint32_t>(in_remain / s);
+            ent_offsets[i] = static_cast<uint32_t>(ent_remain / s);
+            in_remain %= s;
+            ent_remain %= s;
+        }
+        if (in_remain != 0 || ent_remain != 0) return OverlapStatus::OTHER;
+
+        // Validate that each side fits within ref_shapes (defense in depth —
+        // a well-formed view always satisfies this).
+        for (uint32_t i = 0; i < ndims; i++) {
+            if (static_cast<uint64_t>(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
+            if (static_cast<uint64_t>(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
+        }
+
+        // -------- L2 core: per-dim line-segment intersection --------
+        bool input_contains_entry = true;
+        for (uint32_t i = 0; i < ndims; i++) {
+            Segment in_seg{in_offsets[i], static_cast<uint64_t>(in_offsets[i]) + input.shapes[i]};
+            Segment ent_seg{ent_offsets[i], static_cast<uint64_t>(ent_offsets[i]) + shapes[i]};
+            if (!in_seg.line_segment_intersection(ent_seg)) {
+                return OverlapStatus::NO_OVERLAP;
+            }
+            if (!in_seg.contains(ent_seg)) {
+                input_contains_entry = false;
+            }
+        }
+        return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER;
+    }
+};
+
+static_assert(sizeof(PTO2TensorMapEntry) == 128, "TensorMapEntry must be exactly 2 cache lines (128 bytes)");
+static_assert(offsetof(PTO2TensorMapEntry, buffer_addr) == offsetof(Tensor, buffer.addr));
+static_assert(offsetof(PTO2TensorMapEntry, producer_task_id) == offsetof(Tensor, owner_task_id));
+static_assert(offsetof(PTO2TensorMapEntry, start_offset) == offsetof(Tensor, start_offset));
+static_assert(offsetof(PTO2TensorMapEntry, version) == offsetof(Tensor, version));
+static_assert(offsetof(PTO2TensorMapEntry, ndims) == offsetof(Tensor, ndims));
+static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype));
+static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes));
+static_assert(
+    offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"
+);
+
+// =============================================================================
+// TensorMap Lookup Chain Length Statistics (compile-time toggle)
+// =============================================================================
+
+/**
+ * TensorMap structure
+ *
+ * Hash table with ring buffer entry pool and lazy invalidation.
+ */
+struct PTO2TensorMap {
+    // Hash table buckets (fixed size, power of 2)
+    PTO2TensorMapEntry **buckets;  // Array of offsets into entry_pool (-1 = empty)
+    int32_t num_buckets;           // Must be power of 2 for fast modulo
+
+    // Entry pool as ring buffer
+    PTO2TensorMapEntry *entry_pool;        // Ring buffer of entries
+    PTO2TensorMapEntry **free_entry_list;  // free entry ids
+    int32_t pool_size;                     // Total pool capacity
+    int32_t next_entry_idx;                // id when next entry insert
+    int32_t free_num;                      // free entry number in entry pool
+
+    // Per-ring per-task entry tracking (for efficient bucket cleanup)
+    // Indexed by [ring_id][local_id & (task_window_sizes[ring_id] - 1)]
+    PTO2TensorMapEntry **task_entry_heads[PTO2_MAX_RING_DEPTH];
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];  // Per-ring task window size (for slot masking)
+
+    // Per-ring validity threshold (for lazy invalidation)
+    int32_t last_task_alives[PTO2_MAX_RING_DEPTH];  // Cached from shared memory per ring
+
+    // Per-ring cleanup progress (for periodic cleanup_retired)
+    int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
+
+    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
+        return task_local_id & (task_window_sizes[ring_id] - 1);
+    }
+
+    // Accessors read by scope_stats_collector. Declared unconditionally so the
+    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
+    // setter symbols must export for host dlsym; the probe call sites that use
+    // these accessors stay gated by PTO2_PROFILING).
+    int32_t current_used() const { return next_entry_idx - free_num; }
+    int32_t pool_capacity() const { return pool_size; }
+
+    // new_entry only allocates memory, does not assign attributes
+    PTO2TensorMapEntry *new_entry() {
+        if (free_num > 0) {
+            PTO2TensorMapEntry *res = free_entry_list[--free_num];
+            debug_assert(res->bucket_index == -1);
+            return res;
+        }
+        always_assert(next_entry_idx < pool_size);
+        PTO2TensorMapEntry *res = &entry_pool[next_entry_idx++];
+        debug_assert(res->bucket_index == -1);
+        return res;
+    }
+
+    void free_entry(PTO2TensorMapEntry &entry) {
+        always_assert(entry.bucket_index != -1);  // must still be in a bucket
+
+        // Update predecessor's next pointer (O(1) via prev_in_bucket)
+        if (entry.prev_in_bucket == nullptr) {
+            // Entry is the head of its bucket chain, update bucket head
+            // Must compute hash BEFORE clearing tensor
+            buckets[entry.bucket_index] = entry.next_in_bucket;
+        } else {
+            entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket;
+        }
+
+        // Update successor's prev pointer
+        if (entry.next_in_bucket != nullptr) {
+            entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
+        }
+
+        free_entry_list[free_num++] = &entry;
+        entry.bucket_index = -1;
+        entry.next_in_bucket = nullptr;
+        entry.prev_in_bucket = nullptr;
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+    }
+
+    // =============================================================================
+    // TensorMap API
+    // =============================================================================
+
+    /**
+     * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring
+     * task_entry_heads) on the supplied arena. Records the resulting offsets in
+     * the returned layout descriptor. Must be called before the arena is
+     * committed.
+     */
+    static PTO2TensorMapLayout reserve_layout(
+        DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    /**
+     * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS,
+     * PTO2_TENSORMAP_POOL_SIZE).
+     */
+    static PTO2TensorMapLayout
+    reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+
+    /**
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
+     */
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
+     */
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Tear down state. Does not free memory — the arena owns the backing
+     * buffer. Pointers are set to nullptr so accidental reuse traps.
+     */
+    void destroy();
+
+    /**
+     * Update validity threshold from shared memory
+     * Called periodically to refresh the lazy invalidation threshold.
+     *
+     * @param last_task_alive  Current value from shared memory
+     */
+    void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; }
+
+    /**
+     * Lookup producer for a tensor region
+     *
+     * Searches the hash table for matching regions and invokes the callback
+     * for each overlapping valid entry.
+     * Stale entries from different rings are skipped (not truncated).
+     *
+     * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should
+     * return true to continue iteration, false to stop early. It is safe for
+     * the callback to call remove_entry() on the current entry: next_in_bucket
+     * is latched before invocation.
+     *
+     * @param tensor    Tensor to look up
+     * @param on_match  Callback invoked for each overlapping entry
+     */
+    template <typename Fn>
+    void lookup(const Tensor &tensor, Fn &&on_match) {
+        uint32_t bucket_index = hash(tensor.buffer.addr);
+        PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
+
+#if PTO2_TENSORMAP_PROFILING
+        g_lookup_count++;
+        int32_t chain_len = 0;
+#endif
+
+        while (cur_entry != nullptr) {
+            PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
+
+#if PTO2_TENSORMAP_PROFILING
+            chain_len++;
+#endif
+            // Skip stale entries (no chain truncation — entries from different
+            // rings can be interleaved, so a stale entry from one ring does NOT
+            // imply subsequent entries from other rings are also stale)
+            if (!entry_valid(*cur_entry)) {
+                cur_entry = next_entry;
+                continue;
+            }
+
+            // Entry is valid - check if regions OVERLAP (not just exact match)
+            // Since we hash only by base_ptr, all entries in this bucket have
+            // potential to overlap. We must check actual byte-range overlap.
+            if (tensor.buffer.addr == cur_entry->buffer_addr) {
+#if PTO2_TENSORMAP_PROFILING
+                g_lookup_overlap_checks++;
+#endif
+                auto overlap_status = cur_entry->check_overlap(tensor);
+                if (overlap_status != OverlapStatus::NO_OVERLAP) {
+#if PTO2_TENSORMAP_PROFILING
+                    g_lookup_overlap_hits++;
+#endif
+                    if (!on_match(*cur_entry, overlap_status)) {
+#if PTO2_TENSORMAP_PROFILING
+                        g_lookup_chain_total += chain_len;
+                        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
+#endif
+                        return;
+                    }
+                }
+            }
+
+            // Move to next entry
+            cur_entry = next_entry;
+        }
+#if PTO2_TENSORMAP_PROFILING
+        g_lookup_chain_total += chain_len;
+        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
+#endif
+    }
+
+    /**
+     * Insert a new entry (called when task produces output)
+     *
+     * Allocates from ring buffer pool, may overwrite stale entries.
+     * Inserts at head of hash bucket chain (maintains task_id ordering).
+     *
+     * @param tensor            Tensor produced
+     * @param producer_task_id  Task ID of producer
+     */
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id) {
+        PTO2TensorMapEntry *entry = new_entry();
+        entry->copy_from_tensor(tensor);
+        link_entry(entry, tensor.buffer.addr, producer_task_id);
+    }
+
+    /**
+     * Cleanup stale entries for retired tasks
+     *
+     * Called periodically by Orchestrator when last_task_alive advances.
+     * Removes entries from bucket chains for tasks in [old, new) range.
+     *
+     * @param old_last_task_alive  Previous threshold
+     * @param new_last_task_alive  New threshold
+     */
+    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) {
+        // Iterate through retired tasks on this ring and remove their entries
+        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) {
+            int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
+            PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot];
+
+            while (cur_entry != nullptr) {
+                PTO2TensorMapEntry *next_entry = cur_entry->next_in_task;  // Save before clearing
+                // Only remove if this entry belongs to the retiring task
+                // (slot may have been reused by a newer task)
+                debug_assert(
+                    cur_entry->producer_task_id ==
+                    PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id))
+                );
+                free_entry(*cur_entry);
+                cur_entry = next_entry;
+            }
+
+            // Clear task's entry head (slot will be reused by local_id + task_window_sizes[ring_id])
+            task_entry_heads[ring_id][task_slot] = nullptr;
+        }
+    }
+
+    // =============================================================================
+    // Internal Helpers (exposed for testing)
+    // =============================================================================
+
+    /**
+     * Compute hash for tensor addr
+     *
+     * Multiplicative hash using the golden-ratio constant.  Multiplication
+     * mixes ALL input bits into the high bits of the product, so aligned
+     * addresses (low bits all-zero) still distribute evenly.  We extract
+     * the top log2(num_buckets) bits which carry the most entropy.
+     */
+    uint32_t hash(uint64_t key) {
+        key *= 0x9E3779B97F4A7C15ULL;
+        return static_cast<uint32_t>(key >> (64 - __builtin_ctz(num_buckets)));
+    }
+
+    /**
+     * Link an initialized entry into bucket and task chains.
+     */
+    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
+#if PTO2_TENSORMAP_PROFILING
+        g_insert_count++;
+#endif
+        uint32_t bucket_index = hash(addr);
+        auto ring_id = producer_task_id.ring();
+        auto local_id = producer_task_id.local();
+        int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
+
+        entry->producer_task_id = producer_task_id;
+
+        // Insert at head of hash bucket
+        entry->bucket_index = bucket_index;
+        entry->next_in_bucket = buckets[bucket_index];
+        if (entry->next_in_bucket != nullptr) {
+            entry->next_in_bucket->prev_in_bucket = entry;
+        }
+        buckets[bucket_index] = entry;
+        entry->prev_in_bucket = nullptr;
+
+        // Link to task's entry list
+        entry->next_in_task = task_entry_heads[ring_id][task_slot];
+        entry->prev_in_task = nullptr;
+        if (entry->next_in_task != nullptr) {
+            entry->next_in_task->prev_in_task = entry;
+        }
+        task_entry_heads[ring_id][task_slot] = entry;
+    }
+
+    /**
+     * Check if entry is valid (producer has not retired)
+     */
+    bool entry_valid(const PTO2TensorMapEntry &entry) const {
+        return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
+    }
+
+    void remove_entry(PTO2TensorMapEntry &entry) {
+        remove_from_task(entry);
+        free_entry(entry);
+    }
+
+    /**
+     * Remove entry from its task chain (O(1) with prev pointer)
+     * Called during pool wrap-around to unlink reused entries.
+     */
+    void remove_from_task(PTO2TensorMapEntry &entry) {
+        always_assert(entry.bucket_index != -1);  // must still be in a bucket
+        // Update predecessor's next pointer (O(1) via prev_in_task)
+        if (entry.prev_in_task == nullptr) {
+            // Entry is the head of its task chain, update task_entry_heads
+            int32_t ring_id = entry.producer_task_id.ring();
+            int32_t local_id = static_cast<int32_t>(entry.producer_task_id.local());
+            int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
+            task_entry_heads[ring_id][task_slot] = entry.next_in_task;
+        } else {
+            entry.prev_in_task->next_in_task = entry.next_in_task;
+        }
+
+        // Update successor's prev pointer
+        if (entry.next_in_task != nullptr) {
+            entry.next_in_task->prev_in_task = entry.prev_in_task;
+        }
+
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+    }
+
+    // =============================================================================
+    // Debug Utilities
+    // =============================================================================
+
+    /**
+     * Print TensorMap statistics
+     */
+    void print_stats();
+
+    /**
+     * Get count of valid entries
+     */
+    int32_t valid_count();
+
+    // =============================================================================
+    // TensorMap Synchronization
+    // =============================================================================
+
+    /**
+     * Sync TensorMap validity threshold from shared memory
+     *
+     * Called periodically to refresh the lazy invalidation threshold.
+     * Also triggers cleanup if threshold has advanced significantly.
+     */
+    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive);
+};
+
+#if PTO2_TENSORMAP_PROFILING
+struct PTO2TensorMapProfilingData {
+    uint64_t lookup_chain_total;
+    uint64_t lookup_count;
+    int32_t lookup_chain_max;
+    uint64_t overlap_checks;
+    uint64_t overlap_hits;
+    uint64_t insert_count;
+};
+
+PTO2TensorMapProfilingData pto2_tensormap_get_profiling();
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h
new file mode 100644
index 000000000..669771424
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Orchestration Build Graph Types - Data structures for orchestration runtime extensions
+ *
+ * Standalone header defining orchestration-specific types for:
+ * - TaskOutputTensors: Return value from submit containing materialized output Tensors
+ * - Arg: Aggregated argument container for pto_submit_task API
+ *
+ * Tensor descriptor types (Tensor, PTOBufferHandle, TensorCreateInfo) are
+ * defined in tensor.h.
+ *
+ * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h
+ * without type conflicts (Handshake, TensorPair, HostApi).
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+#include "aicpu/dump_arg_selection.h"
+#include "data_type.h"
+#include "profiling_config.h"
+#include "pto_submit_types.h"
+#include "task_args.h"
+#include "tensor.h"
+#include "tensor_create_info.h"  // runtime-only TensorCreateInfo + materialization helpers
+
+typedef enum {
+    ASYNC_ENGINE_SDMA = 0,
+    ASYNC_ENGINE_ROCE = 1,
+    ASYNC_ENGINE_URMA = 2,
+    ASYNC_ENGINE_CCU = 3,
+    NUM_ASYNC_ENGINES = 4,
+} AsyncEngine;
+
+enum class CompletionType : int32_t {
+    COUNTER = 0,
+};
+
+// =============================================================================
+// Task Output Tensors (return value from submit)
+// =============================================================================
+
+enum class PTO2ScopeMode : uint8_t {
+    AUTO = 0,
+    MANUAL = 1,
+};
+
+/**
+ * TaskOutputTensors — returned by submit, holds materialized output Tensors.
+ *
+ * Only runtime-created outputs are stored here, indexed in add_output order.
+ *
+ * The underlying storage is uninitialized; only output_count elements are
+ * valid after submit returns.  This avoids default-constructing Tensor[]
+ * on the hot path (2 KB of unnecessary zeroing per submit).
+ *
+ * Users must hold a named TaskOutputTensors variable and borrow via get_ref();
+ * binding get_ref() on an rvalue is compile-time rejected to prevent dangling.
+ *
+ * LIFETIME — single-scope only:
+ *   Internally this class stores pointers into the submitting task's payload
+ *   (PTO2TaskPayload::tensors[]), which lives in a ring-buffer slot. After
+ *   scope_end the slot becomes eligible for reuse, and a later submit will
+ *   overwrite the same Tensor storage in place. Therefore the
+ *   TaskOutputTensors instance, the const Tensor& returned by get_ref(), and
+ *   any pointer derived from either MUST NOT outlive the PTO2_SCOPE in which
+ *   submit was called — do not move/copy them to outer-scope variables, do
+ *   not capture references by std::reference_wrapper or raw pointers across
+ *   scope boundaries.
+ *
+ *   This invariant is intentionally not enforced at runtime: a reused slot
+ *   simply carries a different but valid owner_task_id, so checking
+ *   owner_task_id cannot distinguish "still mine" from "silently aliased to
+ *   an unrelated task". Misuse manifests as a wrong-tensor read with no
+ *   diagnostic.
+ */
+class TaskOutputTensors {
+public:
+    TaskOutputTensors() :
+        task_id_(PTO2TaskId::invalid()),
+        output_count_(0) {}
+
+    bool empty() const { return output_count_ == 0; }
+    uint32_t size() const { return output_count_; }
+
+    /// Borrow a materialized output tensor by index (lvalue only).
+    const Tensor &get_ref(uint32_t index) const & {
+        always_assert(index < output_count_);
+        return *tensors_[index];
+    }
+    const Tensor &get_ref(uint32_t index) const && = delete;
+
+    /// Runtime-internal: append one materialized output Tensor.
+    void materialize_output(const Tensor &tensor) {
+        always_assert(output_count_ < MAX_TENSOR_ARGS);
+        tensors_[output_count_++] = &tensor;
+    }
+
+    void set_task_id(PTO2TaskId id) { task_id_ = id; }
+
+    PTO2TaskId task_id() const { return task_id_; }
+
+private:
+    PTO2TaskId task_id_;
+    uint32_t output_count_;
+    // Upper bound: a task cannot have more outputs than total tensor args
+    // (every OUTPUT/OUTPUT_EXISTING slot is one of the Arg's tensor slots).
+    const Tensor *tensors_[MAX_TENSOR_ARGS];
+};
+
+// =============================================================================
+// Argument Types (for pto_submit_task API)
+// =============================================================================
+
+// TensorArgType is defined in tensor.h (included via task_args.h above)
+
+/**
+ * Tagged reference to a single Arg slot — either a Tensor* or a
+ * TensorCreateInfo*. The active member is determined by the slot's
+ * TensorArgType tag (OUTPUT → create_info, else → tensor pointer).
+ *
+ * Minimal-permission: the union members are private; content is set only via
+ * operator=(ptr) and read via ref()/create_info(). Copy/move are deleted — a
+ * TensorRef is written in place inside an Arg's slot array, never passed by
+ * value.
+ */
+class TensorRef {
+    union {
+        const Tensor *ptr_;
+        const TensorCreateInfo *create_info_;
+    };
+
+public:
+    TensorRef() :
+        ptr_(nullptr) {}
+    TensorRef(const TensorRef &) = delete;
+    TensorRef(TensorRef &&) = delete;
+    TensorRef &operator=(const TensorRef &) = delete;
+    TensorRef &operator=(TensorRef &&) = delete;
+
+    TensorRef &operator=(const Tensor *p) {
+        ptr_ = p;
+        return *this;
+    }
+    TensorRef &operator=(const TensorCreateInfo *ci) {
+        create_info_ = ci;
+        return *this;
+    }
+
+    const Tensor &ref() const { return *ptr_; }
+    const TensorCreateInfo &create_info() const { return *create_info_; }
+    bool refers_to(const Tensor *t) const { return ptr_ == t; }
+    bool refers_to(const TensorCreateInfo *ci) const { return create_info_ == ci; }
+};
+
+/**
+ * Aggregated argument container for pto_submit_task
+ *
+ * Inherits storage from TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>.
+ * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo)
+ * discriminated by the corresponding tag().
+ * Tensors are dispatched first in kernel args, followed by scalars.
+ *
+ * Output arguments follow two distinct ownership models:
+ * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer
+ *   and materializes a new Tensor, returned via TaskOutputTensors.
+ * - add_inout(const Tensor&): INOUT — reuses an existing Tensor as the write target.
+ *
+ * Example:
+ *   Tensor x = make_tensor_external(dev_a, shapes, 2);
+ *   TensorCreateInfo ci(shapes, 2);  // must outlive submit
+ *   Arg args;
+ *   args.add_input(x);
+ *   args.add_output(ci);
+ *   args.add_scalar(some_value);
+ *   TaskOutputTensors outs = rt_submit_aic_task(kernel_id, args);
+ *   const Tensor& y = outs.get_ref(0);
+ */
+template <size_t MaxT, size_t MaxS>
+struct Arg : TaskArgsTpl<TensorRef, uint64_t, MaxT, MaxS, TensorArgType> {
+    using Base = TaskArgsTpl<TensorRef, uint64_t, MaxT, MaxS, TensorArgType>;
+    // Make dependent-base members visible for unqualified use (two-phase lookup
+    // does not search a dependent base in a class template).
+    using Base::scalar_count_;
+    using Base::scalars_;
+    using Base::tags_;
+    using Base::tensor_count_;
+    using Base::tensors_;
+
+    // Minimal-permission: an Arg is built in place and consumed by reference;
+    // it is never copied/moved (it is a large object, and its TensorRef slots
+    // are non-copyable by design).
+    Arg() = default;
+    Arg(const Arg &) = delete;
+    Arg(Arg &&) = delete;
+    Arg &operator=(const Arg &) = delete;
+    Arg &operator=(Arg &&) = delete;
+
+    bool has_error{false};
+    const char *error_msg{nullptr};
+    PTO2LaunchSpec launch_spec;  // SPMD launch parameters (block_num, etc.)
+
+    void clear() {
+        Base::clear();
+#if PTO2_PROFILING
+        dump_arg_selection_.clear();
+#endif
+        explicit_deps_ = nullptr;
+        explicit_dep_count_ = 0;
+    }
+
+    void reset() {
+        clear();
+        has_error = false;
+        error_msg = nullptr;
+    }
+
+    void set_error(const char *msg) {
+        if (!has_error) {
+            has_error = true;
+            error_msg = msg;
+        }
+    }
+
+    template <typename... Args>
+    void dump(Args &&...args) {
+#if PTO2_PROFILING
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "dump: temporaries are not allowed — pass tensors/scalars already added to this Arg"
+        );
+        static_assert(
+            (is_supported_dump_arg_v<Args> && ...),
+            "dump: all arguments must be Tensor, TensorCreateInfo, or scalar lvalues"
+        );
+        if constexpr (sizeof...(Args) == 0) {
+            mark_all_dump_args();
+        } else {
+            (mark_dump_arg(args), ...);
+        }
+#else
+        ((void)args, ...);
+#endif
+    }
+
+#if PTO2_PROFILING
+    uint64_t dump_arg_mask() const { return dump_arg_selection_.dump_arg_mask(); }
+    uint64_t dump_arg_index_ambiguous_mask() const { return dump_arg_selection_.dump_arg_index_ambiguous_mask(); }
+#else
+    uint64_t dump_arg_mask() const { return 0; }
+    uint64_t dump_arg_index_ambiguous_mask() const { return 0; }
+#endif
+
+    template <typename... Args>
+    void add_input(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) {
+            return;
+        }
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INPUT, tensor_count_++), ...);
+    }
+
+    /// Batch add outputs — all Tensor or all TensorCreateInfo:
+    ///   add_output(ci1, ci2)         — runtime allocates buffers (OUTPUT)
+    ///   add_output(t1, t2)           — write-only existing tensors (OUTPUT_EXISTING)
+    template <typename... Args>
+    void add_output(Args &&...args) {
+        assert_add_tensor_args<true, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) return;
+        if constexpr ((std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...)) {
+            ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT, tensor_count_++), ...);
+        } else {
+            ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING, tensor_count_++),
+             ...);
+        }
+    }
+
+    template <typename... Args>
+    void add_inout(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) {
+            return;
+        }
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INOUT, tensor_count_++), ...);
+    }
+
+    /// No-dependency existing tensor: skips OverlapMap lookup, depends on creator only.
+    template <typename... Args>
+    void add_no_dep(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) return;
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::NO_DEP, tensor_count_++), ...);
+    }
+
+    /**
+     * Attach an explicit dependency array. The Arg stores (ptr, count) without
+     * copying — the caller's array must outlive the submit (same lifetime rule
+     * as add_input/add_output, which also store pointers).
+     *
+     * count == 0 is a valid "set empty" — it clears any previously stored deps
+     * and returns. This lets callers that build the dep set conditionally pass
+     * the result through unguarded, including in the no-dep branch:
+     *   PTO2TaskId deps[3];
+     *   uint32_t n = 0;
+     *   if (have_prev) deps[n++] = prev;
+     *   if (is_last)   deps[n++] = alloc;
+     *   args.set_dependencies(deps, n);    // safe even if n == 0
+     *
+     * For count > 0, the call is single-shot: a second non-empty call after
+     * deps are already set will fail with set_error(). Use count == 0 first
+     * if you need to re-set.
+     */
+    void set_dependencies(const PTO2TaskId *deps, uint32_t count) {
+        if (count == 0) {
+            explicit_deps_ = nullptr;
+            explicit_dep_count_ = 0;
+            return;
+        }
+        if (deps == nullptr) {
+            set_error("set_dependencies: deps must not be null when count > 0");
+            return;
+        }
+        if (explicit_deps_ != nullptr) {
+            set_error("set_dependencies: may be called at most once per Arg");
+            return;
+        }
+        explicit_deps_ = deps;
+        explicit_dep_count_ = count;
+    }
+
+    uint32_t explicit_dep_count() const { return explicit_dep_count_; }
+
+    PTO2TaskId explicit_dep(uint32_t index) const {
+        always_assert(index < explicit_dep_count_);
+        return explicit_deps_[index];
+    }
+
+    const PTO2TaskId *explicit_deps_data() const { return explicit_deps_; }
+
+    /**
+     * Add scalar values. Types are deduced per argument; each value is
+     * bit-cast to uint64_t for storage. Mixed types are allowed:
+     *
+     *   args.add_scalar(uint64_val);                  // single
+     *   args.add_scalar(3.14f, int32_t(42), 7u);     // mixed batch
+     */
+    template <typename... Args>
+    void add_scalar(Args &&...args) {
+        static_assert(sizeof...(Args) >= 1, "add_scalar: at least one argument required");
+        static_assert((is_supported_scalar_arg_v<Args> && ...), "add_scalar: all types must be arithmetic or enum");
+        if (scalar_count_ + sizeof...(Args) > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        (add_scalar_one(std::forward<Args>(args)), ...);
+    }
+
+    void add_scalars(const uint64_t *values, int count) {
+        if (count < 0 || scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t));
+#if PTO2_PROFILING
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
+#endif
+        scalar_count_ += count;
+    }
+
+    /**
+     * Zero-extend int32 bit patterns into uint64 scalar slots.
+     * Negative values are treated as their unsigned 32-bit representation
+     * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF).
+     * Uses NEON to process 4 elements per iteration on aarch64.
+     */
+    void add_scalars_i32(const int32_t *values, int count) {
+        if (count < 0 || scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        uint64_t *dst = &scalars_[scalar_count_];
+#if defined(__aarch64__)
+        int i = 0;
+        for (; i + 4 <= count; i += 4) {
+            uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t *>(values + i));
+            uint64x2_t lo = vmovl_u32(vget_low_u32(v));
+            uint64x2_t hi = vmovl_u32(vget_high_u32(v));
+            vst1q_u64(dst + i, lo);
+            vst1q_u64(dst + i + 2, hi);
+        }
+        for (; i < count; i++) {
+            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
+        }
+#else
+        for (int i = 0; i < count; i++) {
+            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
+        }
+#endif
+#if PTO2_PROFILING
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
+#endif
+        scalar_count_ += count;
+    }
+
+    /**
+     * Copy scalars from another Arg's scalar array.
+     * Useful when multiple tasks share the same scalar data (e.g., block indices).
+     */
+    void copy_scalars_from(const Arg &src, int src_offset, int count) {
+        if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) {
+            set_error("Source scalar range out of bounds in copy_scalars_from");
+            return;
+        }
+        if (scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t));
+#if PTO2_PROFILING
+        dump_arg_selection_.copy_scalar_dtypes_from(src.dump_arg_selection_, scalar_count_, src_offset, count);
+#endif
+        scalar_count_ += count;
+    }
+
+#if PTO2_PROFILING
+    const uint8_t *scalar_dtypes() const { return dump_arg_selection_.scalar_dtypes(); }
+#else
+    const uint8_t *scalar_dtypes() const { return nullptr; }
+#endif
+
+private:
+    // Caller-owned dependency array; lifetime must extend through submit.
+#if PTO2_PROFILING
+    DumpArgSelection dump_arg_selection_;
+#endif
+    const PTO2TaskId *explicit_deps_{nullptr};
+    uint32_t explicit_dep_count_{0};
+#if PTO2_PROFILING
+    template <typename T>
+    static constexpr bool is_supported_dump_arg_v =
+        std::is_same_v<std::decay_t<T>, Tensor> || std::is_same_v<std::decay_t<T>, TensorCreateInfo> ||
+        is_supported_scalar_arg_v<T>;
+#endif
+
+    // Capacity-overflow messages — spell the actual limit (MaxS/MaxT, whatever
+    // the instantiation is) into the text via std::to_string. Built once into a
+    // function-local static so set_error() can hold the const char* safely.
+    static const char *scalar_cap_msg() {
+        static const std::string msg = "Too many scalar args (max " + std::to_string(MaxS) + ")";
+        return msg.c_str();
+    }
+    static const char *tensor_cap_msg() {
+        static const std::string msg = "Too many tensor args (max " + std::to_string(MaxT) + ")";
+        return msg.c_str();
+    }
+
+    template <typename T>
+    void add_scalar_one(T &&value) {
+        scalars_[scalar_count_] = to_u64(value);
+#if PTO2_PROFILING
+        uintptr_t scalar_source_ptr = 0;
+        if constexpr (std::is_lvalue_reference_v<T>) {
+            scalar_source_ptr = reinterpret_cast<uintptr_t>(&value);
+        }
+        dump_arg_selection_.record_scalar_source(
+            scalar_count_, scalar_source_ptr, dtype_of<std::remove_cv_t<std::remove_reference_t<T>>>()
+        );
+#endif
+        scalar_count_++;
+    }
+
+#if PTO2_PROFILING
+    // No-arg dump(): mark every arg already added to this Arg.
+    void mark_all_dump_args() {
+        if (tensor_count_ == 0 && scalar_count_ == 0) {
+            set_error("dump: no arguments added to this Arg");
+            return;
+        }
+        dump_arg_selection_.mark_all(tensor_count_, scalar_count_);
+    }
+
+    void mark_dump_arg(const Tensor &tensor) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].refers_to(&tensor)) {
+                dump_arg_selection_.mark_index(i);
+                return;
+            }
+        }
+        set_error("dump: tensor is not part of this Arg");
+    }
+
+    void mark_dump_arg(const TensorCreateInfo &create_info) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].refers_to(&create_info)) {
+                dump_arg_selection_.mark_index(i);
+                return;
+            }
+        }
+        set_error("dump: TensorCreateInfo is not part of this Arg");
+    }
+
+    template <typename T>
+    std::enable_if_t<is_supported_scalar_arg_v<T>, void> mark_dump_arg(const T &scalar) {
+        uintptr_t ptr = reinterpret_cast<uintptr_t>(&scalar);
+        if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) {
+            return;
+        }
+        set_error("dump: scalar is not part of this Arg");
+    }
+#endif
+
+    // Compile-time validation: arg count, value category (reject temporaries —
+    // a stored &arg would dangle after the call), and element type. Driven
+    // purely by Args, with no runtime state.
+    template <bool is_output, typename... Args>
+    static void assert_add_tensor_args() {
+        static_assert(sizeof...(Args) >= 1, "at least one argument required");
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "temporaries are not allowed — stored pointers would dangle after the call"
+        );
+        if constexpr (is_output) {
+            static_assert(
+                (std::is_same_v<std::decay_t<Args>, Tensor> && ...) ||
+                    (std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...),
+                "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)"
+            );
+        } else {
+            static_assert((std::is_same_v<std::decay_t<Args>, Tensor> && ...), "all arguments must be Tensor");
+        }
+    }
+
+    // Runtime validation: tensor-before-scalar ordering + slot capacity. Records
+    // an error and returns false on violation.
+    bool check_add_tensor_capacity(int32_t count) {
+        if (scalar_count_ != 0) {
+            set_error(
+                "add_input/add_output/add_inout called after add_scalar: "
+                "all tensors must be added before any scalars"
+            );
+            return false;
+        }
+        if (tensor_count_ + count > static_cast<int32_t>(MaxT)) {
+            set_error(tensor_cap_msg());
+            return false;
+        }
+        return true;
+    }
+};
+
+// =============================================================================
+// Task-args layer aliases
+// =============================================================================
+//
+// L0TaskArgs — core-level container used to build and submit tasks inside
+//   orchestration (small, stack-friendly).
+using L0TaskArgs = Arg<MAX_TENSOR_ARGS, MAX_SCALAR_ARGS>;
+
+// L2TaskArgs — chip-level entry-arg holding the orchestration entry's
+// already-allocated inputs (capacity matches ChipStorageTaskArgs).
+// aicpu_orchestration_entry/config receive a const L2TaskArgs&.
+struct L2TaskArgs : Arg<CHIP_MAX_TENSOR_ARGS, CHIP_MAX_SCALAR_ARGS> {
+    // Build from the executor's ChipStorageTaskArgs: each input becomes a
+    // TensorRef pointing at src's Tensor, so `src` must outlive this (on the
+    // executor path src is runtime->orch_args_storage_, alive for the whole run).
+    void create_from_chip_args(const ChipStorageTaskArgs &src) {
+        reset();
+        for (int32_t i = 0; i < src.tensor_count(); ++i) {
+            // Entry inputs are external submit-time tensors; the entry binds them
+            // by const Tensor& (replacing from_tensor_arg's old version/manual_dep
+            // reset), so this invariant is what keeps that binding behavior-preserving.
+            const Tensor &t = src.tensor(i);
+            debug_assert(!t.manual_dep && t.version == 0);
+            add_input(t);
+        }
+        for (int32_t i = 0; i < src.scalar_count(); ++i) {
+            add_scalar(src.scalar(i));
+        }
+    }
+};
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/runtime.h b/src/a5/runtime/fully_distributed_within_core/runtime/runtime.h
new file mode 100644
index 000000000..4ac9c2db4
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/runtime.h
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Class - Device Execution and Handshake Control
+ *
+ * This class manages device-side execution through AICPU-AICore handshake
+ * protocol. Task graph construction is handled by PTO2Runtime; this class
+ * only handles:
+ * - Handshake buffers for AICPU-AICore communication
+ * - Execution parameters (block_dim, aicpu_thread_num)
+ * - Tensor pair management for host-device memory tracking
+ * - Device orchestration state (gm_sm_ptr_, orch_args_)
+ * - Function address mapping (func_id_to_addr_)
+ *
+ * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler.
+ * At dispatch time, build_payload() copies tensor pointers and scalars from
+ * the task payload into the per-core args[], populates SPMD context, then
+ * signals AICore via DATA_MAIN_BASE.
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>   // for fprintf, printf
+#include <string.h>  // for memset
+
+#include <vector>
+
+#include "common/core_type.h"
+#include "common/platform_config.h"
+#include "pto2_dispatch_payload.h"
+#include "task_args.h"
+
+// =============================================================================
+// Configuration Macros
+// =============================================================================
+
+#define RUNTIME_MAX_ARGS 128
+#define RUNTIME_MAX_WORKER 108  // 36 AIC + 72 AIV cores
+#define RUNTIME_MAX_FUNC_ID 1024
+#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 4MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
+
+// Default ready queue shards: one shard per worker thread (total minus orchestrator)
+constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
+
+// =============================================================================
+// Data Structures
+// =============================================================================
+
+/**
+ * Handshake Structure - Shared between Host, AICPU, and AICore
+ *
+ * This structure facilitates communication and synchronization between
+ * AICPU and AICore during task execution.
+ *
+ * Protocol State Machine:
+ * 1. Initialization: AICPU sets aicpu_ready=1
+ * 2. Acknowledgment: AICore sets aicore_done=core_id+1
+ * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload
+ * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes
+ * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion
+ * 6. Shutdown: AICPU sets control=1, AICore exits
+ *
+ * Each AICore instance has its own handshake buffer to enable concurrent
+ * task execution across multiple cores.
+ */
+
+/**
+ * Handshake buffer for AICPU-AICore communication
+ *
+ * Each AICore has its own handshake buffer for synchronization with AICPU.
+ * The structure is cache-line aligned (64 bytes) to prevent false sharing
+ * between cores and optimize cache coherency operations.
+ *
+ * Profiling state lives outside this struct: enablement bits and per-core
+ * ring/reg addresses travel through `KernelArgs::enable_profiling_flag` +
+ * `KernelArgs::aicore_* per-core address arrays`, which the AICore kernel entry
+ * forwards into platform-owned per-core slots
+ * (`aicore/aicore_profiling_state.h`). Adding a profiling sub-feature does
+ * not require touching this struct anymore.
+ *
+ * Field Access Patterns:
+ * - aicpu_ready: Written by AICPU, read by AICore
+ * - aicore_done: Written by AICore, read by AICPU
+ * - task: Written by AICPU, read by AICore (Init: PTO2DispatchPayload*; runtime: unused)
+ * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV)
+ * - physical_core_id: Written by AICore (Phase 2), read by AICPU
+ * - aicpu_regs_ready / aicore_regs_ready: handshake sequence flags
+ */
+struct Handshake {
+    volatile uint32_t aicpu_ready;        // AICPU ready signal: 0=not ready, 1=ready
+    volatile uint32_t aicore_done;        // AICore ready signal: 0=not ready, core_id+1=ready
+    volatile uint64_t task;               // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused
+    volatile CoreType core_type;          // Core type: CoreType::AIC or CoreType::AIV
+    volatile uint32_t physical_core_id;   // Physical core ID
+    volatile uint32_t aicpu_regs_ready;   // AICPU register init done: 0=pending, 1=done
+    volatile uint32_t aicore_regs_ready;  // AICore ID reported: 0=pending, 1=done
+} __attribute__((aligned(64)));
+
+/**
+ * Tensor pair for tracking host-device memory mappings.
+ * Used for copy-back during finalize.
+ */
+struct TensorPair {
+    void *host_ptr;
+    void *dev_ptr;
+    size_t size;
+    // false for read-only INPUT tensors: they are never written by the kernel,
+    // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown
+    // keep the safe default of copying back.
+    bool needs_copy_back = true;
+};
+
+/**
+ * Host API function pointers for device memory operations.
+ * Allows runtime to use pluggable device memory backends.
+ */
+struct HostApi {
+    void *(*device_malloc)(size_t size);
+    void (*device_free)(void *dev_ptr);
+    int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
+    int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
+    // Set a device buffer to a byte value (device-side, no PCIe). Used to
+    // zero-init pure OUTPUT buffers in lieu of an H2D copy-in. May be
+    // null on backends that don't wire it; callers must fall back to
+    // copy_to_device.
+    int (*device_memset)(void *dev_ptr, int value, size_t size);
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
+    // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
+    void *(*acquire_pooled_gm_heap)();
+    void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
+    // Single-shot upload of the entire ChipCallable buffer. `callable` is a
+    // `const ChipCallable *` (declared void* to avoid pulling task_interface
+    // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
+    // total byte size, allocates device GM once, fixes up each child's
+    // resolved_addr_ in an internal host scratch (onboard: device addr; sim:
+    // dlopen function pointer), H2D's once, and returns the device-side
+    // address of the ChipCallable header. Pool-managed: identical buffer
+    // contents (FNV-1a 64-bit) hit the dedup cache; all chip buffers are
+    // bulk-freed in DeviceRunner::finalize(). Returns 0 on error or when
+    // child_count() == 0. Caller computes child addrs as
+    //     chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)
+    // and stores them via runtime->set_function_bin_addr(fid, child_dev).
+    uint64_t (*upload_chip_callable_buffer)(const void *callable);
+};
+
+/**
+ * Task structure - Compatibility stub for platform layer
+ *
+ * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
+ * This stub exists only for API compatibility with device_runner.cpp.
+ * Since get_task_count() returns 0, this struct is never actually used.
+ */
+struct Task {
+    int func_id;
+    uint64_t function_bin_addr;
+};
+
+// Per-core entry point of the fully_distributed_within_core engine. Implemented
+// in runtime/dist_engine.cpp (compiled into the AICPU .so), invoked by each
+// AICore worker thread via Runtime::dist.core_main_fn. `runtime` is Runtime*,
+// `core_type` is CoreType (cast to int to keep this typedef header-light).
+// See docs/fully_distributed_within_core.md.
+typedef void (*DistCoreMainFn)(void *runtime, int core_idx, int core_type);
+
+// =============================================================================
+// Runtime Class
+// =============================================================================
+
+/**
+ * Runtime class for device execution and handshake control
+ *
+ * This class manages AICPU-AICore communication through handshake buffers.
+ * Task graph construction is handled by PTO2Runtime; this class only handles
+ * execution control and device orchestration state.
+ */
+class Runtime {
+public:
+    // Handshake buffers for AICPU-AICore communication
+    Handshake workers[RUNTIME_MAX_WORKER];  // Worker (AICore) handshake buffers
+    int worker_count;                       // Number of active workers
+
+    // Execution parameters for AICPU scheduling.
+    //
+    // aicpu_thread_num is the *total* AICPU thread count launched on this run
+    // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
+    // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
+    // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
+    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
+    int aicpu_thread_num;
+    int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
+
+    // Filter-style affinity gate input (a5 onboard). Host fills before
+    // launch from device-side OCCUPY + DSMI CPU_TOPO via
+    // pto::a5::compute_allowed_cpus. The on-device gate keeps threads whose
+    // sched_getcpu() lands on one of these cpu_ids; exec_idx = position in
+    // this array drives sched/orch role assignment. Indices 0..count-2 are
+    // scheduler slots, index count-1 is the orchestrator slot. Sized to
+    // PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH for headroom — current
+    // policy is 4 sched + 1 orch = 5 active.
+    int32_t aicpu_allowed_cpus[16];
+    int32_t aicpu_allowed_cpu_count;
+    // Actual AICPU thread launch count for this run. Host sets from
+    // popcount(OCCUPY) via the topology probe. See the matching field in
+    // src/a5/runtime/host_build_graph/runtime/runtime.h for rationale.
+    int32_t aicpu_launch_count;
+
+    // PTO2 integration: kernel_id -> GM function_bin_addr mapping
+    // NOTE: Made public for direct access from aicore code
+    uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
+
+    // Orchestrator-to-scheduler transition control
+    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
+    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
+    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
+    bool orch_to_sched;
+
+    // ---- fully_distributed_within_core handoff (SPMD-on-core) ----
+    // The AICPU orchestrator thread does dlopen/arena setup, then hands the
+    // resolved orchestration entry + per-core engine off to the AICore worker
+    // threads through these fields instead of running orchestration/scheduling
+    // itself. Each AICore worker invokes core_main_fn(runtime, idx, core_type)
+    // once `go` is set, then increments `done_count` when finished. See
+    // runtime/dist_engine.* and docs/fully_distributed_within_core.md.
+    struct DistHandoff {
+        volatile uint64_t core_main_fn;  // DistCoreMainFn (in AICPU .so)
+        volatile uint32_t go;            // 1 once engine wired and cores may start
+        volatile int32_t num_workers;    // number of AICore workers participating
+        volatile int32_t done_count;     // workers atomically increment when done
+    } dist;
+
+private:
+    // Kernel binary tracking for cleanup
+    int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID];
+    int registered_kernel_count_;
+
+    void *gm_sm_ptr_;                        // GM pointer to PTO2 shared memory (device)
+    void *gm_heap_ptr_;                      // GM heap for orchestrator output buffers (device)
+    void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
+    ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
+
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; AICPU reads them in the boot path to skip
+    // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer
+    // (already populated by runtime_init_data_from_layout + wire on host).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
+    // Device orchestration SO (for dlopen on AICPU thread 3).
+    // The SO bytes themselves live in a separately-allocated device buffer
+    // owned by DeviceRunner; only the metadata below travels inside Runtime.
+    uint64_t dev_orch_so_addr_;
+    uint64_t dev_orch_so_size_;
+    // Per-callable_id dispatch. AICPU dispatches via
+    // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
+    // signals whether the host is delivering a freshly-registered
+    // callable_id (write+dlopen) or reusing an already-loaded one.
+    int32_t active_callable_id_;
+    bool register_new_callable_id_;
+    char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+    char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+
+public:
+    /**
+     * Constructor - zero-initialize all arrays
+     */
+    Runtime();
+
+    // =========================================================================
+    // Performance Profiling
+    // =========================================================================
+
+    // =========================================================================
+    // Device orchestration (for AICPU thread 3)
+    // =========================================================================
+
+    void *get_gm_sm_ptr() const;
+    void *get_gm_heap_ptr() const;
+    const ChipStorageTaskArgs &get_orch_args() const;
+    void set_gm_sm_ptr(void *p);
+    void set_gm_heap(void *p);
+    void set_slot_states_ptr(void *p);
+    void set_orch_args(const ChipStorageTaskArgs &args);
+
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
+    // Device orchestration SO binary (for dlopen on AICPU thread 3)
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
+    uint64_t get_dev_orch_so_addr() const;
+    uint64_t get_dev_orch_so_size() const;
+    // Per-callable_id dispatch. callable_id must be in
+    // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU
+    // whether to (re)load the orch SO into orch_so_table_[callable_id] or
+    // reuse the cached entry.
+    void set_active_callable_id(int32_t callable_id, bool is_new);
+    int32_t get_active_callable_id() const;
+    bool register_new_callable_id() const;
+    void set_device_orch_func_name(const char *name);
+    const char *get_device_orch_func_name() const;
+    void set_device_orch_config_name(const char *name);
+    const char *get_device_orch_config_name() const;
+
+    uint64_t get_function_bin_addr(int func_id) const;
+    void set_function_bin_addr(int func_id, uint64_t addr);
+    /**
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_callable_to_runtime so prepared kernel
+     * binaries are not freed by validate_runtime_impl across runs.
+     */
+    void replay_function_bin_addr(int func_id, uint64_t addr);
+
+    int get_registered_kernel_count() const;
+    int get_registered_kernel_func_id(int index) const;
+    void clear_registered_kernels();
+
+    // =========================================================================
+    // Deprecated API (for platform compatibility, always returns 0/nullptr)
+    // Task graph is now managed by PTO2Runtime, not Runtime
+    // =========================================================================
+
+    /** @deprecated Task count is now in PTO2 shared memory */
+    int get_task_count() const { return 0; }
+
+    /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */
+    Task *get_task(int) { return nullptr; }
+
+    // =========================================================================
+    // Host API (host-only, not copied to device)
+    // =========================================================================
+
+    // Host API function pointers for device memory operations
+    // NOTE: Placed at end of class to avoid affecting device memory layout
+    HostApi host_api;
+
+    // Host-side tensor ledger for D2H copy-back at finalize. Populated by
+    // runtime_maker.cpp from orch_args at bind time, then iterated in
+    // validate_runtime_impl. Not read by AICPU/AICore — the device-side
+    // Runtime image carries the std::vector control block as harmless
+    // garbage, identical to host_api above. No fixed cap — grows with the
+    // chip-level entry-tensor count.
+    std::vector<TensorPair> tensor_pairs_;
+};
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp
new file mode 100644
index 000000000..4b7484bc9
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Scheduler Implementation
+ *
+ * Implements scheduler state management, ready queues, and task lifecycle.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_scheduler.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include "common/unified_log.h"
+
+#if PTO2_PROFILING
+// Weak fallbacks for host/UT builds that don't link the scope_stats collector.
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
+extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
+#endif
+
+// =============================================================================
+// Scheduler Profiling Counters
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+#include "common/platform_config.h"
+
+uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {};
+
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
+    PTO2SchedProfilingData d;
+    d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0);
+    d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0);
+    d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0);
+    d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0);
+    d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0);
+    d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0);
+    d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0);
+    d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0);
+    d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0);
+    d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0);
+    d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0);
+    d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0);
+    d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0);
+    return d;
+}
+#endif
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2SchedulerState::print_stats() {
+    PTO2SchedulerState *sched = this;
+    LOG_INFO_V0("=== Scheduler Statistics ===");
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (sched->ring_sched_states[r].last_task_alive > 0) {
+            LOG_INFO_V0("Ring %d:", r);
+            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
+            auto &dp = sched->ring_sched_states[r].dep_pool;
+            if (dp.top > 0) {
+                LOG_INFO_V0(
+                    "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
+                    dp.high_water, dp.capacity
+                );
+            }
+        }
+    }
+#if PTO2_SCHED_PROFILING
+    LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
+    LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
+#endif
+    LOG_INFO_V0("============================");
+}
+
+void PTO2SchedulerState::print_queues() {
+    PTO2SchedulerState *sched = this;
+    LOG_INFO_V0("=== Ready Queues ===");
+
+    const char *shape_names[] = {"AIC", "AIV", "MIX"};
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
+    }
+    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
+
+    LOG_INFO_V0("====================");
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h
new file mode 100644
index 000000000..6413917f0
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h
@@ -0,0 +1,1267 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Scheduler Interface
+ *
+ * The Scheduler is responsible for:
+ * 1. Maintaining per-resource-shape ready queues
+ * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED)
+ * 3. Managing fanin/fanout refcounts for dependency resolution
+ * 4. Advancing last_task_alive for heap reclamation
+ * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete)
+ *
+ * The Scheduler runs on Device AI_CPU and processes:
+ * - Task state transitions based on fanin_refcount
+ * - Buffer lifecycle based on fanout_refcount
+ * - Ring pointer advancement for flow control
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "common/core_type.h"
+#include "utils/device_arena.h"
+#include "pto_async_wait.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+#if PTO2_SCHED_PROFILING
+#include "aicpu/device_time.h"
+#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1
+#define PTO2_SCHED_CYCLE_LAP(acc)   \
+    do {                            \
+        _st1 = get_sys_cnt_aicpu(); \
+        acc += (_st1 - _st0);       \
+        _st0 = _st1;                \
+    } while (0)
+#endif
+
+// =============================================================================
+// Ready Queue (Lock-free bounded MPMC — Vyukov design)
+// =============================================================================
+
+/**
+ * Per-slot entry: sequence counter for ABA safety + task payload
+ */
+struct PTO2ReadyQueueSlot {
+    std::atomic<int64_t> sequence;
+    PTO2TaskSlotState *slot_state;
+};
+
+/**
+ * Thread-local ready buffer for local-first dispatch optimization.
+ *
+ * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1).
+ * Initialized once before the scheduling loop; must be empty at
+ * the start of each iteration (verified by always_assert).
+ *
+ * Phase 1 fills per-CoreType buffers via on_task_complete().
+ * The dispatch stage drains them local-first via get_ready_tasks_batch,
+ * with any remaining tasks pushed to the global ready queue.
+ */
+// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
+static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
+
+struct PTO2LocalReadyBuffer {
+    PTO2TaskSlotState **slot_states = nullptr;
+    int count = 0;
+    int capacity = 0;
+
+    void reset(PTO2TaskSlotState **buf, int cap) {
+        slot_states = buf;
+        count = 0;
+        capacity = cap;
+    }
+
+    bool try_push(PTO2TaskSlotState *s) {
+        if (slot_states && count < capacity) {
+            slot_states[count++] = s;
+            return true;
+        }
+        return false;
+    }
+
+    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
+};
+
+/**
+ * Lock-free bounded MPMC queue (Dmitry Vyukov design)
+ *
+ * Key properties:
+ * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
+ * - Per-slot sequence counter prevents ABA problem
+ * - Empty queue pop returns immediately (single atomic load, no lock)
+ * - CAS contention is split: producers only touch enqueue_pos,
+ *   consumers only touch dequeue_pos
+ */
+struct alignas(64) PTO2ReadyQueue {
+    PTO2ReadyQueueSlot *slots;
+    uint64_t capacity;
+    uint64_t mask;        // capacity - 1
+    char _pad0[64 - 24];  // Pad to own cache line
+
+    std::atomic<uint64_t> enqueue_pos;
+    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    std::atomic<uint64_t> dequeue_pos;
+    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    uint64_t size() {
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        return (e >= d) ? (e - d) : 0;
+    }
+
+    bool push(PTO2TaskSlotState *slot_state) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    break;
+                }
+            } else if (diff < 0) {
+                return false;  // Queue full
+            }
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+
+    // Batch push: reserve count slots with a single CAS after confirming
+    // every target slot is available under the usual Vyukov sequence check.
+    void push_batch(PTO2TaskSlotState **items, int count) {
+        if (count == 0) return;
+
+        uint64_t pos;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            bool ready = true;
+            for (int i = 0; i < count; i++) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + i);
+                if (diff != 0) {
+                    ready = false;
+                    break;
+                }
+            }
+            if (!ready) {
+                continue;
+            }
+            if (enqueue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            slot->slot_state = items[i];
+            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            atomic_ops += 2;  // enqueue_pos.load + sequence.load
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                return false;  // Queue full
+            } else {
+                contended = true;  // diff > 0: slot not yet released, spin
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+#endif
+
+    PTO2TaskSlotState *pop() {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    ))
+                    break;
+            } else if (diff < 0) {
+                return nullptr;  // Queue empty
+            }
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+
+#if PTO2_SCHED_PROFILING
+    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            atomic_ops += 2;  // dequeue_pos.load + sequence.load
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                atomic_count += atomic_ops;
+                return nullptr;  // Queue empty
+            } else {
+                contended = true;
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+#endif
+
+    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
+    // Returns actual number of items popped (may be less than max_count).
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t pos;
+        int count;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                count = -1;
+                break;
+            }
+            if (count == 0) return 0;
+            if (count < 0) continue;
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        int count;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            atomic_ops++;  // dequeue_pos.load
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                atomic_ops++;  // sequence.load
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                contended = true;
+                count = -1;
+                break;
+            }
+            if (count == 0) {
+                atomic_count += atomic_ops;
+                return 0;
+            }
+            if (count < 0) {
+                continue;
+            }
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                atomic_ops++;  // successful CAS
+                break;
+            }
+            contended = true;
+            atomic_ops++;  // failed CAS
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+            atomic_ops++;  // sequence.store
+        }
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+        return count;
+    }
+#endif
+};
+
+// Cold-path ready queue operations (defined in pto_scheduler.cpp). Declared
+// as non-member so PTO2ReadyQueue stays a POD-like struct with cache-line
+// alignment. Storage is owned by the caller-supplied arena.
+//   reserve_layout: declare the slots[] region on the arena (must precede commit)
+//   init_from_layout: bind slots pointer from arena.region_ptr(off) and
+//                     initialize sequence counters
+//   destroy: forget the slots pointer (arena owns the buffer)
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
+void ready_queue_destroy(PTO2ReadyQueue *queue);
+
+// =============================================================================
+// SPSC Queue (Single-Producer Single-Consumer, wait-free)
+// =============================================================================
+//
+// Bounded ring buffer optimized for the wiring queue use case:
+//   - Producer: orchestrator thread (push)
+//   - Consumer: scheduler thread 0 (pop_batch)
+//
+// Design based on Rigtorp's cached-index technique: each side caches
+// the other's index locally, avoiding cross-core cache line bouncing
+// on the hot path. Only when the local cache says "full" or "empty"
+// does the thread issue an acquire load on the remote index.
+//
+// Memory layout: 5 cache-line-aligned fields ensure zero false sharing.
+
+struct alignas(64) PTO2SpscQueue {
+    // --- Producer cache lines (orchestrator thread) ---
+    alignas(64) std::atomic<uint64_t> head_{0};
+    alignas(64) uint64_t tail_cached_{0};
+
+    // --- Consumer cache lines (scheduler thread 0) ---
+    alignas(64) std::atomic<uint64_t> tail_{0};
+    alignas(64) uint64_t head_cached_{0};
+
+    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
+    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
+    uint64_t mask_{0};
+
+    // Padding to exactly 5 cache lines
+    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
+
+    // Reserve the backing buffer region on the supplied arena. Returns the
+    // region offset, to be passed to init_from_layout() after the arena is
+    // committed. Cache-line aligned: the buffer is shared between the
+    // orchestrator (push) and scheduler thread 0 (pop_batch), so its base
+    // must not false-share with neighboring regions.
+    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) {
+        return arena.reserve(capacity * sizeof(uintptr_t), PTO2_ALIGN_SIZE);
+    }
+
+    // Writes everything except the arena-internal `buffer_` pointer field
+    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
+    // image without storing a host address in buffer_; the AICPU wires
+    // buffer_ at boot via wire_arena_pointers().
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
+        for (uint64_t i = 0; i < capacity; i++)
+            buf[i] = nullptr;
+        mask_ = capacity - 1;
+        head_.store(0, std::memory_order_relaxed);
+        tail_.store(0, std::memory_order_relaxed);
+        tail_cached_ = 0;
+        head_cached_ = 0;
+        return true;
+    }
+
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
+    // Arena owns the buffer; here we only forget our pointer.
+    void destroy() { buffer_ = nullptr; }
+
+    // Push one item (producer only). Returns false if queue is full.
+    // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the
+    // effective usable capacity is capacity-1 (one slot is wasted as a
+    // sentinel to distinguish full from empty). uint64_t wrapping is safe
+    // since head and tail are monotonically increasing and subtraction
+    // wraps correctly.
+    bool push(PTO2TaskSlotState *item) {
+        uint64_t h = head_.load(std::memory_order_relaxed);
+        uint64_t next_h = h + 1;
+        if (next_h - tail_cached_ > mask_) {
+            tail_cached_ = tail_.load(std::memory_order_acquire);
+            if (next_h - tail_cached_ > mask_) {
+                return false;
+            }
+        }
+        buffer_[h & mask_] = item;
+        head_.store(next_h, std::memory_order_release);
+        return true;
+    }
+
+    // Pop up to max_count items (consumer only). Returns actual count.
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t t = tail_.load(std::memory_order_relaxed);
+        uint64_t avail = head_cached_ - t;
+        if (avail < static_cast<uint64_t>(max_count)) {
+            head_cached_ = head_.load(std::memory_order_acquire);
+            avail = head_cached_ - t;
+            if (avail == 0) return 0;
+        }
+        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
+        for (int i = 0; i < count; i++) {
+            out[i] = buffer_[(t + i) & mask_];
+        }
+        tail_.store(t + count, std::memory_order_release);
+        return count;
+    }
+
+    // Approximate size (used for backoff decisions, not exact).
+    uint64_t size() const {
+        uint64_t h = head_.load(std::memory_order_acquire);
+        uint64_t t = tail_.load(std::memory_order_acquire);
+        return h - t;
+    }
+};
+
+static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
+// =============================================================================
+
+/**
+ * Statistics returned by mixed-task completion processing
+ */
+struct CompletionStats {
+    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
+    int32_t tasks_enqueued;     // Number of consumers that became READY
+    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
+    bool mixed_task_completed;  // True only when this callback completed a mixed task
+};
+
+/**
+ * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). Holds
+ * the arena offsets of every sub-region the scheduler needs plus the
+ * capacities used at layout time (init_from_layout reuses them).
+ */
+struct PTO2SchedulerLayout {
+    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
+    size_t off_dummy_ready_queue_slots;
+    size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH];
+    size_t off_wiring_spsc_buffer;
+    uint64_t ready_queue_capacity;
+    uint64_t spsc_capacity;
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+};
+
+/**
+ * Scheduler state structure
+ *
+ * Contains dynamic state updated during task execution.
+ * Separated from shared memory for cache efficiency.
+ * Hot-path methods are defined inline (implicitly inline as member functions).
+ */
+struct PTO2SchedulerState {
+    // Shared memory access
+    PTO2SharedMemoryHeader *sm_header;
+
+    // Per-ring state
+    struct alignas(64) RingSchedState {
+        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
+        PTO2SharedMemoryRingHeader *ring;
+        int32_t last_task_alive;
+        std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
+        alignas(64) PTO2DepListPool dep_pool;
+#if PTO2_PROFILING
+        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
+        alignas(64) std::atomic<int32_t> dep_pool_snapshot_tail;
+        std::atomic<int32_t> dep_pool_snapshot_top;
+#endif
+
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
+        void destroy();
+
+        void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
+
+#if PTO2_PROFILING
+        void publish_dep_pool_snapshot() {
+            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
+            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
+        }
+
+        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
+            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
+            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
+            if (tail > top) tail = top;
+        }
+#endif
+
+        void advance_ring_pointers() {
+            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
+            int32_t old_last_task_alive = last_task_alive;
+
+            while (last_task_alive < current_task_index) {
+                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
+                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) {
+                    break;
+                }
+                last_task_alive++;
+            }
+
+            // Eager reset: prepare reclaimed slots for reuse while still hot in cache.
+            // Safe because last_task_alive has advanced past these slots but
+            // sync_to_sm has not yet published — the orchestrator cannot reuse
+            // them until the release store below.
+            // Skips payload, task, ring_id — immutable after RingSchedState::init().
+            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) {
+                ring->get_slot_state_by_task_id(id).reset_for_reuse();
+            }
+
+            sync_to_sm();
+        }
+    } ring_sched_states[PTO2_MAX_RING_DEPTH];
+
+    // Ready queues remain global (scheduling is ring-agnostic)
+    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
+
+    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
+    // the dispatch loop and completed inline -- never goes to AICore.
+    PTO2ReadyQueue dummy_ready_queue;
+
+    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
+    //
+    // Three cache-line regions by writer:
+    //   1. batch_*  / backoff — thread 0 exclusive (local batch buffer)
+    //   2. queue    — SPSC: orchestrator push, thread 0 pop
+    //   3. orch_needs_drain — orchestrator write, thread 0 read
+    struct alignas(64) WiringState {
+        static constexpr uint64_t BATCH_SIZE = 30;
+        static constexpr int BACKOFF_LIMIT = 32;
+
+        // --- Thread 0 exclusive: local batch buffer + backoff ---
+        int batch_count = 0;
+        int batch_index = 0;
+        int backoff_counter = 0;
+        PTO2TaskSlotState *batch[BATCH_SIZE];
+
+        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
+        alignas(64) PTO2SpscQueue queue;
+
+        // --- Orchestrator write, thread 0 read ---
+        alignas(64) std::atomic<bool> orch_needs_drain{false};
+    } wiring;
+
+    static_assert(
+        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
+    );
+    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
+
+    alignas(64) AsyncWaitList async_wait_list;
+
+    // Statistics (cold path, isolated from hot-path fields)
+#if PTO2_SCHED_PROFILING
+    alignas(64) std::atomic<int64_t> tasks_completed;
+    std::atomic<int64_t> tasks_consumed;
+#endif
+    // =========================================================================
+    // Inline hot-path methods
+    // =========================================================================
+
+    /**
+     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
+     * Called by scheduler thread 0 each loop iteration. Sets fanin_count,
+     * acquires fanout_lock per producer, allocates dep_pool entries, and
+     * pushes ready tasks to the appropriate ready queue.
+     *
+     * @return Number of tasks wired this call.
+     */
+
+    int drain_wiring_queue(bool force_drain = false) {
+        int wired = 0;
+
+        // Refill local batch buffer when exhausted.
+        if (wiring.batch_index >= wiring.batch_count) {
+            // Backoff: defer pop when queue holds fewer than a full batch,
+            // unless force_drain, orch_needs_drain, or backoff limit reached.
+            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
+                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
+                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
+                    wiring.backoff_counter++;
+                    return 0;
+                }
+            }
+            wiring.backoff_counter = 0;
+            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
+            wiring.batch_index = 0;
+            if (wiring.batch_count == 0) return 0;
+        }
+
+        // Process tasks from local buffer in strict FIFO order.
+        while (wiring.batch_index < wiring.batch_count) {
+            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
+            int ring_id = ws->ring_id;
+            auto &rss = ring_sched_states[ring_id];
+            int32_t wfanin = ws->payload->fanin_actual_count;
+
+            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
+                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
+                if (rss.dep_pool.available() < wfanin) {
+#if PTO2_PROFILING
+                    if (is_scope_stats_enabled()) {
+                        rss.publish_dep_pool_snapshot();
+                    }
+#endif
+                    break;  // not enough dep_pool space — keep remainder for next call
+                }
+            }
+
+            wiring.batch_index++;
+            wire_task(rss, ws, wfanin);
+            wired++;
+        }
+
+        return wired;
+    }
+
+    // Route a ready slot to the right global queue. Dummy tasks (empty
+    // active_mask) live in dummy_ready_queue; everything else goes to the
+    // per-shape ready_queues[]. Used by paths that do not have a thread-local
+    // ready buffer (e.g. wiring). See push_ready_routed_local for the
+    // dispatch-time fast path.
+    void push_ready_routed(PTO2TaskSlotState *slot_state) {
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        if (shape == PTO2ResourceShape::DUMMY) {
+            dummy_ready_queue.push(slot_state);
+        } else {
+            ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+        }
+    }
+
+    /**
+     * Wire fanout edges for a single task. Sets fanin_count, acquires each
+     * producer's fanout_lock, allocates dep_pool entries for live producers,
+     * pushes the task to the ready queue once its fanin refcount is satisfied.
+     */
+    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
+        PTO2TaskPayload *wp = ws->payload;
+        ws->fanin_count = wfanin + 1;
+
+        if (wfanin != 0) {
+            int32_t early_finished = 0;
+            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
+                producer->lock_fanout();
+                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
+                if (pstate >= PTO2_TASK_COMPLETED) {
+                    early_finished++;
+                } else {
+                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
+                }
+                producer->unlock_fanout();
+            });
+
+            int32_t init_rc = early_finished + 1;
+            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
+            if (new_rc >= ws->fanin_count) {
+                push_ready_routed(ws);
+            }
+        } else {
+            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
+            push_ready_routed(ws);
+        }
+
+        ws->dep_pool_mark = rss.dep_pool.top;
+#if PTO2_PROFILING
+        if (is_scope_stats_enabled()) {
+            rss.publish_dep_pool_snapshot();
+        }
+#endif
+    }
+
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
+        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(
+                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+            )) {
+            return;
+        }
+
+#if PTO2_SCHED_PROFILING
+        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
+#endif
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
+                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        int32_t fc = slot_state.fanout_count;
+        int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);
+
+        atomic_count += 2;  // fanout_count.load + fanout_refcount.load
+
+        if (rc != fc) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(
+                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+            )) {
+            atomic_count += 1;  // failed CAS
+            return;
+        }
+
+        atomic_count += 1;  // successful CAS
+
+#if PTO2_SCHED_PROFILING
+        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
+#endif
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
+                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+            atomic_count += 2;  // try-lock CAS + unlock store
+        } else {
+            atomic_count += 1;  // failed try-lock CAS
+        }
+    }
+#endif
+
+    void release_producer(PTO2TaskSlotState &slot_state) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        check_and_handle_consumed(slot_state);
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        atomic_count += 1;  // fanout_refcount.fetch_add
+        check_and_handle_consumed(slot_state, atomic_count);
+    }
+#endif
+
+    bool release_fanin_and_check_ready(PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr) {
+        // Atomically increment fanin_refcount and check if all producers are done
+        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
+        // init release, making fanin_count visible — plain load suffices.
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Local-first: try per-CoreType thread-local buffer before global queue
+            // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1]
+            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
+            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state);
+            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast<int32_t>(shape)].push(&slot_state);
+            }
+            return true;
+        }
+        return false;
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    bool release_fanin_and_check_ready(
+        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
+        PTO2LocalReadyBuffer *local_bufs = nullptr
+    ) {
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+        atomic_count += 1;  // fanin_refcount.fetch_add
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Local-first: try per-CoreType thread-local buffer before global queue.
+            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
+            // and go straight to dummy_ready_queue; use the profiling-aware push so
+            // atomic_count / push_wait stay consistent with the non-dummy path.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
+            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast<int32_t>(shape)].push(&slot_state, atomic_count, push_wait);
+            }
+            return true;
+        }
+        return false;
+    }
+#endif
+
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
+        uint64_t &atomic_count, uint64_t &wait_cycle
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count +=
+                ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
+        }
+        return count;
+    }
+#endif
+
+    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
+#if PTO2_ORCH_PROFILING
+        extern uint64_t g_orch_scope_end_atomic_count;
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++) {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count);
+        }
+#else
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++) {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i]);
+        }
+#endif
+    }
+
+    /**
+     * Subtask completion: atomic counter model.
+     * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block.
+     * Atomically increments completed_subtasks and checks whether all subtasks
+     * across all blocks are done.
+     *
+     * @return true if this was the last subtask, completing the entire task.
+     */
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return (prev + 1) == slot_state.total_required_subtasks;
+    }
+
+    /**
+     * Two-stage completion: second stage.
+     * Called exactly once when all subtasks of a mixed task are done
+     * (i.e., on_subtask_complete returned true).
+     * Handles fanout notification, fanin release, and self-consumption check.
+     */
+#if PTO2_SCHED_PROFILING
+    CompletionStats
+#else
+    void
+#endif
+    on_task_complete(
+        PTO2TaskSlotState &slot_state,
+#if PTO2_SCHED_PROFILING
+        int thread_idx,
+#endif
+
+        PTO2LocalReadyBuffer *local_bufs = nullptr
+    ) {
+#if PTO2_SCHED_PROFILING
+        CompletionStats stats = {0, 0, 0, true};
+#endif
+#if PTO2_SCHED_PROFILING
+        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
+        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
+        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
+        uint64_t lock_atomics = 0, lock_wait = 0;
+        PTO2_SCHED_CYCLE_START();
+#endif
+
+#if PTO2_SCHED_PROFILING
+        slot_state.lock_fanout(lock_atomics, lock_wait);
+#else
+        slot_state.lock_fanout();
+#endif
+        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
+        slot_state.unlock_fanout();
+
+#if PTO2_SCHED_PROFILING
+        lock_atomics += 2;  // state.store + unlock.store
+        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
+        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
+#endif
+
+        // Fanout: notify consumers
+#if PTO2_SCHED_PROFILING
+        uint64_t fanout_atomics = 0, push_wait = 0;
+#endif
+        while (current != nullptr) {
+            PTO2TaskSlotState &consumer_slot = *current->slot_state;
+#if PTO2_SCHED_PROFILING
+            stats.fanout_edges++;
+            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs)) {
+                stats.tasks_enqueued++;
+            }
+#else
+            release_fanin_and_check_ready(consumer_slot, local_bufs);
+#endif
+            current = current->next;
+        }
+
+#if PTO2_SCHED_PROFILING
+        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
+        g_sched_push_wait_cycle[thread_idx] += push_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
+        return stats;
+#endif
+    }
+
+    /**
+     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
+     * Returns fanin edge count for profiling.
+     */
+
+#if PTO2_SCHED_PROFILING
+    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
+        PTO2_SCHED_CYCLE_START();
+        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
+        extern uint64_t g_sched_self_atomic_count[];
+        extern uint64_t g_sched_self_consumed_cycle[];
+        extern uint64_t g_sched_complete_count[];
+        uint64_t fanin_atomics = 0;
+#else
+    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
+#endif
+        PTO2TaskPayload *payload = slot_state.payload;
+        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
+#if PTO2_SCHED_PROFILING
+            release_producer(*producer_slot_state, fanin_atomics);
+#else
+            release_producer(*producer_slot_state);
+#endif
+        });
+#if PTO2_SCHED_PROFILING
+        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
+#endif
+
+        // Self consumed check
+#if PTO2_SCHED_PROFILING
+        uint64_t self_atomics = 0;
+        check_and_handle_consumed(slot_state, self_atomics);
+        g_sched_self_atomic_count[thread_idx] += self_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]);
+        g_sched_complete_count[thread_idx]++;
+#else
+        check_and_handle_consumed(slot_state);
+#endif
+        return payload->fanin_actual_count;
+    }
+
+    // === Cold-path API (defined in pto_scheduler.cpp) ===
+
+    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
+    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
+    // Capacities are baked into the returned layout; init_data_from_layout uses
+    // the same values.
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
+    static PTO2SchedulerLayout
+    reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]);
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
+
+    // Forget per-region pointers; arena owns the backing memory.
+    void destroy();
+    void print_stats();
+    void print_queues();
+};
+
+// Scheduler cold-path API is declared as PTO2SchedulerState member functions.
+// See init()/destroy()/print_stats()/print_queues() below the struct definition.
+
+// Short-circuit NotDeferred completions seen during drain so they don't grow
+// entries[]. Mirrors the a2a3 impl; see that mirror for the rationale.
+inline bool
+AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
+#if PTO2_SCHED_PROFILING
+    sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
+#else
+    sink.sched->on_task_complete(slot_state, sink.local_bufs);
+#endif
+    if (*sink.deferred_release_count >= sink.deferred_release_capacity) {
+        while (*sink.deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+            (void)sink.sched->on_task_release(
+                *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx
+            );
+#else
+            sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
+#endif
+        }
+    }
+    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
+    sink.inline_completed++;
+    return true;
+}
+
+template <bool Profiling>
+inline AsyncPollResult AsyncWaitList::poll_and_complete(
+    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+    ,
+    int thread_idx
+#endif
+) {
+    AsyncPollResult result;
+    if (!try_lock()) return result;
+
+    AsyncWaitList::DrainCompletionSink sink{};
+    sink.sched = sched;
+    sink.local_bufs = local_bufs;
+    sink.deferred_release_slot_states = deferred_release_slot_states;
+    sink.deferred_release_count = &deferred_release_count;
+    sink.deferred_release_capacity = deferred_release_capacity;
+#if PTO2_SCHED_PROFILING
+    sink.thread_idx = thread_idx;
+#endif
+
+    int32_t drain_err = PTO2_ERROR_NONE;
+    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
+    if (drain_err != PTO2_ERROR_NONE) {
+        result.error_code = drain_err;
+        unlock();
+        return result;
+    }
+    result.completed += sink.inline_completed;
+
+    for (int32_t i = count - 1; i >= 0; --i) {
+        AsyncWaitEntry &entry = entries[i];
+        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
+        for (int32_t c = 0; c < entry.condition_count; c++) {
+            CompletionCondition &cond = entry.conditions[c];
+            if (cond.satisfied) continue;
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
+                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
+                if (counter_line != last_invalidated_counter_line) {
+                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
+                    last_invalidated_counter_line = counter_line;
+                }
+            }
+            CompletionPollResult poll = cond.test();
+            if (poll.state == CompletionPollState::FAILED) {
+                result.error_code = poll.error_code;
+                result.failed_slot_state = entry.slot_state;
+                unlock();
+                return result;
+            }
+            if (poll.state == CompletionPollState::READY) {
+                cond.satisfied = true;
+                cond.retire();
+                entry.waiting_completion_count--;
+            }
+        }
+
+        if (entry.normal_done && entry.waiting_completion_count <= 0) {
+#if PTO2_SCHED_PROFILING
+            sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs);
+#else
+            sched->on_task_complete(*entry.slot_state, local_bufs);
+#endif
+            if (deferred_release_count >= deferred_release_capacity) {
+                while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                }
+            }
+            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
+            result.completed++;
+
+            int32_t last = count - 1;
+            if (i != last) entries[i] = entries[last];
+            count = last;
+        }
+    }
+
+    unlock();
+    return result;
+}
+
+// =============================================================================
+// Scheduler Profiling Data
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+struct PTO2SchedProfilingData {
+    // Sub-phase cycle breakdown within on_task_complete
+    uint64_t lock_cycle;           // lock_fanout + state store + unlock
+    uint64_t fanout_cycle;         // fanout traversal
+    uint64_t fanin_cycle;          // fanin traversal
+    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
+
+    // Wait times
+    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
+    uint64_t push_wait_cycle;  // CAS contention in push()
+    uint64_t pop_wait_cycle;   // CAS contention in pop()
+
+    // Atomic counts per sub-phase
+    uint64_t lock_atomic_count;
+    uint64_t fanout_atomic_count;
+    uint64_t fanin_atomic_count;
+    uint64_t self_atomic_count;
+    uint64_t pop_atomic_count;
+
+    int64_t complete_count;
+};
+
+/**
+ * Get and reset scheduler profiling data for a specific thread.
+ * Returns accumulated profiling data and resets counters.
+ */
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp
new file mode 100644
index 000000000..5e09042a1
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp
@@ -0,0 +1,1096 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <cinttypes>
+#include <cstdio>
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/platform_regs.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/memory_barrier.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// =============================================================================
+// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache)
+// =============================================================================
+
+static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) {
+    if (header == nullptr || error_code == PTO2_ERROR_NONE) {
+        return;
+    }
+    // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads.
+    int32_t expected = PTO2_ERROR_NONE;
+    if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
+        header->sched_error_thread.store(thread_idx, std::memory_order_release);
+    }
+    if (thread_idx >= 0 && thread_idx < 32) {
+        header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
+    }
+}
+
+LoopAction SchedulerContext::handle_orchestrator_exit(
+    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count
+) {
+    if (completed_.load(std::memory_order_acquire)) {
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+    if (orch_err != PTO2_ERROR_NONE) {
+        LOG_ERROR(
+            "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. "
+            "completed_tasks=%d, total_tasks=%d",
+            thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_
+        );
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+    if (sched_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+
+    bool orch_done = orchestrator_done_;
+    if (!orch_done) return LoopAction::NONE;
+
+    task_count = total_tasks_;
+    if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) {
+        completed_.store(true, std::memory_order_release);
+        LOG_INFO_V0(
+            "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed),
+            task_count
+        );
+        return LoopAction::BREAK_LOOP;
+    }
+    return LoopAction::NONE;
+}
+
+LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
+    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
+    if (!reassigned_.load(std::memory_order_acquire)) {
+        wait_reassign_.fetch_add(1, std::memory_order_release);
+        while (!reassigned_.load(std::memory_order_acquire)) {
+            if (completed_.load(std::memory_order_acquire)) {
+                return LoopAction::BREAK_LOOP;
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+    cores_released = true;
+    return LoopAction::NONE;
+}
+
+LoopAction
+SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
+    if (completed_.load(std::memory_order_acquire)) {
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+    if (orch_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+    if (sched_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    return LoopAction::NONE;
+}
+
+// =============================================================================
+// Stall diagnostic log format.
+//
+// Every line is self-contained — when scheduler threads emit concurrently and
+// device_log interleaves their output, each line still carries enough context
+// to identify which thread / iteration / object it belongs to.
+//
+// Prefix on every line:
+//   [STALL thread=N idle_iterations=K] CATEGORY ...
+//
+// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL
+// together, so lines with the same idle_iterations belong to one diagnostic
+// round; grep "idle_iterations=N" groups one round's output.
+//
+// Categories (and which thread emits them):
+//   SUMMARY  — completed / total counts and scan totals               (thread 0 only)
+//   TASK     — one per non-completed task scanned from shared rings   (thread 0 only)
+//              - state=RUNNING: includes running_on=[...] cross-ref
+//              - state=READY:   fanin satisfied but no idle core yet
+//              - state=WAIT:    includes missing_deps=N
+//   CLUSTER  — one per cluster owned by this thread                   (every thread)
+//              - busy slot shows kernel + task_id + cond_reg_state;
+//                ANOMALY suffix when COND register is fin while software
+//                still has the slot marked busy.
+//
+// Reader workflow:
+//   1. grep SUMMARY                          -> overall completion status
+//   2. grep "idle_iterations=N TASK"         -> stuck RUNNING task and which
+//                                               core/thread it is on
+//   3. grep "idle_iterations=N CLUSTER.*task=<id>" -> cross-check via the
+//                                                     cluster line (or just
+//                                                     read running_on in step 2)
+// =============================================================================
+
+namespace {
+
+// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines.
+// Layout (idle):    coreN(idle)
+// Layout (busy):    coreN(busy kernel=K task=T cond_reg_state=ack)
+// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY)
+//
+// Healthy busy: COND register reports ack (AICore still executing). fin means
+// AICore wrote completion but AICPU hasn't recycled the running slot yet —
+// either a completion-poll bug or the diagnostic raced the recycle.
+void format_core_status(
+    char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond
+) {
+    if (idle) {
+        snprintf(buf, buf_size, "core%d(idle)", core_id);
+        return;
+    }
+    int32_t kernel = -1;
+    int64_t task_id_raw = -1;
+    if (core_state && core_state->running_slot_state) {
+        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
+        kernel = core_state->running_slot_state->task->kernel_id[subslot];
+        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
+    }
+    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
+    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
+    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
+    if (hw_state == TASK_ACK_STATE) {
+        snprintf(
+            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw,
+            cond_reg_state_str
+        );
+    } else {
+        snprintf(
+            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s ANOMALY)", core_id, kernel,
+            task_id_raw, cond_reg_state_str
+        );
+    }
+}
+
+}  // namespace
+
+int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        const int32_t *ids = core_trackers_[t].core_ids();
+        int32_t n = core_trackers_[t].core_num();
+        for (int32_t i = 0; i < n; i++) {
+            if (ids[i] == core_id) return t;
+        }
+    }
+    return -1;
+}
+
+bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
+    const int32_t *cores = core_trackers_[thread_idx].core_ids();
+    int32_t core_num = core_trackers_[thread_idx].core_num();
+    for (int32_t i = 0; i < core_num; i++) {
+        if (core_exec_states_[cores[i]].running_slot_state != nullptr) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool SchedulerContext::no_thread_owns_running_task() const {
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        if (self_owns_running_task(t)) return false;
+    }
+    return true;
+}
+
+void SchedulerContext::log_stall_diagnostics(
+    int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+
+    // T0 owns the shared-ring scan; printing it from other threads would
+    // produce identical TASK lines once per scheduler thread.
+    if (thread_idx == 0) {
+        int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
+            int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
+            submitted_in_ring += ring_task_count;
+            for (int32_t si = 0; si < ring_task_count; si++) {
+                PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
+                PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
+                int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
+                int32_t fi = slot_state.fanin_count;
+                int32_t kid_aic = slot_state.task->kernel_id[0];
+                int32_t kid_aiv0 = slot_state.task->kernel_id[1];
+                int32_t kid_aiv1 = slot_state.task->kernel_id[2];
+                int64_t task_id = static_cast<int64_t>(slot_state.task->task_id.raw);
+                if (st >= PTO2_TASK_COMPLETED) continue;
+                // task_state has no intermediate ready/running value — it
+                // stays PENDING until the worker stores COMPLETED. Classify
+                // by the ground truth instead: a slot is RUNNING iff some
+                // core has it as running_slot_state. A task occupies at most
+                // 3 cores (one cluster), all under the same owner thread by
+                // construction of assign_cores_to_threads.
+                char running_on[192] = {0};
+                int32_t owner = -1;
+                int32_t pos = 0;
+                bool is_running = false;
+                for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) {
+                    if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
+                    is_running = true;
+                    if (owner < 0) owner = find_core_owner_thread(cid);
+                    const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
+                    int32_t written = snprintf(
+                        running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname
+                    );
+                    if (written > 0) pos += written;
+                }
+
+                if (is_running) {
+                    cnt_running++;
+                    if (cnt_running > STALL_DUMP_READY_MAX) continue;
+                    LOG_INFO_V9(
+                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                        " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] "
+                        "running_on=[owner_thread=%d cores=[%s]]",
+                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on
+                    );
+                    continue;
+                }
+                if (rc >= fi) {
+                    cnt_ready++;
+                    if (cnt_ready > STALL_DUMP_READY_MAX) continue;
+                    LOG_INFO_V9(
+                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                        " state=READY   fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]",
+                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1
+                    );
+                    continue;
+                }
+                cnt_waiting++;
+                if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
+                LOG_INFO_V9(
+                    "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                    " state=WAIT    fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d",
+                    thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc
+                );
+            }
+        }
+        int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring;
+        int32_t c = completed_tasks_.load(std::memory_order_relaxed);
+        LOG_INFO_V9(
+            "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d "
+            "scan_ready=%d scan_waiting=%d scan_running=%d",
+            thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running
+        );
+    }
+
+    // CLUSTER lines: one per cluster this thread owns.
+    // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
+    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
+    int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+    for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
+        int32_t offset = cli * 3;
+        int32_t aic_id = tracker.get_aic_core_id(offset);
+        int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
+        int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
+        bool aic_idle = tracker.is_aic_core_idle(offset);
+        bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
+        bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
+        int32_t cluster_id = cli * ast + thread_idx;
+        char aic_buf[128], aiv0_buf[128], aiv1_buf[128];
+        format_core_status(
+            aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr
+        );
+        format_core_status(
+            aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id],
+            core_exec_states_[aiv0_id].reg_addr
+        );
+        format_core_status(
+            aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id],
+            core_exec_states_[aiv1_id].reg_addr
+        );
+        LOG_INFO_V9(
+            "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx,
+            idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf
+        );
+    }
+}
+
+void SchedulerContext::log_shutdown_stall_snapshot(
+    int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+) {
+    LOG_WARN(
+        "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] "
+        "dumping all scheduler threads before emergency shutdown",
+        trigger_thread_idx, trigger_idle_iterations
+    );
+    int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+    if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) {
+        LOG_ERROR(
+            "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx,
+            thread_count, MAX_AICPU_THREADS
+        );
+        thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
+    }
+    for (int32_t t = 0; t < thread_count; t++) {
+        log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count);
+    }
+}
+
+int32_t SchedulerContext::handle_timeout_exit(
+    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+    int32_t last_progress_count
+#if PTO2_PROFILING
+    ,
+    uint64_t sched_start_ts
+#endif
+) {
+    LOG_ERROR(
+        "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations,
+        idle_iterations
+    );
+    latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
+    if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+        log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count);
+#if PTO2_PROFILING
+        // Capture the in-flight kernels' partial output before signalling the
+        // cores to exit, so the dump reflects the live stuck state.
+        if (is_dump_args_enabled()) {
+            dump_running_task_outputs<PTO2_SUBTASK_SLOT_COUNT>(
+                thread_idx, cores_total_num_,
+                [this](int32_t cid) {
+                    return core_exec_states_[cid].running_slot_state;
+                },
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+        emergency_shutdown(runtime);
+    }
+#if PTO2_PROFILING
+    uint64_t sched_timeout_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx,
+        static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts),
+        cycles_to_us(sched_timeout_ts - sched_start_ts)
+    );
+#endif
+    return -PTO2_ERROR_SCHEDULER_TIMEOUT;
+}
+
+#if PTO2_PROFILING
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    uint64_t sched_end_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
+        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
+        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
+    );
+
+    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
+                           l2_swimlane.sched_scan_cycle + l2_swimlane.sched_dispatch_cycle +
+                           l2_swimlane.sched_idle_cycle;
+    if (sched_total == 0) sched_total = 1;
+
+#if PTO2_SCHED_PROFILING
+    {
+        PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
+        uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
+        uint64_t complete_poll =
+            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
+                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
+                0;
+        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
+                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
+                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
+                                      l2_swimlane.sched_dispatch_setup_cycle) :
+                                     0;
+
+        LOG_INFO_V9(
+            "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
+            cycles_to_us(sched_total), cur_thread_completed
+        );
+
+        // fanout / fanin per-thread aggregates live in
+        // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
+        // × core_to_thread).
+        LOG_INFO_V9(
+            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
+            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
+        );
+
+        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
+        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
+                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
+                                           0;
+        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
+                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
+                                       0.0;
+        LOG_INFO_V9(
+            "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
+            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
+            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
+            complete_hit_rate
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_lock     : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
+            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
+            static_cast<uint64_t>(sp.lock_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_fanout   : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
+            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
+            static_cast<uint64_t>(sp.fanout_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_fanin    : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
+            static_cast<uint64_t>(sp.fanin_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_self     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
+            static_cast<uint64_t>(sp.self_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
+            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
+            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
+        );
+
+        LOG_INFO_V9(
+            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
+            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
+        );
+
+        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
+        LOG_INFO_V9(
+            "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
+            dispatch_poll * 100.0 / d_parent
+        );
+        LOG_INFO_V9(
+            "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
+            static_cast<uint64_t>(sp.pop_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
+            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
+            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
+        );
+
+        LOG_INFO_V9(
+            "Thread %d:   scan           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_scan_cycle),
+            l2_swimlane.sched_scan_cycle * 100.0 / sched_total
+        );
+
+#if PTO2_SCHED_PROFILING
+        LOG_INFO_V9(
+            "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
+            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
+            l2_swimlane.phase_wiring_count
+        );
+#else
+        LOG_INFO_V9(
+            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
+            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
+        );
+#endif
+
+        LOG_INFO_V9(
+            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
+            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
+        );
+
+        if (cur_thread_completed > 0) {
+            LOG_INFO_V9(
+                "Thread %d:   avg/complete   : %.3fus", thread_idx,
+                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
+            );
+        }
+    }
+#endif
+    LOG_INFO_V9(
+        "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
+        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
+    );
+}
+#endif
+
+// =============================================================================
+// Shutdown: deinit AICore regs for this thread's cores.
+// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op.
+// platform_deinit_aicore_regs is idempotent; safe to call after early completion.
+// =============================================================================
+int32_t SchedulerContext::shutdown(int32_t thread_idx) {
+    const int32_t *cores = core_trackers_[thread_idx].core_ids();
+    int32_t core_num = core_trackers_[thread_idx].core_num();
+    if (core_num == 0) return 0;
+
+#if PTO2_PROFILING
+    // Restore PMU CTRL registers for this thread's cores before AICore shutdown
+    if (is_pmu_enabled()) {
+        pmu_aicpu_finalize(cores, core_num);
+    }
+#endif
+
+    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num);
+    int32_t rc = 0;
+    for (int32_t i = 0; i < core_num; i++) {
+        int32_t core_id = cores[i];
+        uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
+        if (reg_addr != 0) {
+            // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
+            if (platform_deinit_aicore_regs(reg_addr) != 0) {
+                LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
+                rc = -1;
+            }
+        } else {
+            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
+        }
+    }
+    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
+    return rc;
+}
+
+// =============================================================================
+// Handshake with all AICore workers; discover core type and reg address.
+// =============================================================================
+int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
+    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+    cores_total_num_ = runtime->worker_count;
+
+    // Validate cores_total_num_ before using as array index
+    if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) {
+        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER);
+        return -1;
+    }
+
+    aic_count_ = 0;
+    aiv_count_ = 0;
+
+    LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
+
+    // Step 1: Write per-core payload addresses and send handshake signal.
+    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
+    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
+        OUT_OF_ORDER_STORE_BARRIER();
+        all_handshakes[i].aicpu_ready = 1;
+    }
+    OUT_OF_ORDER_STORE_BARRIER();
+
+    // Get platform physical cores count for validation
+    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
+
+    // Step 2: Wait for all cores to respond, collect core type and register addresses
+    bool handshake_failed = false;
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        Handshake *hank = &all_handshakes[i];
+
+        while (hank->aicore_regs_ready == 0) {
+            SPIN_WAIT_HINT();
+        }
+
+        uint32_t physical_core_id = hank->physical_core_id;
+
+        if (physical_core_id >= max_physical_cores_count) {
+            LOG_ERROR(
+                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
+                max_physical_cores_count
+            );
+            handshake_failed = true;
+            continue;
+        }
+
+        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+        uint64_t reg_addr = regs[physical_core_id];
+
+        // Initialize AICore registers after discovery (first round)
+        platform_init_aicore_regs(reg_addr);
+        OUT_OF_ORDER_STORE_BARRIER();
+        hank->aicpu_regs_ready = 1;
+
+        OUT_OF_ORDER_STORE_BARRIER();
+
+        while (hank->aicore_done == 0) {
+            SPIN_WAIT_HINT();
+        }
+
+        CoreType type = hank->core_type;
+
+        core_exec_states_[i].reg_addr = reg_addr;
+        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+
+#if PTO2_PROFILING
+        physical_core_ids_[i] = physical_core_id;
+#endif
+
+#if !PTO2_PROFILING
+        core_exec_states_[i].worker_id = i;
+        core_exec_states_[i].physical_core_id = physical_core_id;
+        core_exec_states_[i].core_type = type;
+#endif
+
+        if (type == CoreType::AIC) {
+            aic_worker_ids_[aic_count_++] = i;
+            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
+        } else {
+            aiv_worker_ids_[aiv_count_++] = i;
+            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
+        }
+    }
+
+    if (handshake_failed) {
+        emergency_shutdown(runtime);
+        return -1;
+    }
+
+    LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
+    return 0;
+}
+
+// =============================================================================
+// Assign discovered cores to scheduler threads (cluster-aligned round-robin).
+// =============================================================================
+bool SchedulerContext::assign_cores_to_threads() {
+    // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
+    // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
+    active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+    int32_t cluster_count = aic_count_;
+
+    // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
+    int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
+    int32_t thread_cores_num = max_clusters_per_thread * 3;
+
+    if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) {
+        LOG_ERROR("Can't assign more then 64 cores in per scheduler");
+        return false;
+    }
+
+    LOG_INFO_V0(
+        "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count,
+        active_sched_threads_, aic_count_, aiv_count_
+    );
+
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Count clusters per thread first (round-robin may distribute unevenly)
+    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        clusters_per_thread[ci % active_sched_threads_]++;
+    }
+    for (int32_t i = 0; i < active_sched_threads_; i++) {
+        core_trackers_[i].init(clusters_per_thread[i]);
+    }
+
+    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        int32_t t = ci % active_sched_threads_;
+
+        int32_t aic_wid = aic_worker_ids_[ci];
+        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+        core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
+
+        LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid);
+    }
+
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        LOG_INFO_V0(
+            "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(),
+            core_trackers_[t].get_cluster_count()
+        );
+    }
+
+    LOG_INFO_V0(
+        "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num
+    );
+    return true;
+}
+
+// =============================================================================
+// Reassign all cores across all threads (sched + orchestrator) after orchestration.
+// =============================================================================
+void SchedulerContext::reassign_cores_for_all_threads() {
+    LOG_INFO_V0(
+        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
+    );
+
+    // Collect running worker_ids from all current trackers
+    bool running_cores[RUNTIME_MAX_WORKER] = {};
+    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
+        auto all_running = core_trackers_[i].get_all_running_cores();
+        int32_t bp;
+        while ((bp = all_running.pop_first()) >= 0) {
+            running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
+        }
+    }
+
+    // Count clusters per thread (round-robin across all threads)
+    int32_t cluster_count = aic_count_;
+    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        clusters_per_thread[ci % aicpu_thread_num_]++;
+    }
+
+    // Re-init all trackers and reset core counts
+    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
+        core_trackers_[i].init(clusters_per_thread[i]);
+    }
+
+    // Assign clusters round-robin and restore running state
+    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        int32_t t = ci % aicpu_thread_num_;
+
+        int32_t aic_wid = aic_worker_ids_[ci];
+        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+        int32_t cl_idx = cluster_idx_per_thread[t]++;
+        core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
+
+        // init() marks all idle; toggle cores that were running and restore pending_occupied
+        if (running_cores[aic_wid]) {
+            core_trackers_[t].change_core_state(cl_idx * 3);
+            core_trackers_[t].set_pending_occupied(cl_idx * 3);
+        }
+        if (running_cores[aiv0_wid]) {
+            core_trackers_[t].change_core_state(cl_idx * 3 + 1);
+            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
+        }
+        if (running_cores[aiv1_wid]) {
+            core_trackers_[t].change_core_state(cl_idx * 3 + 2);
+            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
+        }
+    }
+
+    // Log final distribution
+    LOG_INFO_V0("Core reassignment complete:");
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        int32_t aic_running = core_trackers_[t].get_running_count<CoreType::AIC>();
+        int32_t aiv_running = core_trackers_[t].get_running_count<CoreType::AIV>();
+        LOG_INFO_V0(
+            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(),
+            core_trackers_[t].get_cluster_count(), aic_running, aiv_running
+        );
+    }
+    active_sched_threads_ = aicpu_thread_num_;
+}
+
+// =============================================================================
+// Emergency shutdown: broadcast exit signal to every handshake'd core and
+// deinit their AICore register blocks. Idempotent.
+// =============================================================================
+void SchedulerContext::emergency_shutdown(Runtime *runtime) {
+    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
+    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+    int32_t timeout_count = 0;
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        Handshake *hank = &all_handshakes[i];
+        OUT_OF_ORDER_STORE_BARRIER();
+        hank->aicpu_regs_ready = 1;
+        if (core_exec_states_[i].reg_addr != 0) {
+            if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) {
+                timeout_count++;
+            }
+        }
+    }
+    if (timeout_count > 0) {
+        LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count);
+    }
+    LOG_WARN("Emergency shutdown complete");
+}
+
+// =============================================================================
+// Lifecycle: init / deinit
+// =============================================================================
+int32_t SchedulerContext::init(
+    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
+) {
+    always_assert(runtime != nullptr);
+
+    // Zero all per-core execution state before handshake
+    memset(core_exec_states_, 0, sizeof(core_exec_states_));
+
+    // Wire thread/transition configuration that handshake/assign need to read.
+    aicpu_thread_num_ = aicpu_thread_num;
+    sched_thread_num_ = sched_thread_num;
+    orch_to_sched_ = orch_to_sched;
+    regs_ = regs_base;
+
+#if PTO2_PROFILING
+    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
+    // header — must be called BEFORE the orchestrator thread caches the level
+    // via rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level() in
+    // AicpuExecutor::run(). Otherwise the cached value would still be DISABLED
+    // (only the binary enable bit has been seeded by kernel.cpp at this point),
+    // and the CYCLE_COUNT_START() gate in pto_orchestrator.cpp would suppress
+    // all ORCH_PHASES records. Reset the cached level on disabled runs so a
+    // prior enabled launch's level can't leak into the phase-record gates in
+    // scheduler_dispatch (`>= SCHED_PHASES`).
+    if (is_l2_swimlane_enabled()) {
+        l2_swimlane_aicpu_init(runtime->worker_count);
+        l2_swimlane_level_ = get_l2_swimlane_level();
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            // When orchestrator phases merge into scheduler threads
+            // (PTO2_ORCH_TO_SCHED=1), phase records flow through
+            // aicpu_thread_num_ pools — matches the same branch in
+            // dump_args_init (scheduler_dispatch.cpp).
+            // Sched phase pool count = number of scheduler threads.
+            // sched_thread_num_ <= 0 is the "use all AICPU threads as
+            // scheduler threads" sentinel (see assign_cores_to_threads'
+            // active_sched_threads_ normalization). Without this
+            // normalization here, init_phase would prime zero sched pools
+            // and all sched_phase emits would silently drop.
+            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
+            // Orch phase is a single instance (PR #971 design), so the orch
+            // pool count is always 1 regardless of orch_to_sched mode.
+            const int orch_phase_threads = 1;
+            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads);
+        }
+    } else {
+        l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
+    }
+#endif
+
+    // Discover cores and assign to scheduler threads.
+    int32_t rc = handshake_all_cores(runtime);
+    if (rc != 0) {
+        LOG_ERROR("handshake_all_cores failed");
+        return rc;
+    }
+    if (!assign_cores_to_threads()) {
+        return -1;
+    }
+
+    // Initialize task counters. Task count comes from PTO2 shared memory.
+    if (runtime->get_gm_sm_ptr()) {
+        auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
+        // Read at one-time boot init, before the SM is reset for the run, so a
+        // ring not yet written holds uninitialized memory (0xbe... under ASAN's
+        // malloc-fill). Sum in int64 and only count rings whose value is a
+        // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold
+        // more than the scope cap. This rejects any garbage pattern (negative
+        // or positive), so uninitialized rings contribute 0 (the correct boot
+        // count) while valid counts still add up, with no signed overflow.
+        int64_t task_count = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+            if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) task_count += ring_tasks;
+        }
+        total_tasks_ = static_cast<int32_t>(task_count);
+    } else {
+        total_tasks_ = 0;
+    }
+    completed_tasks_.store(0, std::memory_order_release);
+
+    // Device orchestration: the orchestrator thread flips this when the graph is built.
+    orchestrator_done_ = false;
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
+    // This is done once at startup and never modified afterwards.
+    for (int32_t t = 0; t < sched_thread_num_; t++) {
+        CoreTracker &tracker = core_trackers_[t];
+        for (int32_t c = 0; c < tracker.get_cluster_count(); c++) {
+            int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
+            auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
+            auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
+            payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
+            payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
+            payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
+            payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
+        }
+    }
+
+    func_id_to_addr_ = runtime->func_id_to_addr_;
+
+    return 0;
+}
+
+void SchedulerContext::deinit() {
+    // Reset all per-core execution state
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i] = {};
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Reset sync-start drain coordination — a previous run that aborted mid-drain
+    // would otherwise leave dirty pending/elected/ack state for the next reuse.
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+
+    // Reset task counters and orchestrator state
+    completed_tasks_.store(0, std::memory_order_release);
+    total_tasks_ = 0;
+    orchestrator_done_ = false;
+    init_claimed_.store(false, std::memory_order_release);
+    init_complete_.store(false, std::memory_order_release);
+
+    // Reset core transition state
+    transition_requested_.store(false, std::memory_order_release);
+    wait_reassign_.store(0, std::memory_order_release);
+    reassigned_.store(false, std::memory_order_release);
+    completed_.store(false, std::memory_order_release);
+
+    // Reset core discovery and assignment state
+    aic_count_ = 0;
+    aiv_count_ = 0;
+    cores_total_num_ = 0;
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+    active_sched_threads_ = 0;
+    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
+        core_trackers_[t] = CoreTracker{};
+    }
+
+    regs_ = 0;
+    sched_ = nullptr;
+    rt_ = nullptr;
+    func_id_to_addr_ = nullptr;
+}
+
+void SchedulerContext::wait_init_complete() const {
+    while (!init_complete_.load(std::memory_order_acquire)) {
+        SPIN_WAIT_HINT();
+    }
+}
+
+void SchedulerContext::bind_runtime(PTO2Runtime *rt) {
+    rt_ = rt;
+    sched_ = &rt->scheduler;
+}
+
+// =============================================================================
+// Post-orchestration bookkeeping. Runs on the orchestrator thread once the
+// build phase finishes; folds inline-completed tasks, flips orchestrator_done_,
+// and drives the orchestrator → scheduler core transition (or fatal shutdown).
+// =============================================================================
+void SchedulerContext::on_orchestration_done(
+    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
+) {
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
+        // Flush orchestrator's phase record buffer (orch pool, ordinal 0)
+        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
+    }
+#endif
+
+    total_tasks_ = total_tasks;
+
+    // Fold tasks completed inline during orchestration
+    int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
+    if (inline_completed > 0) {
+        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+#if PTO2_SCHED_PROFILING
+        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
+#endif
+    }
+    orchestrator_done_ = true;
+
+    // Check for fatal error from orchestration; if so, shut down immediately.
+    int32_t orch_err = 0;
+    if (sched_->sm_header) {
+        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+    }
+    if (orch_err != PTO2_ERROR_NONE) {
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+    }
+
+    // Skip core transition on fatal error — cores already shut down above.
+    if (completed_.load(std::memory_order_acquire)) {
+        // Signal transition to unblock scheduler threads waiting at core transition
+        transition_requested_.store(true, std::memory_order_release);
+        reassigned_.store(true, std::memory_order_release);
+    } else if (orch_to_sched_) {
+        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
+        transition_requested_.store(true, std::memory_order_release);
+
+        // Wait for scheduler threads to acknowledge transition request
+        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
+            if (completed_.load(std::memory_order_acquire)) {
+                break;
+            }
+            SPIN_WAIT_HINT();
+        }
+        if (!completed_.load(std::memory_order_acquire)) {
+            reassign_cores_for_all_threads();
+            reassigned_.store(true, std::memory_order_release);
+        }
+    }
+
+#if PTO2_PROFILING
+    // Write core-to-thread mapping AFTER reassignment so the profiling data
+    // reflects the final distribution (all active_sched_threads_, including
+    // former orchestrator threads when orch_to_sched_ is enabled).
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
+        for (int32_t t = 0; t < active_sched_threads_; t++) {
+            l2_swimlane_aicpu_write_core_assignments_for_thread(
+                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
+            );
+        }
+    }
+#endif
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp
new file mode 100644
index 000000000..7d83249ab
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+// =============================================================================
+// Dual-slot state machine helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
+}
+
+// Pure function: read register result -> SlotTransition (no side effects).
+SlotTransition SchedulerContext::decide_slot_transition(
+    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id
+) {
+    SlotTransition t;
+    if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id) {
+        t.matched = true;
+        t.running_done = true;  // Serial execution: pending event implies running done
+        t.running_freed = true;
+        t.pending_freed = true;
+        if (reg_state == TASK_FIN_STATE) {
+            t.pending_done = true;  // Case 1: pending FIN
+        }
+        // else: Case 2: pending ACK (pending_done stays false)
+    } else if (reg_task_id == running_id) {
+        if (reg_state == TASK_FIN_STATE) {
+            if (pending_id == AICPU_TASK_INVALID) {
+                // Case 3.2: running FIN, no pending -> core goes idle
+                t.matched = true;
+                t.running_done = true;
+                t.running_freed = true;
+            }
+            // Case 3.1: running FIN, pending exists -> skip (transient state).
+            // Case 1/2 (pending ACK/FIN) will complete running implicitly via running_done=true.
+        } else {
+            // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
+            t.matched = true;
+            t.pending_freed = true;
+        }
+    }
+    return t;
+}
+
+// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
+void SchedulerContext::complete_slot_task(
+    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
+    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+    ,
+    uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#else
+    (void)hank;
+#endif
+    // MPSC fast-path: see a2a3 mirror for the full design narrative. The
+    // any_subtask_deferred flag on slot_state discriminates non-deferred
+    // tasks (inline complete in parallel on FIN thread) from deferred ones
+    // (route through the lock-free AICoreCompletionMailbox).
+    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
+    bool defer_completion_to_consumer = false;
+
+    if (slot_state.payload != nullptr) {
+        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
+        int32_t slab_err = deferred_slab->error_code;
+        if (slab_err != PTO2_ERROR_NONE) {
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        uint32_t cond_count = deferred_slab->count;
+        if (cond_count > MAX_COMPLETIONS_PER_TASK) {
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        if (cond_count > 0) {
+            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
+
+            const PTO2TaskId token = slot_state.task->task_id;
+            for (uint32_t i = 0; i < cond_count; ++i) {
+                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
+                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                    SPIN_WAIT_HINT();
+                }
+            }
+        }
+    }
+
+    bool task_complete = sched_->on_subtask_complete(slot_state);
+
+    if (task_complete && slot_state.payload != nullptr &&
+        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
+        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state))) {
+            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+            SPIN_WAIT_HINT();
+        }
+        defer_completion_to_consumer = true;
+    }
+
+    if (task_complete && !defer_completion_to_consumer) {
+#if PTO2_PROFILING
+        if (is_dump_args_enabled()) {
+            dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+#if PTO2_SCHED_PROFILING
+        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
+        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
+        // by the otc_* log lines). Its return value is unused.
+        (void)sched_->on_task_complete(slot_state, thread_idx, local_bufs);
+#else
+        sched_->on_task_complete(slot_state, local_bufs);
+#endif
+#if PTO2_PROFILING
+        l2_swimlane.phase_complete_count++;
+#endif
+        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        } else {
+            LOG_INFO_V9("Thread %d: release", thread_idx);
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                // SCHED_PROFILING variant takes thread_idx for the per-thread
+                // atomic counter side-effects. The return value is unused.
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        }
+        completed_this_turn++;
+    }
+
+#if PTO2_PROFILING
+    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
+    // {start, end, task_token_raw}, host resolves func_id/core_type from
+    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
+    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
+    // timestamps via complete_task.
+    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+#if PTO2_SCHED_PROFILING
+        uint64_t t_perf_start = get_sys_cnt_aicpu();
+#endif
+
+        if (l2_swimlane_aicpu_complete_task(
+                core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), dispatch_ts, finish_ts
+            ) != 0) {
+            LOG_ERROR(
+                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
+                static_cast<uint64_t>(slot_state.task->task_id.raw)
+            );
+        }
+#if PTO2_SCHED_PROFILING
+        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
+#endif
+    }
+#endif
+
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) {
+        // Slot key must be the 32-bit register token AICore wrote into
+        // dual_issue_slots[task_id & 1].task_id (= DATA_MAIN_BASE value).
+        // task_id.raw is the full PTO2 (ring_id<<32|local_id) encoding —
+        // matching on that would never hit. Pass the PTO2 id separately
+        // for the PmuRecord.
+        pmu_aicpu_complete_record(
+            core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), slot_state.task->task_id.raw,
+            slot_state.task->kernel_id[static_cast<int32_t>(subslot)], hank[core_id].core_type
+        );
+    }
+#endif
+}
+
+// Promote pending slot data to running slot. Clears pending fields.
+void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
+    core.running_slot_state = core.pending_slot_state;
+    core.running_reg_task_id = core.pending_reg_task_id;
+    core.running_subslot = core.pending_subslot;
+#if PTO2_PROFILING
+    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
+#endif
+    core.pending_slot_state = nullptr;
+    core.pending_reg_task_id = AICPU_TASK_INVALID;
+}
+
+// Clear running slot (core becomes idle).
+void SchedulerContext::clear_running_slot(CoreExecState &core) {
+    core.running_slot_state = nullptr;
+    core.running_reg_task_id = AICPU_TASK_INVALID;
+}
+
+void SchedulerContext::check_running_cores_for_completion(
+    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+    PTO2LocalReadyBuffer *local_bufs
+) {
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto running_core_states = tracker.get_all_running_cores();
+    while (running_core_states.has_value()) {
+        int32_t bit_pos = running_core_states.pop_first();
+        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+        CoreExecState &core = core_exec_states_[core_id];
+
+        // --- Judgment phase: read register, derive transition ---
+        // Use the precomputed cond_ptr (resolved once in handshake) to skip
+        // the reg_offset switch and reg_addr addition on every poll.
+        uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
+        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
+        // rmb() pins any AICore-published cacheable reads downstream of the
+        // FIN observation. Replaces the post-`__sync_synchronize` that the
+        // old read_reg() helper carried implicitly.
+        rmb();
+        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled) {
+            l2_swimlane.complete_probe_count++;
+        }
+#endif
+
+        SlotTransition t =
+            decide_slot_transition(reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id);
+        if (!t.matched) continue;
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
+            l2_swimlane.complete_hit_count++;
+        }
+#endif
+
+#if PTO2_PROFILING
+        // Capture finish_ts at the FIN observation point — right after rmb()
+        // pinned cacheable AICore reads downstream of the register load, and
+        // BEFORE any fanin / deferred-release work. Anything later would
+        // charge AICPU completion-processing cost to (end → finish).
+        uint64_t finish_ts = 0;
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
+            finish_ts = get_sys_cnt_aicpu();
+        }
+#endif
+
+        // --- Apply phase: execute actions based on transition ---
+
+        // 1. Complete finished tasks (capture pointers before modifying core state)
+        if (t.pending_done) {
+            complete_slot_task(
+                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.pending_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+        if (t.running_done) {
+            complete_slot_task(
+                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.running_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+
+        // 2. Update slot data
+        if (t.running_freed) {
+            if (core.pending_slot_state != nullptr && !t.pending_done) {
+                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+            } else {
+                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                if (t.pending_done) {
+                    // Case 1: pending FIN observed directly -- clear stale pending fields.
+                    // Without this, pending_reg_task_id retains a stale value that blocks
+                    // clear_pending_occupied and permanently degrades pipelining.
+                    core.pending_slot_state = nullptr;
+                    core.pending_reg_task_id = AICPU_TASK_INVALID;
+                }
+            }
+        }
+
+        // 3. Update tracker bitmap
+        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
+        if (is_idle) {
+            tracker.change_core_state(bit_pos);       // Mark idle
+            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
+            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
+            // when no pending task is currently held. Otherwise pending slot is occupied
+            // by a pre-loaded task and must stay protected.
+            tracker.clear_pending_occupied(bit_pos);
+        }
+
+        // 4. Progress signal (only when running task completes)
+        if (t.running_done) {
+            made_progress = true;
+        }
+    }
+}
+
+// =============================================================================
+// sync_start drain protocol
+// =============================================================================
+
+// Take ownership of slot_state and signal all threads to enter drain mode.
+// Returns true if this thread won the CAS and owns the drain slot.
+// Returns false if another thread already holds drain; caller must re-push slot_state.
+//
+// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and
+// reset election flag, then release-store block_num.  Other threads acquire-load
+// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible.
+bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
+    int32_t expected = 0;
+    if (!drain_state_.sync_start_pending.compare_exchange_strong(
+            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
+        )) {
+        return false;  // Another thread already holds the drain slot.
+    }
+    // We own the drain slot.  Store the task and reset election flag before making it visible.
+    drain_state_.pending_task.store(slot_state, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    // Release store: all stores above are now visible to any thread that
+    // acquire-loads sync_start_pending and sees block_num > 0.
+    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
+    return true;
+}
+
+// Count total available resources across all scheduler threads for a given shape.
+int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
+    int32_t total = 0;
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (shape == PTO2ResourceShape::MIX) {
+            total += core_trackers_[t].count_mix_running_clusters(core_mask);
+        } else {
+            total += core_trackers_[t].get_idle_core_offset_states(shape).count();
+        }
+    }
+    return total;
+}
+
+// Drain worker: dispatch all blocks in one pass across all threads' trackers.
+// Called only when global resources >= block_num, so one pass always suffices.
+// All other threads are spinning -- the drain worker has exclusive tracker access.
+void SchedulerContext::drain_worker_dispatch(Runtime *runtime, int32_t block_num) {
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (!slot_state) {
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    uint8_t core_mask = slot_state->active_mask.core_mask();
+
+    for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) {
+        auto valid = (shape == PTO2ResourceShape::MIX) ?
+                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
+                         core_trackers_[t].get_idle_core_offset_states(shape);
+        while (valid.has_value() && slot_state->next_block_idx < block_num) {
+            dispatch_block(runtime, t, valid.pop_first(), *slot_state, shape, false, slot_state->next_block_idx);
+            slot_state->next_block_idx++;
+        }
+    }
+
+    // All blocks dispatched -- clear drain state.
+    // Release fence ensures tracker mutations are visible to threads that
+    // acquire-load sync_start_pending == 0 and resume normal operation.
+    std::atomic_thread_fence(std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+}
+
+// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+//
+// Protocol (single-stage ack barrier):
+//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
+//      until all ack bits are set.
+//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
+//   2. Election: one thread wins the CAS and becomes the drain worker.
+//      If resources are insufficient, reset ack/election fields and return --
+//      all threads resume completion polling to free running cores, then retry.
+//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
+//      Non-elected threads spin-wait until sync_start_pending == 0.
+//      During dispatch the elected thread has exclusive tracker access.
+void SchedulerContext::handle_drain_mode(Runtime *runtime, int32_t thread_idx) {
+    // Every spin in this function honors is_completed(): once the run latches
+    // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave
+    // the dispatch loop and stop participating in the drain. A thread parked in a
+    // drain spin would then wait forever for acks / a gate-open that can no longer
+    // arrive -- the AICPU watchdog never fires here because these spins live
+    // outside the dispatch loop's wall-clock budget, so the hang escalates straight
+    // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on
+    // completed_ is always safe: any pending sync_start task is either already
+    // dispatched (a stale re-popped slot) or moot under teardown, and deinit()
+    // resets drain_state_ before the next run, so leaving it dirty is harmless.
+    // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
+    int32_t block_num;
+    do {
+        if (is_completed()) return;
+        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+    } while (block_num < 0);
+    if (block_num == 0) return;
+
+    uint32_t all_acked = (1u << active_sched_threads_) - 1;
+
+    // Ack barrier -- signal this thread has stopped dispatch.
+    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+    // Spin until all threads have acked.
+    // If our bit is cleared while waiting, elected reset due to insufficient resources.
+    while (true) {
+        if (is_completed()) return;
+        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+        if ((ack & all_acked) == all_acked) break;
+        if ((ack & (1u << thread_idx)) == 0) return;
+        SPIN_WAIT_HINT();
+    }
+
+    // Election -- exactly one thread wins the CAS.
+    int32_t expected = 0;
+    drain_state_.drain_worker_elected.compare_exchange_strong(
+        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
+    );
+
+    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
+        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            if (is_completed()) return;
+            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+        return;
+    }
+
+    // Elected: check if global resources are sufficient.
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (slot_state == nullptr) {
+        // pending_task is observed null only when a concurrent drain completion
+        // already cleared it (drain_worker_dispatch nulls it before reopening the
+        // gate). That drain is done and this is a stale-elected thread, so just
+        // release the election lock and return. Do NOT clear drain_ack_mask or
+        // sync_start_pending: a *new* drain run may already be active and
+        // accumulating acks, and zeroing them would corrupt it into a hang.
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    int32_t available = count_global_available(shape, slot_state->active_mask.core_mask());
+
+    if (available < block_num) {
+        // Insufficient resources -- reset drain fields so threads can resume
+        // completion polling to free running cores, then retry.
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+
+    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+    drain_worker_dispatch(runtime, block_num);
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h
new file mode 100644
index 000000000..8aa8d0034
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_CONTEXT_H
+#define SCHEDULER_CONTEXT_H
+
+#include "common/l2_swimlane_profiling.h"
+#include "common/unified_log.h"
+#include "scheduler_types.h"
+
+#include "scheduler/pto_scheduler.h"
+
+#include "aicore_completion_mailbox.h"
+
+// These macros are defined in runtime.h, but we cannot include it here
+// (it pulls in Handshake which we only forward-declare).  Mirror the
+// authoritative values so the class layout compiles standalone.
+#ifndef RUNTIME_MAX_WORKER
+#define RUNTIME_MAX_WORKER 108
+#endif
+#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_FUNC_ID 1024
+#endif
+
+// Forward declarations — avoid pulling in full headers for pointer/reference params.
+class Runtime;
+struct Handshake;
+struct PTO2Runtime;
+
+/**
+ * SchedulerContext: owns all scheduler-side state and methods.
+ *
+ * Held as a member of AicpuExecutor (sched_ctx_).  The single public entry
+ * point is resolve_and_dispatch(), called once per scheduler thread.
+ *
+ * All dispatch/completion/drain/cold-path logic is implemented as private
+ * member methods, split across three .cpp files by responsibility:
+ *   - scheduler_completion.cpp  (completion polling, drain protocol)
+ *   - scheduler_cold_path.cpp   (exit checks, stall diagnostics, profiling)
+ *   - scheduler_dispatch.cpp    (task dispatch loop and helpers)
+ */
+class SchedulerContext {
+public:
+    // =========================================================================
+    // Lifecycle
+    // =========================================================================
+
+    // Initialize scheduler state from the given runtime and thread layout.
+    // - Discovers cores via handshake_all_cores()
+    // - Assigns cores to scheduler threads
+    // - Resets task counters, payloads, per-core GlobalContext
+    // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
+    // - Captures AICore-register base (consumed by handshake_all_cores())
+    // Returns 0 on success, negative on failure (handshake / assignment error).
+    int32_t
+    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
+
+    // Reset all SchedulerContext-owned state to its post-construction defaults.
+    // Called by AicpuExecutor::deinit() during per-run teardown.
+    void deinit();
+
+    // =========================================================================
+    // Per-thread execution entry points (called by AicpuExecutor::run)
+    // =========================================================================
+
+    // Main scheduler thread entry: poll completion + dispatch ready tasks.
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
+
+    // Shutdown AICore registers for this thread's assigned cores.
+    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
+    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
+    int32_t shutdown(int32_t thread_idx);
+
+    // Run all post-orchestration scheduler bookkeeping:
+    //  - publishes core assignments to the perf collector (PTO2_PROFILING)
+    //  - latches submitted task count from PTO2 shared memory
+    //  - folds inline_completed_tasks into completed_tasks_
+    //  - flips orchestrator_done_ and triggers core transition
+    //    (skipped on fatal error — emergency_shutdown runs instead)
+    // Callers must invoke rt_orchestration_done(rt) before this — that
+    // step belongs to the orchestrator lifecycle, not the scheduler.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
+
+    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
+    // mode where rt is created by the orchestrator thread after init().
+    void bind_runtime(PTO2Runtime *rt);
+
+    // =========================================================================
+    // State queries / external synchronization points
+    // =========================================================================
+
+    int32_t aic_count() const { return aic_count_; }
+    int32_t aiv_count() const { return aiv_count_; }
+    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
+    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
+
+    // Block until the first scheduler thread has finished one-time PTO2 init.
+    // Called by the orchestrator thread in device-orch mode.
+    void wait_init_complete() const;
+
+private:
+    // =========================================================================
+    // State
+    // =========================================================================
+
+    // --- Scheduler binding & per-core runtime state ---
+    alignas(64) PTO2SchedulerState *sched_{nullptr};
+    PTO2Runtime *rt_{nullptr};
+
+    // Per-core execution state, indexed by core_id (= worker_id)
+    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
+
+    // Cluster-ordered core trackers, one per scheduler thread
+    CoreTracker core_trackers_[MAX_AICPU_THREADS];
+
+    // Per-core dispatch payload storage: dual-buffer for pipelining.
+    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
+    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // Per-core deferred-completion software registration storage.  This has
+    // the same runtime lifetime as payload_per_core_, but is kept out of the
+    // dispatch payload so normal task dispatch layout and cache footprint stay
+    // unchanged.
+    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // sync_start drain coordination
+    SyncStartDrainState drain_state_;
+
+#if PTO2_PROFILING
+    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
+    // Cached once at init() from get_l2_swimlane_level(), AFTER
+    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // --- Task-execution tracking ---
+    std::atomic<int32_t> completed_tasks_{0};
+    int32_t total_tasks_{0};
+    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
+    // volatile prevents the compiler from hoisting the load out of spin loops.
+    volatile bool orchestrator_done_{false};
+    std::atomic<bool> completed_{false};
+    uint64_t *func_id_to_addr_{nullptr};
+
+    // --- Core-transition coordination ---
+    std::atomic<bool> transition_requested_{false};
+    std::atomic<int32_t> wait_reassign_{0};
+    std::atomic<bool> reassigned_{false};
+
+    // --- Thread/core configuration ---
+    int32_t active_sched_threads_{0};
+    int32_t sched_thread_num_{0};
+    bool orch_to_sched_{false};
+    int32_t aicpu_thread_num_{0};
+    int32_t cores_total_num_{0};
+
+    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
+    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aic_count_{0};
+    int32_t aiv_count_{0};
+
+#if PTO2_PROFILING
+    // Physical core ids keyed by logical worker id. Populated by
+    // handshake_all_cores() and handed to pmu_aicpu_init() so the platform
+    // can resolve per-core PMU MMIO bases. Only needed when PTO2_PROFILING=1
+    // — without it, PMU is compiled out and core_exec_states_ already
+    // carries the field.
+    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
+#endif
+
+    // Platform AICore-register base array (set by AicpuExecutor before init()).
+    uint64_t regs_{0};
+
+    // --- One-time init coordination ---
+    std::atomic<bool> init_claimed_{false};
+    std::atomic<bool> init_complete_{false};
+
+    // =========================================================================
+    // Core management (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
+    int32_t handshake_all_cores(Runtime *runtime);
+
+    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
+    bool assign_cores_to_threads();
+
+    // Re-distribute all cores across all threads after orchestration completes.
+    void reassign_cores_for_all_threads();
+
+    // Emergency shutdown: broadcast exit signal to every handshake'd core and
+    // deinit their AICore register blocks. Idempotent.
+    void emergency_shutdown(Runtime *runtime);
+
+    // =========================================================================
+    // Dispatch (scheduler_dispatch.cpp)
+    // =========================================================================
+
+    static const char *shape_name(PTO2ResourceShape shape);
+
+    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
+    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
+    // convention already established in the stall log family.
+    static inline const char *subslot_name(PTO2SubtaskSlot s) {
+        switch (s) {
+        case PTO2SubtaskSlot::AIC:
+            return "aic";
+        case PTO2SubtaskSlot::AIV0:
+            return "aiv0";
+        case PTO2SubtaskSlot::AIV1:
+            return "aiv1";
+        }
+        return "?";
+    }
+
+    int pop_ready_tasks_batch(
+        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
+        int max_count
+    );
+
+    void build_payload(
+        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        const AsyncCtx &async_ctx, int32_t block_idx
+    );
+
+    void dispatch_subtask_to_core(
+        Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state,
+        PTO2SubtaskSlot subslot, bool to_pending, int32_t block_idx
+    );
+
+    void dispatch_mix_block_to_cluster(
+        Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, bool to_pending,
+        int32_t block_idx
+    );
+
+    void dispatch_block(
+        Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state,
+        PTO2ResourceShape shape, bool to_pending, int32_t block_idx
+    );
+
+    void dispatch_shape(
+        Runtime *runtime, int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase,
+        PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress,
+        bool &try_pushed
+    );
+
+    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
+    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
+    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
+    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
+    // skipped for the whole pass but MIX-PENDING still runs.
+    //
+    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
+    // current pass only. The next loop iteration re-evaluates after Phase 1
+    // completion polling and the global MIX queue draining (here or on any
+    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
+    // not unbounded — once mix completes on at least one cluster, the next
+    // pass either drains the residual or admits AIC/AIV.
+    void dispatch_ready_tasks(
+        Runtime *runtime, int32_t thread_idx, CoreTracker &tracker,
+        PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress,
+        bool &try_pushed
+    );
+
+    // Returns true if any *other* scheduler thread currently has an idle core
+    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
+    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
+    // rationale and the safety argument against the drain worker.
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
+
+    // True if mix tasks remain anywhere this thread could see them: the caller's
+    // MIX local LIFO stack or the global MIX ready queue. Approximate —
+    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
+    // positions with std::memory_order_relaxed and may interleave with concurrent
+    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
+    // loads — that one isn't on this path. A stale read here causes at most one
+    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
+        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
+    }
+
+    // =========================================================================
+    // Completion & drain (scheduler_completion.cpp)
+    // =========================================================================
+
+    static SlotTransition
+    decide_slot_transition(int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id);
+
+    void complete_slot_task(
+        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
+        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+        ,
+        uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+    );
+
+    static void promote_pending_to_running(CoreExecState &core);
+    static void clear_running_slot(CoreExecState &core);
+
+    void check_running_cores_for_completion(
+        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+    );
+
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
+    int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask);
+    void drain_worker_dispatch(Runtime *runtime, int32_t block_num);
+    void handle_drain_mode(Runtime *runtime, int32_t thread_idx);
+
+    // =========================================================================
+    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    __attribute__((noinline, cold)) LoopAction
+    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
+
+    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
+
+    __attribute__((noinline, cold)) LoopAction
+    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
+
+    __attribute__((noinline, cold)) void
+    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
+
+    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
+        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+    );
+
+    // Reverse lookup: given a global core_id, find which scheduler thread's
+    // tracker owns it. Returns -1 if not found. Linear scan — only used on
+    // the cold diagnostic path.
+    int32_t find_core_owner_thread(int32_t core_id) const;
+
+    // Does this thread own any core with a RUNNING task (running_slot_state set)?
+    // Gates the scheduler timeout fatal latch: a thread without an owned
+    // RUNNING task has no first-hand evidence of a stuck dispatch and must
+    // not declare global fatal on its own idle observation. The thread that
+    // does own the stuck task will reach the budget on its own polls and
+    // latch with valid evidence (or recover when the COND register flips).
+    bool self_owns_running_task(int32_t thread_idx) const;
+
+    // Does *any* scheduler thread own a RUNNING task? Used as the second
+    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
+    // owns RUNNING work AND tasks remain incomplete, the system is in a
+    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
+    // ownerless idle threads are the only observers — let one of them latch.
+    bool no_thread_owns_running_task() const;
+
+    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
+        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+        int32_t last_progress_count
+#if PTO2_PROFILING
+        ,
+        uint64_t sched_start_ts
+#endif
+    );
+
+#if PTO2_PROFILING
+    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
+#endif
+
+    // =========================================================================
+    // Small inline helpers
+    // =========================================================================
+
+    uint64_t get_function_bin_addr(int func_id) const {
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
+            return 0;
+        }
+        return func_id_to_addr_[func_id];
+    }
+};
+
+#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp
new file mode 100644
index 000000000..d3fbbde5d
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp
@@ -0,0 +1,1020 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+
+#include "common.h"  // debug_assert
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "callable.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+// =============================================================================
+// Dispatch helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
+}
+
+const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {
+    switch (shape) {
+    case PTO2ResourceShape::AIC:
+        return "AIC";
+    case PTO2ResourceShape::AIV:
+        return "AIV";
+    case PTO2ResourceShape::MIX:
+        return "MIX";
+    case PTO2ResourceShape::DUMMY:
+        return "DUMMY";
+    }
+    return "UNKNOWN";
+}
+
+bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
+    // Cross-thread read of peer trackers without explicit synchronization. The
+    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
+    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
+    // value is consumed only as a scheduling *hint* — a stale read at worst
+    // causes one missed/extra pending dispatch, corrected on the next iteration.
+    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
+    // barrier (all peers spin out of the dispatch path before any tracker
+    // mutation), so this routine is never racing the drain worker.
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (t == self_thread_idx) continue;
+        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int SchedulerContext::pop_ready_tasks_batch(
+    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#if PTO2_SCHED_PROFILING
+    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
+    uint64_t t_pop_start = get_sys_cnt_aicpu();
+    int count = sched_->get_ready_tasks_batch(
+        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
+    );
+    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
+#else
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        if (count > 0) {
+            l2_swimlane.pop_hit += count;
+        } else {
+            l2_swimlane.pop_miss++;
+        }
+    }
+#else
+    (void)thread_idx;
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif
+    return count;
+}
+
+void SchedulerContext::build_payload(
+    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+    const AsyncCtx &async_ctx, int32_t block_idx
+) {
+    int32_t slot_idx = static_cast<int32_t>(subslot);
+    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
+    dispatch_payload.function_bin_addr = callable->resolved_addr();
+    auto &payload = *slot_state.payload;
+    int n = 0;
+    for (int32_t i = 0; i < payload.tensor_count; i++) {
+        dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
+    }
+    for (int32_t i = 0; i < payload.scalar_count; i++) {
+        dispatch_payload.args[n++] = payload.scalars[i];
+    }
+    dispatch_payload.local_context.s_block_idx = block_idx;
+    dispatch_payload.local_context.s_block_num = slot_state.logical_block_num;
+    dispatch_payload.local_context.async_ctx = async_ctx;
+    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
+    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
+}
+
+void SchedulerContext::dispatch_subtask_to_core(
+    Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+    bool to_pending, int32_t block_idx
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto core_id = tracker.get_core_id_by_offset(core_offset);
+    (void)runtime;
+    CoreExecState &core_exec_state = core_exec_states_[core_id];
+    core_exec_state.dispatch_seq++;
+    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    static_assert(
+        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
+    );
+    if (reg_task_id >= AICORE_EXIT_SIGNAL) {
+        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
+        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    }
+
+    uint32_t buf_idx = reg_task_id & 1u;
+    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+    deferred_slab->count = 0;
+    deferred_slab->error_code = PTO2_ERROR_NONE;
+    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+    if (to_pending) {
+        core_exec_state.pending_subslot = subslot;
+        core_exec_state.pending_slot_state = &slot_state;
+        core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
+    } else {
+        core_exec_state.running_subslot = subslot;
+        core_exec_state.running_slot_state = &slot_state;
+        core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
+        tracker.change_core_state(core_offset);
+    }
+
+    LOG_DEBUG(
+        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
+        " core_offset=%d core_id=%d reg_task_id=%u",
+        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
+        static_cast<int64_t>(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
+        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
+        core_offset, core_id, reg_task_id
+    );
+
+    // AICore buffer rotation lives on the dispatch path: count this dispatch
+    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
+    // boundary. The completion-before-dispatch invariant makes this race-free.
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
+        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
+    }
+#endif
+
+    // Publish task data (slot_state / args writes done above) before AICore
+    // can observe the dispatched task_id. ARM64 needs an explicit store-store
+    // fence across Normal-cacheable -> Device-nGnRnE; the old write_reg()
+    // helper provided this implicitly via __sync_synchronize.
+    wmb();
+
+    // Capture dispatch timestamp at the latest possible moment — after wmb,
+    // immediately before the DATA_MAIN_BASE write.
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+        uint64_t dispatch_ts = get_sys_cnt_aicpu();
+        if (to_pending) {
+            core_exec_state.pending_dispatch_timestamp = dispatch_ts;
+        } else {
+            core_exec_state.running_dispatch_timestamp = dispatch_ts;
+        }
+    }
+#endif
+
+    write_reg(core_exec_state.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(reg_task_id));
+    tracker.set_pending_occupied(core_offset);
+}
+
+void SchedulerContext::dispatch_mix_block_to_cluster(
+    Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, bool to_pending,
+    int32_t block_idx
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    uint8_t cmask = slot_state.active_mask.core_mask();
+    if (cmask & PTO2_SUBTASK_MASK_AIC) {
+        dispatch_subtask_to_core(
+            runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC,
+            to_pending, block_idx
+        );
+    }
+    if (cmask & PTO2_SUBTASK_MASK_AIV0) {
+        dispatch_subtask_to_core(
+            runtime, thread_idx, tracker.get_aiv0_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV0,
+            to_pending, block_idx
+        );
+    }
+    if (cmask & PTO2_SUBTASK_MASK_AIV1) {
+        dispatch_subtask_to_core(
+            runtime, thread_idx, tracker.get_aiv1_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV1,
+            to_pending, block_idx
+        );
+    }
+}
+
+void SchedulerContext::dispatch_block(
+    Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
+    bool to_pending, int32_t block_idx
+) {
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
+            [](ActiveMask active_mask, int raw_subtask_id) {
+                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+            },
+            [this](int32_t func_id) {
+                return get_function_bin_addr(func_id);
+            }
+        );
+    }
+#endif
+    if (shape == PTO2ResourceShape::MIX) {
+        dispatch_mix_block_to_cluster(runtime, thread_idx, core_offset, slot_state, to_pending, block_idx);
+    } else if (shape == PTO2ResourceShape::AIC) {
+        dispatch_subtask_to_core(
+            runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx
+        );
+    } else {
+        dispatch_subtask_to_core(
+            runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx
+        );
+    }
+#if PTO2_PROFILING
+    sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask());
+#endif
+}
+
+void SchedulerContext::dispatch_shape(
+    Runtime *runtime, int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase,
+    PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
+) {
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    if (entered_drain) return;
+
+    bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
+    bool is_mix = (shape == PTO2ResourceShape::MIX);
+    auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
+    if (!cores.has_value()) return;
+
+    while (cores.has_value() && !entered_drain) {
+        int want = cores.count();
+        PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
+        int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
+        if (got == 0) break;
+
+        bool dispatched_any = false;
+        for (int bi = 0; bi < got; bi++) {
+            PTO2TaskSlotState *slot_state = batch[bi];
+            CoreTracker::BitStates selected_mix_clusters(0ULL);
+
+            if (is_mix) {
+                auto candidates = cores;
+                uint8_t cmask = slot_state->active_mask.core_mask();
+                auto wanted = is_pending ? CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING;
+                while (candidates.has_value()) {
+                    int32_t cluster_offset = candidates.pop_first();
+                    if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) {
+                        selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset);
+                    }
+                }
+                if (!selected_mix_clusters.has_value()) {
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    continue;
+                }
+            }
+
+            if (slot_state->active_mask.requires_sync_start()) {
+                if (is_pending) {
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    continue;
+                }
+                int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
+                if (available < slot_state->logical_block_num) {
+                    if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    }
+                    for (int rem = bi + 1; rem < got; rem++) {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
+                    }
+                    entered_drain = true;
+                    break;
+                }
+            }
+
+            if (!cores.has_value()) {
+                sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                break;
+            }
+
+            dispatched_any = true;
+            try_pushed = true;
+#if PTO2_SCHED_PROFILING
+            uint64_t t_setup_start = get_sys_cnt_aicpu();
+#endif
+            // Claim a contiguous range of blocks, hand the slot back to the
+            // ready queue immediately, then perform the expensive dispatches.
+            // This lets other schedulers concurrently claim and dispatch the
+            // remaining blocks of the same SPMD task instead of spinning while
+            // this thread fills all its own cores.  Only local `start + b` is
+            // read after the push -- `next_block_idx` may already be advanced
+            // by another scheduler that popped the slot.
+            int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
+            int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
+            int32_t claim = std::min(available, remaining);
+            int32_t start = slot_state->next_block_idx;
+            slot_state->next_block_idx += claim;
+
+            if (slot_state->next_block_idx < slot_state->logical_block_num) {
+                sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+            }
+
+            for (int32_t b = 0; b < claim; b++) {
+                auto core_offset = is_mix ? selected_mix_clusters.pop_first() : cores.pop_first();
+                if (is_mix) {
+                    cores.clear_bit(core_offset);
+                }
+                dispatch_block(runtime, thread_idx, core_offset, *slot_state, shape, is_pending, start + b);
+            }
+            made_progress = true;
+#if PTO2_SCHED_PROFILING
+            l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
+#endif
+        }
+
+        if (!dispatched_any) break;
+
+        if (!cores.has_value()) {
+            cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
+        }
+    }
+}
+
+void SchedulerContext::dispatch_ready_tasks(
+    Runtime *runtime, int32_t thread_idx, CoreTracker &tracker,
+    PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress, bool &try_pushed
+) {
+    using Phase = CoreTracker::DispatchPhase;
+    constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
+
+    // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle
+    // through this 2-elem array, with order toggled by thread parity for
+    // shape-level load balancing across threads.
+    static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
+        {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
+        {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
+    };
+    const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
+
+    auto flush_local_bufs = [&]() {
+        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+            auto &lb = local_bufs[s];
+            if (lb.count > 0) {
+                sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
+                lb.count = 0;
+            }
+        }
+    };
+    // Every return path below must flush; wrap in RAII so we cannot forget.
+    // The mid-function flush between IDLE and PENDING is still called
+    // explicitly — guard only covers exit.
+    struct FlushGuard {
+        decltype(flush_local_bufs) &flush_fn;
+        ~FlushGuard() { flush_fn(); }
+    } flush_guard{flush_local_bufs};
+
+    bool entered_drain = false;
+
+    // ===== IDLE stage =====
+    dispatch_shape(
+        runtime, thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain,
+        made_progress, try_pushed
+    );
+    if (entered_drain) return;
+
+    // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass.
+    // MIX-PENDING below still runs — that is the core of "mix strict priority":
+    // pending slots are spent on mix before AIC/AIV get any chance.
+    bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
+
+    if (!skip_aic_aiv) {
+        for (int i = 0; i < 2; i++) {
+            PTO2ResourceShape s = aic_aiv[i];
+            dispatch_shape(
+                runtime, thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain,
+                made_progress, try_pushed
+            );
+            if (entered_drain) return;
+        }
+    }
+
+    // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
+    // peer-thread reads see the IDLE-stage release_fanin output.
+    flush_local_bufs();
+
+    if (pmu_active) return;
+
+    // ===== PENDING stage =====
+    // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that
+    // peer's next IDLE-MIX iteration will pull the mix task from the global
+    // queue (already flushed above) at lower latency than us pre-loading a
+    // pending slot here. Forward progress for MIX is preserved: at least one
+    // thread will run MIX-IDLE next pass and consume the residual.
+    //
+    // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain
+    // via pending slots on this thread when no peer is idle.
+    if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) {
+        dispatch_shape(
+            runtime, thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain,
+            made_progress, try_pushed
+        );
+        if (entered_drain) return;
+    }
+
+    // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
+    // it set; otherwise, escalate iff PENDING-MIX left residual.
+    if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) {
+        skip_aic_aiv = true;
+    }
+
+    // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin
+    // during in-flight completions; flush_guard ensures these don't carry
+    // across to the next iteration's IDLE stage.
+    if (skip_aic_aiv) return;
+
+    // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
+    // will pull from the global queue on its next IDLE pass.
+    for (int i = 0; i < 2; i++) {
+        PTO2ResourceShape s = aic_aiv[i];
+        if (has_idle_in_other_threads(thread_idx, s)) continue;
+        dispatch_shape(
+            runtime, thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain,
+            made_progress, try_pushed
+        );
+        if (entered_drain) return;
+    }
+}
+
+// =============================================================================
+// Main scheduler dispatch loop
+// =============================================================================
+
+int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
+
+    PTO2SharedMemoryHeader *header = sched_->sm_header;
+    if (!header) {
+        LOG_ERROR("PTO2 dispatch: header is null");
+        return -1;
+    }
+    LOG_INFO_V0(
+        "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast<void *>(header),
+        static_cast<uint64_t>(header->rings[0].task_descriptors_offset),
+        static_cast<uint64_t>(header->rings[0].task_window_size)
+    );
+
+    Handshake *hank = static_cast<Handshake *>(runtime->workers);
+    LOG_INFO_V0(
+        "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast<void *>(hank),
+        static_cast<uint64_t>(header->rings[0].task_window_size)
+    );
+
+    // One-time init: assign perf buffers (one thread does it; others wait).
+    // l2_swimlane_aicpu_init / l2_swimlane_aicpu_init_phase already ran eagerly in
+    // SchedulerContext::init() so the orchestrator thread can read the
+    // promoted g_l2_swimlane_level before caching it on rt->orchestrator. Only
+    // dump_tensor / pmu init remain dispatch-time because they depend on
+    // handshake-derived core IDs / counts.
+    if (!init_claimed_.exchange(true, std::memory_order_acq_rel)) {
+        LOG_INFO_V0("Thread %d: doing one-time init", thread_idx);
+
+#if PTO2_PROFILING
+        if (is_dump_args_enabled()) {
+            dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : sched_thread_num_);
+        }
+        if (is_pmu_enabled()) {
+            pmu_aicpu_init(physical_core_ids_, cores_total_num_);
+            LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
+        }
+#endif
+
+        LOG_INFO_V0("Thread %d: one-time init done", thread_idx);
+        init_complete_.store(true, std::memory_order_release);
+    } else {
+        while (!init_complete_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
+        }
+    }
+
+    LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, tracker.core_num());
+    int32_t cur_thread_completed = 0;
+    // Non-zero once a scheduler-hang timeout latches; returned in place of the
+    // completed count so the caller still sees the negative error rc while the
+    // shared end-of-loop flush below runs.
+    int32_t timeout_rc = 0;
+    int32_t idle_iterations = 0;
+    int32_t last_progress_count = 0;
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    l2_swimlane.reset();
+    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
+#endif
+
+    constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
+    PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
+    PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
+    for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
+    }
+    PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
+    int32_t deferred_release_count = 0;
+
+    bool cores_released = false;
+
+#if PTO2_PROFILING
+    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
+#endif
+
+#if PTO2_PROFILING
+    // Queue-depth snapshot carried across the iteration boundary: each phase
+    // emit consumes (phase_start_*) and refreshes them with its own end snapshot
+    // so the next phase's "at_start" equals the previous phase's "at_end".
+    //
+    // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX.
+    //
+    // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer)
+    // is a single int read on a register-cached stack — free. Shared depth
+    // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines
+    // that all peer sched threads also write to (enqueue_pos and dequeue_pos
+    // bounce on every flush_local_bufs + every pop). With both phases emitting
+    // per iter that's 12 cross-core loads × thousands of iters per run, a
+    // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared
+    // snapshot, refreshed at most once per iteration. The complete-emit and
+    // dispatch-emit in the same iter both reuse the same shared sample; the
+    // big transitions (local→shared flush) still show up across iter boundaries.
+    static_assert(
+        L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES,
+        "queue snapshot width must match runtime resource shape count"
+    );
+    int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    bool iter_shared_sampled = false;
+    auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
+        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+            local_out[s] = static_cast<int16_t>(local_bufs[s].count);
+        }
+    };
+    auto get_or_sample_shared = [&]() -> const int16_t * {
+        if (!iter_shared_sampled) {
+            // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE
+            // is in the low thousands today but could grow with platform
+            // scaling — without clamp, sizes above 32767 wrap to negatives
+            // and silently corrupt the snapshot.
+            constexpr size_t kMax = static_cast<size_t>(std::numeric_limits<int16_t>::max());
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                const size_t qsize = sched_->ready_queues[s].size();
+                iter_shared_snapshot[s] = static_cast<int16_t>(std::min(qsize, kMax));
+            }
+            iter_shared_sampled = true;
+        }
+        return iter_shared_snapshot;
+    };
+    auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES],
+                                 int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
+        capture_local_snapshot(local_out);
+        const int16_t *shared_cached = get_or_sample_shared();
+        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++)
+            shared_out[s] = shared_cached[s];
+    };
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        capture_phase_end(phase_start_local, phase_start_shared);
+    }
+#endif
+
+    // Wall-clock timestamp of the last completed task on this thread.
+    // Updated on made_progress; consulted to decide whether the wall-clock
+    // budget for declaring a scheduler hang has elapsed. Initialized to
+    // "now" so the first budget cycle starts when this thread does, not at
+    // an undefined value.
+    uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
+    while (true) {
+        if (completed_.load(std::memory_order_acquire)) {
+            break;
+        }
+        bool made_progress = false;
+#if PTO2_PROFILING
+        CYCLE_COUNT_START();
+        l2_swimlane.sched_loop_count++;
+        uint64_t _t0_phase = _t0;
+        // Per-iter lazy shared-queue snapshot: first phase emit in this iter
+        // pays the atomic-load cost, subsequent emits in the same iter reuse
+        // the cached value. Reset here so we re-sample exactly once per iter
+        // (or skip entirely on iters with no phase emit).
+        iter_shared_sampled = false;
+#endif
+        int32_t task_count = 0;
+        if (!tracker.has_any_running_cores()) {
+            LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count);
+            if (action == LoopAction::BREAK_LOOP) break;
+        }
+
+        if (!cores_released && orch_to_sched_) {
+            LoopAction action = handle_core_transition(cores_released);
+            if (action == LoopAction::BREAK_LOOP) break;
+        }
+
+#if PTO2_PROFILING
+        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+#endif
+
+        // Phase 1: Check running cores for completion
+        int32_t completed_this_turn = 0;
+
+        bool try_completed = tracker.has_any_running_cores();
+        if (try_completed) {
+            check_running_cores_for_completion(
+                thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress,
+                deferred_release_slot_states, deferred_release_count, local_bufs
+            );
+        }
+        if (completed_this_turn > 0) {
+#if PTO2_SCHED_PROFILING
+            sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed);
+#endif
+            int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
+            int32_t new_total = prev + completed_this_turn;
+            last_progress_count = new_total;
+            if (thread_idx == 0 && task_count > 0) {
+                if (new_total <= PROGRESS_VERBOSE_THRESHOLD ||
+                    new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) {
+                    LOG_INFO_V9(
+                        "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count,
+                        100.0 * new_total / task_count
+                    );
+                }
+            }
+        }
+
+        if (rt_ != nullptr && rt_->aicore_mailbox != nullptr &&
+            (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) {
+            AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(
+                rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count,
+                PTO2_DEFERRED_RELEASE_CAP
+#if PTO2_SCHED_PROFILING
+                ,
+                thread_idx
+#endif
+            );
+            if (poll_result.error_code != PTO2_ERROR_NONE) {
+                int32_t expected = PTO2_ERROR_NONE;
+                header->sched_error_code.compare_exchange_strong(
+                    expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire
+                );
+                completed_.store(true, std::memory_order_release);
+                break;
+            }
+            if (poll_result.completed > 0) {
+#if PTO2_SCHED_PROFILING
+                sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed);
+#endif
+                int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
+                int32_t new_total = prev + poll_result.completed;
+                last_progress_count = new_total;
+                made_progress = true;
+            }
+        }
+
+#if PTO2_PROFILING
+        if (!try_completed) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+        } else {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) {
+                // Local depth is cheap (this thread's own buffer counter).
+                // Shared depth is NOT sampled here: complete's release_fanin
+                // pushes to local_bufs in the fast path (try_push succeeds
+                // until cap=64). Shared only changes on dispatch's flush
+                // path. Carrying phase_start_shared forward as end_shared
+                // is the right answer 99% of the time AND skips three
+                // contended atomic loads per emit.
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_local_snapshot(phase_end_local);
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_complete_count, /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local,
+                    phase_start_shared, phase_end_local, phase_start_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                    // phase_start_shared unchanged — carried forward
+                }
+                _t0_phase = _t1;
+                l2_swimlane.phase_complete_count = 0;
+            }
+        }
+#endif
+
+        bool try_pushed = false;
+
+        // Phase 2 drain check
+        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            handle_drain_mode(runtime, thread_idx);
+            continue;
+        }
+
+        // Phase 3: Drain wiring queue (thread 0 only)
+        int wired = 0;
+        if (thread_idx == 0) {
+            wired = sched_->drain_wiring_queue(orchestrator_done_);
+            if (wired > 0) {
+                made_progress = true;
+#if PTO2_SCHED_PROFILING
+                l2_swimlane.phase_wiring_count += wired;
+#endif
+            }
+        }
+#if PTO2_PROFILING
+        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
+        // Wire outer phase: emit one bar covering this iter's drain_wiring_queue
+        // pass when it wired any tasks. tasks_processed = wired count.
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && wired > 0) {
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_local_snapshot(phase_end_local);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Wire, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                static_cast<uint32_t>(wired), /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local, phase_start_shared,
+                phase_end_local, phase_start_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+            }
+            _t0_phase = _t1;
+        }
+#endif
+
+        // Phase 3b: Drain dummy ready queue (thread 0 only).
+        //
+        // Dependency-only tasks bypass AICore dispatch: they go through the
+        // scheduler so fanin/fanout edges stay consistent, but completion is
+        // signalled inline here. Pinned to thread 0 to avoid cross-thread
+        // races and to keep cache hot near the wiring drain above.
+        if (thread_idx == 0) {
+            constexpr int DUMMY_DRAIN_BATCH = 16;
+            PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
+            int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+#if PTO2_PROFILING
+            // Dummy outer phase: covers handling of all dummies popped this
+            // iter. tasks_processed = dummy_got.
+            uint64_t dummy_outer_t0 =
+                (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+            for (int di = 0; di < dummy_got; di++) {
+                PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
+#if PTO2_SCHED_PROFILING
+                sched_->on_task_complete(dummy_slot, thread_idx, local_bufs);
+#else
+                sched_->on_task_complete(dummy_slot, local_bufs);
+#endif
+                // Dummy tasks have no subtasks to retire and no fanout pre-conditions
+                // beyond their own producers; release self-reference so the slot can
+                // reach CONSUMED once all consumers drain.
+                deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
+                if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
+                    while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                        (void)sched_->on_task_release(
+                            *deferred_release_slot_states[--deferred_release_count], thread_idx
+                        );
+#else
+                        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                    }
+                }
+                int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
+                last_progress_count = prev + 1;
+                cur_thread_completed++;
+            }
+            if (dummy_got > 0) {
+                made_progress = true;
+            }
+#if PTO2_PROFILING
+            if (dummy_outer_t0 != 0) {
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_local_snapshot(phase_end_local);
+                uint64_t dummy_outer_t1 = get_sys_cnt_aicpu();
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Dummy, dummy_outer_t0, dummy_outer_t1,
+                    l2_swimlane.sched_loop_count, static_cast<uint32_t>(dummy_got), /*pop_hit=*/0,
+                    /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                }
+                _t0_phase = dummy_outer_t1;
+            }
+#endif
+        }
+
+        // Phase 4: MIX-strict-priority dispatch with phase-split and
+        // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+        const bool pmu_active = is_pmu_enabled();
+        dispatch_ready_tasks(runtime, thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+
+#if PTO2_PROFILING
+        if (!try_pushed) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+        } else {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) {
+                // Final-drain at loop end emits the trailing-idle tail so
+                // sum-of-deltas == run-cumulative.
+                uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+                uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+                // L2SwimlaneAicpuPhaseRecord's extras are uint32 — a delta that overflows means
+                // an emit was missed for ~4 billion pops, which is well outside any
+                // realistic dispatch cadence and silently truncates without this guard.
+                debug_assert(pop_hit_delta < (1ULL << 32));
+                debug_assert(pop_miss_delta < (1ULL << 32));
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_phase_end(phase_end_local, phase_end_shared);
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
+                    static_cast<uint32_t>(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local,
+                    phase_end_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                    phase_start_shared[s] = phase_end_shared[s];
+                }
+                _t0_phase = _t1;
+                l2_swimlane.phase_dispatch_count = 0;
+                l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+                l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
+            }
+        }
+#endif
+
+#if !PTO2_PROFILING
+        (void)try_completed;
+        (void)try_pushed;
+#endif
+
+        if (made_progress) {
+            idle_iterations = 0;
+            last_progress_ts = get_sys_cnt_aicpu();
+        } else {
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+            idle_iterations++;
+
+            if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) {
+                LoopAction action = check_idle_fatal_error(thread_idx, header, runtime);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            if (idle_iterations % STALL_LOG_INTERVAL == 0) {
+                log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
+            }
+            // Wall-clock budget gate, with two fatal-latch branches:
+            //
+            // 1. Self owns a RUNNING task — first-hand evidence the
+            //    dispatch is stuck. Latch.
+            // 2. No thread anywhere owns a RUNNING task AND tasks remain
+            //    unfinished — the system is in a pre-dispatch / WAIT-only
+            //    deadlock (e.g. dependency cycle). Ownerless idle threads
+            //    are the only observers; let this one latch on the global
+            //    evidence (`completed_tasks_ < total_tasks_` and
+            //    `no_thread_owns_running_task()`).
+            //
+            // Otherwise: a sibling thread owns a RUNNING task but hasn't
+            // hit its own budget yet (typical distributed startup-skew
+            // case) — refresh last_progress_ts and keep spinning. The
+            // STALL diagnostic above still fires periodically so
+            // observability is preserved.
+            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
+                bool self_owns = self_owns_running_task(thread_idx);
+                bool global_stuck = !self_owns && total_tasks_ > 0 &&
+                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
+                                    no_thread_owns_running_task();
+                if (self_owns || global_stuck) {
+                    // Latch the error + emergency_shutdown, then break to the
+                    // shared end-of-loop cleanup so the diagnostic buffers get
+                    // flushed to the host. An early return here would strand the
+                    // stuck task's already-dumped inputs and every completed
+                    // task's in/out records in the unflushed per-thread dump
+                    // buffer — exactly the state we need to triage the hang.
+                    timeout_rc = handle_timeout_exit(
+                        thread_idx, header, runtime, idle_iterations, last_progress_count
+#if PTO2_PROFILING
+                        ,
+                        l2_swimlane.sched_start_ts
+#endif
+                    );
+                    break;
+                }
+                last_progress_ts = get_sys_cnt_aicpu();
+            }
+            SPIN_WAIT_HINT();
+#if PTO2_PROFILING
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+            // a2a3 design has Complete + Dispatch sched phases only; idle gaps
+            // are reconstructed at post-process time from sched record spacing.
+            (void)_t0_phase;
+#endif
+        }
+    }
+
+    // Drain any entries left in the deferred-release batch. The in-loop flush
+    // only fires on idle iterations and on buffer-full; a loop exit while the
+    // last iteration made progress can leave entries un-released. Drop them
+    // here so every consumed producer slot completes its on_task_release
+    // regardless of which loop-exit path fired.
+    while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+    }
+
+#if PTO2_PROFILING
+    // Final-drain: emit any pop_hit / pop_miss accrued since the last
+    // dispatch emit (typically the trailing idle loops while waiting for
+    // orchestrator_done_) as a zero-duration synthetic dispatch record so
+    // sum(record.pop_*) reconciles with the run-cumulative counter.
+    // Gate on SCHED_PHASES — at lower levels the phase buffer is never
+    // flushed (see below), so writing this record would be wasted work.
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+        debug_assert(final_pop_hit_delta < (1ULL << 32));
+        debug_assert(final_pop_miss_delta < (1ULL << 32));
+        if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
+            uint64_t t_now = get_sys_cnt_aicpu();
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_phase_end(phase_end_local, phase_end_shared);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0,
+                static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta),
+                phase_end_local, phase_end_shared, phase_end_local, phase_end_shared
+            );
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
+        }
+    }
+    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
+#endif
+
+#if PTO2_PROFILING
+    if (l2_swimlane.l2_swimlane_enabled) {
+        l2_swimlane_aicpu_flush(
+            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
+        );
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
+        }
+    }
+#endif
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_flush(thread_idx);
+    }
+#endif
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) {
+        pmu_aicpu_flush_buffers(
+            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
+        );
+    }
+#endif
+
+    return timeout_rc != 0 ? timeout_rc : cur_thread_completed;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h
new file mode 100644
index 000000000..c23a547af
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_TYPES_H
+#define SCHEDULER_TYPES_H
+
+#include <atomic>
+#include <cstdint>
+
+#include "common/core_type.h"
+#include "common/platform_config.h"
+#include "pto2_dispatch_payload.h"
+#include "pto_runtime2_types.h"
+#include "spin_hint.h"
+
+// =============================================================================
+// Profiling macros (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+#include "aicpu/device_time.h"
+// Accumulated nanoseconds per sub-step
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#else
+#define CYCLE_COUNT_START()
+#define CYCLE_COUNT_LAP(acc)
+#endif
+
+// =============================================================================
+// Scheduler constants
+// =============================================================================
+
+constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
+
+// Periodic cadence (in idle iterations) for emitting the per-thread STALL
+// diagnostic while no progress is being made. Purely an observability knob,
+// independent of the wall-clock timeout below: small enough to fire a few times
+// before the budget expires, large enough not to flood device_log.
+constexpr int32_t STALL_LOG_INTERVAL = 480000;
+constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
+
+// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
+// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS
+// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread
+// diagnostic cadence.
+//
+// Using wall-clock here is load-bearing for distributed runs: with per-thread
+// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
+// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
+// same iteration count. The fast spinner racing ahead and latching fatal
+// kills the slower-but-correct poller mid-poll — see the distributed
+// startup-skew scenario in issue #897.
+//
+// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h)
+// because the safe value differs per variant: onboard trims it to 2 s so the
+// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight
+// partial output) before STARS reaps the op and poisons the context (chain:
+// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to
+// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant
+// rationale.
+constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
+    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+constexpr int32_t STALL_DUMP_READY_MAX = 8;
+constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
+constexpr int32_t STALL_DUMP_CORE_MAX = 8;
+constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
+constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+
+// =============================================================================
+// Control flow signal from cold-path helpers back to the main dispatch loop.
+// =============================================================================
+
+enum class LoopAction : int8_t {
+    NONE,        // cold path did not trigger; proceed normally
+    BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
+};
+
+// =============================================================================
+// Per-core state: one cache line per core to eliminate false sharing
+// and co-locate all hot-path fields for minimal cache misses.
+// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup).
+// =============================================================================
+
+struct alignas(64) CoreExecState {
+    // --- Hot fields (completion + dispatch, every iteration) ---
+    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
+    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
+    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
+    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
+    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
+    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
+    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
+    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
+    uint8_t pad0_[2];                       // offset 38: alignment padding
+    // Precomputed COND register pointer; resolved once in handshake so the
+    // hot completion poll does a single volatile load instead of recomputing
+    // reg_base + reg_offset(COND) on every iteration.
+    volatile uint32_t *cond_ptr;  // offset 40: precomputed pointer to COND register
+#if PTO2_PROFILING
+    // --- Profiling fields (dispatch path, compile-time gated) ---
+    uint64_t running_dispatch_timestamp;  // offset 48: AICPU dispatch timestamp for running task
+    uint64_t pending_dispatch_timestamp;  // offset 56: AICPU dispatch timestamp for pending task
+#else
+    // --- Cold fields (init/diagnostics only, never in hot path) ---
+    int32_t worker_id;          // offset 48: index in runtime.workers[]
+    uint32_t physical_core_id;  // offset 52: hardware physical core ID
+    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
+    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
+#endif
+};
+static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
+
+// =============================================================================
+// CoreTracker: cluster-based bitmask tracker for idle/running core state.
+//
+// core_states_ encodes per-cluster core idle/running in 3 bits per cluster:
+//   bit i*3   = AIC of cluster i   (1 = idle, 0 = running)
+//   bit i*3+1 = AIV0 of cluster i
+//   bit i*3+2 = AIV1 of cluster i
+// Max 21 clusters per tracker (63 bits in uint64_t).
+// =============================================================================
+
+class alignas(64) CoreTracker {
+public:
+    static inline int32_t MAX_CORE_PER_THREAD = 63;
+    static constexpr int32_t MAX_CLUSTERS = 63 / 3;
+
+public:
+    CoreTracker() = default;
+
+    class BitStates {
+    public:
+        BitStates() = default;
+
+        explicit BitStates(uint64_t states) :
+            states_(states) {}
+        void init() { states_ = 0; }
+
+        BitStates operator~() const { return BitStates(~states_); }
+        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
+        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
+        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
+        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
+        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
+        void operator&=(const BitStates &other) { states_ &= other.states_; }
+        void operator|=(const BitStates &other) { states_ |= other.states_; }
+        void operator^=(const BitStates &other) { states_ ^= other.states_; }
+
+        bool has_value() const { return states_ > 0; }
+        int32_t count() const { return __builtin_popcountll(states_); }
+        void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); }
+
+        // Extract the lowest set bit from mask, clear it, and return its position.
+        // Returns -1 if mask is empty.
+        int32_t pop_first() {
+            if (states_ == 0) return -1;
+            int32_t pos = __builtin_ctzll(states_);
+            states_ &= states_ - 1;
+            return pos;
+        }
+
+    private:
+        uint64_t states_{0};
+    };
+
+public:
+    void init(int32_t cluster_count) {
+        cluster_count_ = cluster_count;
+        aic_mask_.init();
+        aiv_mask_.init();
+        pending_occupied_.init();
+        for (int32_t i = 0; i < cluster_count; i++) {
+            aic_mask_ |= BitStates(1ULL << (i * 3));
+            aiv_mask_ |= BitStates(6ULL << (i * 3));
+        }
+        core_states_ = aic_mask_ | aiv_mask_;
+    }
+
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
+        core_id_map_[cluster_idx * 3] = aic_wid;
+        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
+        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
+    }
+
+    int32_t get_cluster_count() const { return cluster_count_; }
+
+    // --- Running core queries ---
+
+    template <CoreType CT>
+    bool has_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).has_value();
+        } else {
+            return ((~core_states_) & aiv_mask_).has_value();
+        }
+    }
+
+    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
+
+    template <CoreType CT>
+    int32_t get_running_count() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).count();
+        } else {
+            return ((~core_states_) & aiv_mask_).count();
+        }
+    }
+
+    // Return an opaque bitmask for iterating running cores of a given type.
+    // Use pop_first() to extract core bit offsets one at a time.
+    template <CoreType CT>
+    BitStates get_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return (~core_states_) & aic_mask_;
+        } else {
+            return (~core_states_) & aiv_mask_;
+        }
+    }
+
+    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
+    BitStates get_cluster_offset_states() const { return aic_mask_; }
+
+    // --- Cluster matching ---
+
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {
+        switch (shape) {
+        case PTO2ResourceShape::AIC:
+            return core_states_ & aic_mask_;
+        case PTO2ResourceShape::AIV:
+            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
+        case PTO2ResourceShape::MIX:
+            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
+        case PTO2ResourceShape::DUMMY:
+            // DUMMY tasks never reach the core-tracker dispatch path; they are
+            // completed inline by resolve_and_dispatch via dummy_ready_queue.
+            return BitStates(0ULL);
+        }
+        return BitStates(0ULL);
+    }
+
+    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
+
+    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
+
+    bool is_aic_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv0_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv1_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
+    }
+
+    // --- State mutation ---
+
+    // Toggle bit at the given bit offset (running <-> idle)
+    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
+
+    // --- Pending-occupied tracking ---
+    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
+    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
+
+    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
+    void clear_pending_occupied(int32_t bit_offset) {
+        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
+    }
+
+    // --- Two-phase dispatch queries ---
+
+    // Idle dispatch: returns bit offsets of idle cores for the given shape.
+    // For AIC: 1 bit per cluster (core offset == cluster offset).
+    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
+    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
+    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
+    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
+    // would incorrectly block AIV idle dispatch on the same cluster.
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::AIC) {
+            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        }
+        if (shape == PTO2ResourceShape::AIV) {
+            return core_states_ & aiv_mask_;
+        }
+        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
+    }
+
+    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
+    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
+    // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask.
+    enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT };
+
+    // A MIX block must place all cores named by active_mask the same way:
+    // all idle means running placement, all running means pending placement,
+    // and any mixed state is retried later.
+    MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const {
+        BitStates used(0ULL);
+        if (core_mask & PTO2_SUBTASK_MASK_AIC) {
+            used |= BitStates(1ULL << cluster_offset);
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV0) {
+            used |= BitStates(1ULL << (cluster_offset + 1));
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV1) {
+            used |= BitStates(1ULL << (cluster_offset + 2));
+        }
+        if (!used.has_value() || (pending_occupied_ & used).has_value()) {
+            return MixPlacement::REJECT;
+        }
+
+        BitStates idle = core_states_ & used;
+        if (idle.count() == used.count()) {
+            return MixPlacement::RUNNING;
+        }
+        if (!idle.has_value()) {
+            return MixPlacement::PENDING;
+        }
+        return MixPlacement::REJECT;
+    }
+
+    BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const {
+        BitStates result(0ULL);
+        BitStates candidates = get_cluster_offset_states();
+        while (candidates.has_value()) {
+            int32_t cluster_offset = candidates.pop_first();
+            if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) {
+                result |= BitStates(1ULL << cluster_offset);
+            }
+        }
+        return result;
+    }
+
+    int32_t count_mix_running_clusters(uint8_t core_mask) const {
+        return get_mix_running_cluster_offset_states(core_mask).count();
+    }
+
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::MIX) {
+            // Shape-level query kept conservative for legacy callers/tests.
+            // The real MIX dispatch path applies active_mask in classify_mix_cluster().
+            // Any core without a pending payload can accept a dispatch (idle or running).
+            BitStates available = ~pending_occupied_;
+            BitStates mix_available =
+                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Pending MIX can only reuse a fully-running cluster. Partially-running clusters
+            // could split one MIX block across immediate and pending placement.
+            BitStates running = ~core_states_;
+            BitStates cluster_all_running =
+                (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_);
+            return mix_available & cluster_all_running;
+        }
+        if (shape == PTO2ResourceShape::AIC) {
+            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+        }
+        // AIV
+        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
+    }
+
+    // --- Two-phase dispatch unified query ---
+
+    enum class DispatchPhase : uint8_t { IDLE, PENDING };
+
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
+                                                get_pending_core_offset_states(shape);
+    }
+
+    // --- Bit offset <-> worker_id mapping ---
+
+    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
+
+    const int32_t *core_ids() const { return core_id_map_; }
+    int32_t core_num() const { return cluster_count_ * 3; }
+
+private:
+    int32_t cluster_count_;
+    BitStates aic_mask_;
+    BitStates aiv_mask_;
+    BitStates core_states_;
+    BitStates pending_occupied_;
+    int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
+};
+
+// =============================================================================
+// SlotTransition: pure event signals from a single register poll.
+// true = event occurred, false = no-op (maintain current state).
+// =============================================================================
+
+struct SlotTransition {
+    bool running_done = false;   // running task completed
+    bool pending_done = false;   // pending task completed
+    bool running_freed = false;  // running slot data should be released
+    bool pending_freed = false;  // pending_occupied can be cleared
+    bool matched = false;        // some case was hit (otherwise skip apply)
+};
+
+// =============================================================================
+// Profiling counters (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+struct alignas(64) SchedL2SwimlaneCounters {
+    bool l2_swimlane_enabled{false};
+    uint64_t sched_start_ts{0};
+    uint64_t sched_scan_cycle{0};
+    uint64_t sched_complete_cycle{0};
+    uint64_t sched_dispatch_cycle{0};
+    uint64_t sched_wiring_cycle{0};
+    uint64_t sched_idle_cycle{0};
+    uint64_t sched_loop_count{0};
+    uint32_t phase_complete_count{0};
+    uint32_t phase_dispatch_count{0};
+    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
+    // l2_swimlane_level_ >= SCHED_PHASES.
+    uint64_t pop_hit{0};
+    uint64_t pop_miss{0};
+    uint64_t pop_hit_at_last_emit{0};
+    uint64_t pop_miss_at_last_emit{0};
+#if PTO2_SCHED_PROFILING
+    uint32_t phase_wiring_count{0};
+    uint64_t complete_probe_count{0};
+    uint64_t complete_hit_count{0};
+    uint64_t sched_complete_perf_cycle{0};
+    uint64_t sched_dispatch_pop_cycle{0};
+    uint64_t sched_dispatch_setup_cycle{0};
+#endif
+    void reset() { *this = SchedL2SwimlaneCounters{}; }
+};
+#endif
+
+// =============================================================================
+// sync_start drain coordination
+// =============================================================================
+
+// When sync_start_pending != 0, all scheduler threads skip dispatch
+// (only process completions) until the drain worker finishes launching all blocks.
+struct alignas(64) SyncStartDrainState {
+    std::atomic<int32_t> sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic<int32_t> drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic<uint32_t> drain_ack_mask{0};       // bit per thread; all-set = all threads reached ack barrier
+    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
+    int32_t _pad[10];
+};
+static_assert(sizeof(SyncStartDrainState) == 64);
+
+#endif  // SCHEDULER_TYPES_H
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..f98c56cb6
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+static bool sum_ring_heap_sizes(const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], uint64_t *total) {
+    uint64_t sum = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (heap_sizes[r] > std::numeric_limits<uint64_t>::max() - sum) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return false;
+        }
+        sum += heap_sizes[r];
+    }
+    *total = sum;
+    return true;
+}
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
+    // first slot cannot false-share with whatever region sits in front of us
+    // (e.g. orchestrator tensormap heads written by the orch thread).
+    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // Address the slots region for data writes without storing the pointer in
+    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
+    // ring stores the device address of the SM ring header — pure offset
+    // arithmetic, no SM load.
+    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+    last_task_alive = 0;
+    advance_lock.store(0, std::memory_order_relaxed);
+#if PTO2_PROFILING
+    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
+    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
+#endif
+
+    // Per-slot SM-side initialization (bind_ring + reset_for_reuse +
+    // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle::
+    // init_header_per_ring so the AICPU performs it during SM reset; host
+    // prebuilt-arena init skips SM access here.
+
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return reserve_layout(arena, dep_pool_capacities);
+}
+
+PTO2SchedulerLayout
+PTO2SchedulerState::reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]) {
+    PTO2SchedulerLayout layout{};
+    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        // Force a cache-line base so writes from scheduler thread 0 (sole
+        // writer of this ring's dep_pool) do not invalidate adjacent
+        // multi-threaded regions like ready_queue.slots.
+        layout.off_dep_pool_entries[r] =
+            arena.reserve(static_cast<size_t>(dep_pool_capacities[r]) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
+    }
+    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return layout;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    PTO2SchedulerState *sched = this;
+    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    sched->tasks_completed.store(0, std::memory_order_relaxed);
+    sched->tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        if (!ready_queue_init_data_from_layout(
+                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
+            )) {
+            return false;
+        }
+    }
+    if (!ready_queue_init_data_from_layout(
+            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
+        )) {
+        return false;
+    }
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
+        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacities[r]) * sizeof(PTO2DepListEntry));
+        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacities[r], orch_err);
+    }
+
+    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    sched->wiring.batch_count = 0;
+    sched->wiring.batch_index = 0;
+    sched->wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    PTO2SchedulerState *sched = this;
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
+    }
+    ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        sched->ring_sched_states[r].dep_pool.base =
+            static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
+    }
+    sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    PTO2SchedulerState *sched = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        sched->ring_sched_states[r].destroy();
+        sched->ring_sched_states[r].dep_pool.base = nullptr;
+    }
+    sched->wiring.queue.destroy();
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        ready_queue_destroy(&sched->ready_queues[i]);
+    }
+    ready_queue_destroy(&sched->dummy_ready_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return reserve_layout(arena, task_window_sizes, dep_pool_capacities);
+}
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2OrchestratorLayout layout{};
+    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        const size_t fanin_pool_bytes =
+            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+
+        always_assert(task_window_sizes[r] > 0 && (task_window_sizes[r] & (task_window_sizes[r] - 1)) == 0);
+        const size_t seen_epoch_bytes =
+            PTO2_ALIGN_UP(static_cast<size_t>(task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE);
+        layout.off_fanin_seen_epoch[r] = arena.reserve(seen_epoch_bytes, PTO2_ALIGN_SIZE);
+    }
+    layout.off_scope_tasks =
+        arena.reserve(static_cast<size_t>(layout.scope_tasks_cap) * sizeof(uintptr_t), alignof(PTO2TaskSlotState *));
+    layout.off_scope_begins =
+        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
+    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return layout;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        heap_sizes[r] = heap_size;
+        task_window_sizes[r] = task_window_size;
+    }
+    return init_data_from_layout(layout, arena, sm_dev_base, gm_heap, heap_sizes, task_window_sizes);
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    auto *orch = this;
+    *orch = PTO2OrchestratorState{};
+
+    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+    orch->gm_heap_base = gm_heap;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return false;
+    }
+    orch->gm_heap_size = total_heap_size;
+    orch->fatal = false;
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    uint64_t heap_offset = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + heap_offset;
+        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+
+        orch->rings[r].task_allocator.init(
+            task_descs_dev, static_cast<int32_t>(task_window_sizes[r]), cur_idx_dev, last_alive_dev, ring_heap_base,
+            heap_sizes[r], orch_err
+        );
+        heap_offset += heap_sizes[r];
+
+        const size_t fanin_pool_bytes = PTO2_ALIGN_UP(
+            static_cast<size_t>(layout.dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE
+        );
+        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        memset(fanin_entries, 0, fanin_pool_bytes);
+        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacities[r], orch_err);
+
+        const size_t seen_epoch_bytes = PTO2_ALIGN_UP(
+            static_cast<size_t>(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE
+        );
+        auto *seen_epoch = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
+        memset(seen_epoch, 0, seen_epoch_bytes);
+        orch->fanin_seen_epoch[r] = seen_epoch;
+    }
+
+    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    orch->scope_tasks_size = 0;
+    orch->scope_tasks_capacity = layout.scope_tasks_cap;
+    orch->scope_stack_top = -1;
+    orch->scope_stack_capacity = layout.scope_stack_capacity;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        orch->rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        orch->fanin_seen_epoch[r] = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
+    }
+    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    orch->scheduler = scheduler_arg;
+}
+
+void PTO2OrchestratorState::destroy() {
+    auto *orch = this;
+    orch->tensor_map.destroy();
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        orch->rings[r].fanin_pool.base = nullptr;
+        orch->fanin_seen_epoch[r] = nullptr;
+    }
+    orch->scope_tasks = nullptr;
+    orch->scope_begins = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = 0;
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return runtime_reserve_layout(arena, task_window_sizes, heap_sizes, dep_pool_capacities);
+}
+
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2RuntimeArenaLayout layout{};
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.task_window_sizes[r] = task_window_sizes[r];
+        layout.heap_sizes[r] = heap_sizes[r];
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes_i32[r] = static_cast<int32_t>(task_window_sizes[r]);
+    }
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+
+    layout.arena_size = arena.total_size();
+    return layout;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        heap_sizes[r] = heap_size;
+    }
+    return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, 0, gm_heap_dev_base, heap_sizes);
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return nullptr;
+    }
+    rt->gm_heap_size = total_heap_size;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+
+    if (!rt->orchestrator.init_data_from_layout(
+            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.task_window_sizes
+        )) {
+        return nullptr;
+    }
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
+        return nullptr;
+    }
+
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
+    if (!rt) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;
+    rt->sm_handle = nullptr;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp
new file mode 100644
index 000000000..d704bd85d
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Implementation
+ *
+ * Implements shared memory allocation, initialization, and management
+ * for Orchestrator-Scheduler communication.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_shared_memory.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include "common/unified_log.h"
+
+// =============================================================================
+// Size Calculation
+// =============================================================================
+
+uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+    }
+    return calculate_size_per_ring(task_window_sizes);
+}
+
+uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    uint64_t size = 0;
+
+    // Header (aligned to cache line)
+    size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+    // Per-ring task descriptors and payloads
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+
+    return size;
+}
+
+// =============================================================================
+// Creation and Destruction
+// =============================================================================
+
+void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    char *ptr = (char *)sm_base;
+
+    // Header
+    header = (PTO2SharedMemoryHeader *)ptr;
+    ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+    // Per-ring task descriptors, payloads, and slot states
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+
+        ring.task_payloads = (PTO2TaskPayload *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+
+        ring.slot_states = (PTO2TaskSlotState *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+}
+
+void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+    }
+    setup_pointers_per_ring(task_window_sizes);
+}
+
+bool PTO2SharedMemoryHandle::init(
+    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
+) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = heap_size;
+    }
+    return init_per_ring(sm_base_arg, sm_size_arg, task_window_sizes, heap_sizes);
+}
+
+bool PTO2SharedMemoryHandle::init_per_ring(
+    void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    if (!sm_base_arg || sm_size_arg == 0) return false;
+    if (sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false;
+
+    sm_base = sm_base_arg;
+    sm_size = sm_size_arg;
+    is_owner = false;
+    setup_pointers_per_ring(task_window_sizes);
+    init_header_per_ring(task_window_sizes, heap_sizes);
+    return true;
+}
+
+PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) {
+    const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
+    const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
+    if (arena.commit() == nullptr) return nullptr;
+
+    auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
+    memset(handle, 0, sizeof(*handle));
+    void *buffer = arena.region_ptr(off_buffer);
+    memset(buffer, 0, static_cast<size_t>(buffer_size));
+    if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
+    return handle;
+}
+
+void PTO2SharedMemoryHandle::destroy() {
+    // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
+    // calling destroy on them is a no-op so existing callers stay safe.
+    if (is_owner && sm_base) {
+        free(sm_base);
+        free(this);
+    }
+}
+
+// =============================================================================
+// Initialization
+// =============================================================================
+//
+// no need init data in pool, init pool data when used
+void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = heap_size;
+    }
+    init_header_per_ring(task_window_sizes, heap_sizes);
+}
+
+void PTO2SharedMemoryHandle::init_header_per_ring(
+    const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    // Per-ring flow control (start at 0)
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        header->rings[r].fc.init();
+    }
+
+    header->orchestrator_done.store(0, std::memory_order_relaxed);
+
+    // Per-ring layout info
+    uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        header->rings[r].task_window_size = task_window_sizes[r];
+        header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
+        header->rings[r].heap_size = heap_sizes[r];
+        header->rings[r].task_descriptors_offset = offset;
+        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+
+    header->total_size = sm_size;
+    header->graph_output_ptr.store(0, std::memory_order_relaxed);
+    header->graph_output_size.store(0, std::memory_order_relaxed);
+
+    // Error reporting
+    header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    header->sched_error_bitmap.store(0, std::memory_order_relaxed);
+    header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // bind_ring() pins the ring_id (slot-invariant after this point);
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
+            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+            ring.slot_states[i].reset_for_reuse();
+            ring.slot_states[i].fanin_count = 0;
+            ring.slot_states[i].active_mask = ActiveMask{};
+        }
+    }
+}
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2SharedMemoryHandle::print_layout() {
+    if (!header) return;
+
+    PTO2SharedMemoryHeader *h = header;
+
+    LOG_INFO_V0("=== PTO2 Shared Memory Layout ===");
+    LOG_INFO_V0("Base address:       %p", sm_base);
+    LOG_INFO_V0("Total size:         %" PRIu64 " bytes", h->total_size);
+    LOG_INFO_V0("Ring depth:         %d", PTO2_MAX_RING_DEPTH);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        LOG_INFO_V0("Ring %d:", r);
+        LOG_INFO_V0("  task_window_size: %" PRIu64, h->rings[r].task_window_size);
+        LOG_INFO_V0("  heap_size:        %" PRIu64 " bytes", h->rings[r].heap_size);
+        LOG_INFO_V0(
+            "  descriptors_off:  %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset,
+            h->rings[r].task_descriptors_offset
+        );
+        LOG_INFO_V0("  current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire));
+        LOG_INFO_V0("  last_task_alive:  %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire));
+    }
+    LOG_INFO_V0("orchestrator_done:  %d", h->orchestrator_done.load(std::memory_order_acquire));
+    LOG_INFO_V0("Error state:");
+    LOG_INFO_V0("  orch_error_code:    %d", h->orch_error_code.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_code:   %d", h->sched_error_code.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed));
+    LOG_INFO_V0("================================");
+}
+
+bool PTO2SharedMemoryHandle::validate() {
+    if (!sm_base) return false;
+    if (!header) return false;
+
+    PTO2SharedMemoryHeader *h = header;
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (!h->rings[r].fc.validate(this, r)) return false;
+    }
+
+    return true;
+}
+
+bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const {
+    if (!handle) return false;
+    if (!handle->header) return false;
+    if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false;
+
+    const PTO2SharedMemoryHeader *h = handle->header;
+
+    // Check that offsets are within bounds
+    if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false;
+
+    // Check pointer alignment
+    if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false;
+
+    // Check flow control pointer sanity
+    int32_t current = current_task_index.load(std::memory_order_acquire);
+    int32_t last_alive = last_task_alive.load(std::memory_order_acquire);
+    if (current < 0) return false;
+    if (last_alive < 0) return false;
+
+    return true;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp
new file mode 100644
index 000000000..b99c67233
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - TensorMap Implementation
+ *
+ * Implements TensorMap with ring buffer pool, lazy invalidation,
+ * and chain truncation optimization.
+ *
+ * Key features:
+ * 1. O(1) insert at bucket head
+ * 2. O(valid_entries) lookup with chain truncation
+ * 3. Automatic stale entry cleanup during lookup
+ * 4. Periodic explicit cleanup for long chains
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_tensormap.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "common/unified_log.h"
+
+// =============================================================================
+// TensorMap Lookup Chain Length Statistics (compile-time toggle)
+// =============================================================================
+#if PTO2_TENSORMAP_PROFILING
+uint64_t g_lookup_chain_total = 0;
+uint64_t g_lookup_count = 0;
+int32_t g_lookup_chain_max = 0;
+uint64_t g_lookup_overlap_checks = 0;
+uint64_t g_lookup_overlap_hits = 0;
+uint64_t g_insert_count = 0;
+#endif
+
+// =============================================================================
+// Initialization and Destruction
+// =============================================================================
+
+PTO2TensorMapLayout PTO2TensorMap::reserve_layout(
+    DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size,
+    const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    // num_buckets must be a power of two for the hash truncation to work.
+    always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
+
+    PTO2TensorMapLayout layout{};
+    layout.num_buckets = new_num_buckets;
+    layout.pool_size = new_pool_size;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.task_window_sizes[r] = new_task_window_sizes[r];
+    }
+
+    layout.off_buckets = arena.reserve(
+        static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
+    );
+    layout.off_entry_pool =
+        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
+    layout.off_free_entry_list =
+        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.off_task_entry_heads[r] = arena.reserve(
+            static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
+        );
+    }
+    return layout;
+}
+
+PTO2TensorMapLayout
+PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
+}
+
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    num_buckets = layout.num_buckets;
+    pool_size = layout.pool_size;
+
+    // Address arena regions for data writes; do not store these in struct
+    // fields (wire_arena_pointers does that).
+    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+
+    // buckets[]: empty == nullptr.
+    for (int32_t i = 0; i < num_buckets; i++) {
+        buckets_arena[i] = nullptr;
+    }
+
+    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
+    // The pool's persistent invariant after init is "bucket_index == -1 means
+    // not linked", set explicitly below.
+    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    for (int32_t i = 0; i < pool_size; i++) {
+        entry_pool_arena[i].bucket_index = -1;
+        entry_pool_arena[i].next_in_bucket = nullptr;
+        entry_pool_arena[i].prev_in_bucket = nullptr;
+        entry_pool_arena[i].next_in_task = nullptr;
+        entry_pool_arena[i].prev_in_task = nullptr;
+        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
+    }
+
+    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+    // only after entries are freed back, so the body of the array stays as 0.
+    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+
+    next_entry_idx = 0;
+    free_num = 0;
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
+            heads_arena[i] = nullptr;
+        }
+        task_window_sizes[r] = layout.task_window_sizes[r];
+        last_task_alives[r] = 0;
+        last_cleanup[r] = 0;
+    }
+
+    return true;
+}
+
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+}
+
+void PTO2TensorMap::destroy() {
+    // Arena owns the backing memory; here we only forget our pointers so any
+    // stray post-destroy access trips a nullptr dereference instead of reading
+    // a recycled allocation.
+    buckets = nullptr;
+    entry_pool = nullptr;
+    free_entry_list = nullptr;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = nullptr;
+    }
+}
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2TensorMap::print_stats() {
+    int32_t valid = 0;
+    int32_t stale = 0;
+    int32_t empty_buckets = 0;
+    int32_t max_chain = 0;
+    int64_t total_chain = 0;
+    int32_t non_empty_buckets = 0;
+
+    // Count entries
+    for (int32_t i = 0; i < pool_size; i++) {
+        if (entry_pool[i].bucket_index != -1) {
+            if (entry_valid(entry_pool[i])) {
+                valid++;
+            } else {
+                stale++;
+            }
+        }
+    }
+
+    // Count bucket stats
+    for (int32_t b = 0; b < num_buckets; b++) {
+        int32_t chain_len = 0;
+        auto cur_entry = buckets[b];
+
+        while (cur_entry != nullptr) {
+            chain_len++;
+            cur_entry = cur_entry->next_in_bucket;
+        }
+
+        if (chain_len == 0) {
+            empty_buckets++;
+        } else {
+            non_empty_buckets++;
+            total_chain += chain_len;
+            if (chain_len > max_chain) {
+                max_chain = chain_len;
+            }
+        }
+    }
+
+    LOG_INFO_V0("=== TensorMap Statistics ===");
+    LOG_INFO_V0("Pool size:           %d", pool_size);
+    LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx);
+    LOG_INFO_V0("Pool free_num:       %d", free_num);
+    LOG_INFO_V0("Num buckets:         %d", num_buckets);
+    LOG_INFO_V0("Valid entries:       %d", valid);
+    LOG_INFO_V0("Stale entries:       %d", stale);
+    LOG_INFO_V0("Empty buckets:       %d", empty_buckets);
+    LOG_INFO_V0("Max chain len:       %d", max_chain);
+    LOG_INFO_V0("Avg chain len:       %.2f", non_empty_buckets > 0 ? (float)total_chain / non_empty_buckets : 0);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]);
+    }
+    LOG_INFO_V0("============================");
+}
+
+int32_t PTO2TensorMap::valid_count() {
+    int32_t count = 0;
+
+    for (int32_t i = 0; i < pool_size; i++) {
+        if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) {
+            count++;
+        }
+    }
+
+    return count;
+}
+
+void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) {
+    auto ring_id = task_id.ring();
+    auto local_id = task_id.local();
+    sync_validity(ring_id, sm_last_task_alive);
+
+    // Only attempt cleanup when last_task_alive has actually advanced;
+    // otherwise cleanup_retired would empty-loop and we'd spin forever.
+    auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
+    if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) {
+        cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
+        last_cleanup[ring_id] = sm_last_task_alive;
+    }
+}
+
+// =============================================================================
+// TensorMap Lookup Profiling
+// =============================================================================
+#if PTO2_TENSORMAP_PROFILING
+PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
+    PTO2TensorMapProfilingData d;
+    d.lookup_chain_total = g_lookup_chain_total;
+    d.lookup_count = g_lookup_count;
+    d.lookup_chain_max = g_lookup_chain_max;
+    d.overlap_checks = g_lookup_overlap_checks;
+    d.overlap_hits = g_lookup_overlap_hits;
+    d.insert_count = g_insert_count;
+
+    // Reset
+    g_lookup_chain_total = 0;
+    g_lookup_count = 0;
+    g_lookup_chain_max = 0;
+    g_lookup_overlap_checks = 0;
+    g_lookup_overlap_hits = 0;
+    g_insert_count = 0;
+    return d;
+}
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp
new file mode 100644
index 000000000..1683ac323
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Class - Implementation
+ *
+ * Device execution and handshake control.
+ * Task graph construction is handled by PTO2Runtime.
+ */
+
+#include "runtime.h"
+
+#include "common/unified_log.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// =============================================================================
+// Constructor
+// =============================================================================
+
+Runtime::Runtime() {
+    // NOTE: host_api is initialized in InitRuntime() (host-only code)
+    // because the CApi functions don't exist when compiled for device.
+
+    // Initialize handshake buffers
+    memset(workers, 0, sizeof(workers));
+    worker_count = 0;
+    aicpu_thread_num = 1;
+    ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+    memset(aicpu_allowed_cpus, 0, sizeof(aicpu_allowed_cpus));
+    aicpu_allowed_cpu_count = 0;
+    aicpu_launch_count = 0;
+    orch_to_sched = false;
+
+    // fully_distributed_within_core handoff fields
+    dist.core_main_fn = 0;
+    dist.go = 0;
+    dist.num_workers = 0;
+    dist.done_count = 0;
+
+    // Initialize profiling state
+
+    // Initialize device orchestration state
+    gm_sm_ptr_ = nullptr;
+    gm_heap_ptr_ = nullptr;
+    slot_states_ptr_ = nullptr;
+    orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
+
+    // Initialize device orchestration SO binary
+    dev_orch_so_addr_ = 0;
+    dev_orch_so_size_ = 0;
+    active_callable_id_ = -1;
+    register_new_callable_id_ = false;
+    device_orch_func_name_[0] = '\0';
+    device_orch_config_name_[0] = '\0';
+
+    // Initialize kernel binary tracking
+    registered_kernel_count_ = 0;
+
+    // Initialize function address mapping
+    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
+        func_id_to_addr_[i] = 0;
+    }
+}
+
+// =============================================================================
+// Device orchestration
+// =============================================================================
+
+void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; }
+void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; }
+const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; }
+void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; }
+void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
+void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
+void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
+
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    prebuilt_arena_base_ = arena_base;
+    prebuilt_runtime_offset_ = runtime_off;
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
+
+// Device orchestration SO metadata (bytes live in a separate device buffer
+// owned by DeviceRunner; only the address/size travels in Runtime).
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
+    dev_orch_so_addr_ = dev_addr;
+    dev_orch_so_size_ = size;
+}
+
+uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
+
+uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
+
+void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
+    active_callable_id_ = callable_id;
+    register_new_callable_id_ = is_new;
+}
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
+
+void Runtime::set_device_orch_func_name(const char *name) {
+    if (name == nullptr) {
+        device_orch_func_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+    device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+}
+
+const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; }
+
+void Runtime::set_device_orch_config_name(const char *name) {
+    if (name == nullptr) {
+        device_orch_config_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+    device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+}
+
+const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; }
+
+uint64_t Runtime::get_function_bin_addr(int func_id) const {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+    return func_id_to_addr_[func_id];
+}
+
+void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    if (addr != 0 && func_id_to_addr_[func_id] == 0) {
+        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
+            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
+        } else {
+            LOG_ERROR(
+                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
+                func_id
+            );
+        }
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
+int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
+
+int Runtime::get_registered_kernel_func_id(int index) const {
+    if (index < 0 || index >= registered_kernel_count_) return -1;
+    return registered_kernel_func_ids_[index];
+}
+
+void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h b/src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h
new file mode 100644
index 000000000..912839a34
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * TensorCreateInfo — submit-time create-info for runtime-allocated outputs.
+ *
+ * Runtime-only: this header (and the materialization helpers below) are NOT
+ * part of the wire/host-facing Tensor in src/common/task_interface/tensor.h.
+ * It carries the metadata required to materialize a fresh contiguous output:
+ * dtype, ndims, shapes, manual_dep, and an optional initial value fill. Its
+ * 64B layout mirrors Tensor cache line 1 so init_tensor_from_create_info() can
+ * copy the whole line with a single memcpy.
+ */
+
+#pragma once
+
+#include <cstring>
+#include <memory.h>
+#include <stdint.h>
+
+#include "data_type.h"
+#include "tensor.h"
+
+class alignas(64) TensorCreateInfo {
+public:
+    TensorCreateInfo(
+        const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false
+    ) :
+        initial_value(0),
+        has_initial_value(false),
+        __pad2__(0),
+        start_offset(0),  // mirrors Tensor::start_offset; pre-zeroed for create-info outputs
+        version(0),
+        ndims(ndims_in),
+        dtype(dtype_in),
+        manual_dep(manual_dep_in),
+        is_contiguous(true),  // mirrors Tensor::is_contiguous; pre-set for create-info outputs
+        __pad_flags__(0) {
+        // Bound the write below: shapes[] holds MAX_TENSOR_DIMS, and ndims_in
+        // comes from user-submitted output shapes — guard before the loop so an
+        // oversized rank can't overrun the fixed array.
+        always_assert(ndims_in > 0 && ndims_in <= MAX_TENSOR_DIMS);
+        for (uint32_t i = 0; i < ndims_in; i++) {
+            shapes[i] = shapes_in[i];
+        }
+    }
+
+    void copy(const TensorCreateInfo &other) { memcpy(this, &other, sizeof(other)); }
+
+    template <typename T = uint64_t>
+    void set_initial_value(T value) {
+        has_initial_value = true;
+        initial_value = to_u64(value);
+    }
+
+    uint64_t buffer_size_bytes() const {
+        uint64_t total = 1;
+        for (uint32_t i = 0; i < ndims; i++) {
+            total *= shapes[i];
+        }
+        return total * get_element_size(dtype);
+    }
+
+public:
+    // --- Bytes [0, 32): TensorCreateInfo-only fields ---
+    // These occupy the same positions as Tensor::buffer, Tensor::owner_task_id,
+    // and Tensor::start_offset. The runtime overwrites owner metadata after the
+    // memcpy and recomputes start_offset / stride during payload materialization.
+    uint64_t initial_value;
+    bool has_initial_value;
+    uint8_t __pad1__[7];
+    uint64_t __pad2__;      // → Tensor::owner_task_id (overwritten post-memcpy)
+    uint64_t start_offset;  // mirrors Tensor::start_offset; always 0 for create-info outputs
+
+    // --- Bytes [32, 64): Matches Tensor cache line 1 layout ---
+    int32_t version;  // Always 0 for create-info outputs
+    uint32_t ndims;
+    DataType dtype;
+    bool manual_dep;
+    bool is_contiguous;                // Always true for create-info outputs
+    uint8_t __pad_flags__;             // → Tensor::child_memory (always 0 for create-info outputs)
+    uint32_t shapes[MAX_TENSOR_DIMS];  // → Tensor::shapes
+
+    TensorCreateInfo() = default;
+};
+
+// TensorCreateInfo layout must match Tensor cacheline 1 for memcpy optimization
+static_assert(sizeof(TensorCreateInfo) == 64, "TensorCreateInfo must match Tensor cacheline 1 size (64 bytes)");
+static_assert(offsetof(TensorCreateInfo, start_offset) == offsetof(Tensor, start_offset));
+static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version));
+static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims));
+static_assert(offsetof(TensorCreateInfo, dtype) == offsetof(Tensor, dtype));
+static_assert(offsetof(TensorCreateInfo, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(TensorCreateInfo, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(TensorCreateInfo, __pad_flags__) == offsetof(Tensor, child_memory));
+static_assert(offsetof(TensorCreateInfo, shapes) == offsetof(Tensor, shapes));
+
+// ============================================================================
+// Materialization helpers — operate on a Tensor& through its public members.
+// Factored out of Tensor (which now lives in the wire/host-facing common
+// header) so the create-info dependency stays runtime-only.
+// ============================================================================
+
+/// Fill the entire backing buffer of `t` with `initial_value` (doubling memcpy).
+inline void fill_tensor_initial_value(Tensor &t, uint64_t initial_value) {
+    always_assert(reinterpret_cast<char *>(t.buffer.addr) != nullptr);
+    uint64_t elem_size = get_element_size(t.dtype);
+    char *dst = reinterpret_cast<char *>(t.buffer.addr);
+    constexpr uint64_t blk_size = 64;
+    uint64_t blk = (t.buffer.size < blk_size) ? t.buffer.size : blk_size;
+    for (uint64_t b = 0; b < blk; b += elem_size) {
+        memcpy(dst + b, &initial_value, elem_size);
+    }
+    uint64_t filled = blk;
+    while (filled < t.buffer.size) {
+        uint64_t copy_size = ((t.buffer.size - filled) < filled) ? (t.buffer.size - filled) : filled;
+        memcpy(dst + filled, dst, copy_size);
+        filled += copy_size;
+    }
+}
+
+/// Materialize a TensorCreateInfo into `t` (fresh contiguous output).
+/// Single 64B memcpy covers cache line 1; `ci` pre-initialises start_offset (=0)
+/// and is_contiguous (=true) in its line-1 slots so they need no reset here.
+/// Cache line 2 (stride/extent) is computed from `ci.shapes` in a single reverse pass.
+inline void init_tensor_from_create_info(Tensor &t, const TensorCreateInfo &ci, void *addr, uint64_t buffer_size) {
+    always_assert(ci.ndims > 0 && ci.ndims <= MAX_TENSOR_DIMS);
+    memcpy(&t, &ci, 64);
+    t.buffer = {reinterpret_cast<uint64_t>(addr), buffer_size};
+    t.owner_task_id = PTO2TaskId::invalid();  // caller (orchestrator) overwrites with actual task_id
+    uint32_t s = 1;
+    for (int32_t i = static_cast<int32_t>(t.ndims) - 1; i >= 0; --i) {
+        t.strides[i] = s;
+        s *= t.shapes[i];
+    }
+    t.extent_elem_cache = s;
+    if (ci.has_initial_value) {
+        fill_tensor_initial_value(t, ci.initial_value);
+    }
+}
diff --git a/src/common/hierarchical/remote_wire.cpp b/src/common/hierarchical/remote_wire.cpp
index 3e0438460..f45bdebf6 100644
--- a/src/common/hierarchical/remote_wire.cpp
+++ b/src/common/hierarchical/remote_wire.cpp
@@ -315,6 +315,10 @@ std::vector<uint8_t> encode_call_config(const CallConfig &config) {
     put_i32(out, config.enable_pmu);
     put_i32(out, config.enable_dep_gen);
     put_i32(out, config.enable_scope_stats);
+    put_i32(out, config.use_example_exec_time);
+    for (int i = 0; i < CALLCONFIG_MAX_EXAMPLE_FUNCS; ++i) {
+        put_i32(out, config.example_exec_time_ns[i]);
+    }
     put_string(out, call_config_prefix(config), MAX_STRING_BYTES, "CallConfig.output_prefix");
     return out;
 }
@@ -328,6 +332,10 @@ CallConfig decode_call_config(const uint8_t *data, size_t size, size_t &offset)
     config.enable_pmu = get_i32(data, size, offset);
     config.enable_dep_gen = get_i32(data, size, offset);
     config.enable_scope_stats = get_i32(data, size, offset);
+    config.use_example_exec_time = get_i32(data, size, offset);
+    for (int i = 0; i < CALLCONFIG_MAX_EXAMPLE_FUNCS; ++i) {
+        config.example_exec_time_ns[i] = get_i32(data, size, offset);
+    }
     std::string prefix = get_string(data, size, offset, MAX_STRING_BYTES, "CallConfig.output_prefix");
     ensure(prefix.size() < sizeof(config.output_prefix), "remote_wire: CallConfig.output_prefix is too long");
     std::memset(config.output_prefix, 0, sizeof(config.output_prefix));
diff --git a/src/common/platform/onboard/host/c_api_shared.cpp b/src/common/platform/onboard/host/c_api_shared.cpp
index 7ffe3b651..c57511909 100644
--- a/src/common/platform/onboard/host/c_api_shared.cpp
+++ b/src/common/platform/onboard/host/c_api_shared.cpp
@@ -337,12 +337,22 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
     }
 }
 
+// Weak no-op default for the trace-driven replay hook (see
+// pto_runtime_c_api.h). fully_distributed_within_core's runtime_maker provides
+// a strong override; every other runtime links this no-op.
+extern "C" __attribute__((weak)) void
+runtime_apply_example_exec_time(void *runtime, int use_example_exec_time, const int32_t *example_exec_time_ns) {
+    (void)runtime;
+    (void)use_example_exec_time;
+    (void)example_exec_time_ns;
+}
+
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
     int enable_scope_stats, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool,
     const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools,
-    const char *output_prefix, PtoRunTiming *out_timing
+    const char *output_prefix, int use_example_exec_time, const int32_t *example_exec_time_ns, PtoRunTiming *out_timing
 ) {
     if (out_timing != NULL) {
         out_timing->host_wall_ns = 0;
@@ -419,6 +429,7 @@ int run_prepared(
         runner->set_dep_gen_enabled(enable_dep_gen != 0);
         runner->set_scope_stats_enabled(enable_scope_stats != 0);
         runner->set_output_prefix(output_prefix);
+        runtime_apply_example_exec_time(r, use_example_exec_time, example_exec_time_ns);
 
         rc = runner->run(*r, block_dim, aicpu_thread_num);
         if (rc != 0) {
diff --git a/src/common/platform/sim/host/c_api_shared.cpp b/src/common/platform/sim/host/c_api_shared.cpp
index fd245bb93..e82b126eb 100644
--- a/src/common/platform/sim/host/c_api_shared.cpp
+++ b/src/common/platform/sim/host/c_api_shared.cpp
@@ -301,12 +301,22 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
     }
 }
 
+// Weak no-op default for the sim trace-driven replay hook (see
+// pto_runtime_c_api.h). fully_distributed_within_core's runtime_maker provides
+// a strong override; every other runtime links this no-op.
+extern "C" __attribute__((weak)) void
+runtime_apply_example_exec_time(void *runtime, int use_example_exec_time, const int32_t *example_exec_time_ns) {
+    (void)runtime;
+    (void)use_example_exec_time;
+    (void)example_exec_time_ns;
+}
+
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
     int enable_scope_stats, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool,
     const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools,
-    const char *output_prefix, PtoRunTiming *out_timing
+    const char *output_prefix, int use_example_exec_time, const int32_t *example_exec_time_ns, PtoRunTiming *out_timing
 ) {
     if (out_timing != NULL) {
         out_timing->host_wall_ns = 0;
@@ -370,6 +380,7 @@ int run_prepared(
         runner->set_dep_gen_enabled(enable_dep_gen != 0);
         runner->set_scope_stats_enabled(enable_scope_stats != 0);
         runner->set_output_prefix(output_prefix);
+        runtime_apply_example_exec_time(r, use_example_exec_time, example_exec_time_ns);
 
         rc = runner->run(*r, block_dim, aicpu_thread_num);
         if (rc != 0) {
diff --git a/src/common/task_interface/call_config.h b/src/common/task_interface/call_config.h
index 91ef59191..54c2749da 100644
--- a/src/common/task_interface/call_config.h
+++ b/src/common/task_interface/call_config.h
@@ -55,6 +55,11 @@ inline constexpr int RUNTIME_ENV_PER_RING_FIELD_GROUPS = 3;
 inline constexpr int RUNTIME_ENV_UINT64_FIELD_COUNT =
     RUNTIME_ENV_SCALAR_FIELD_COUNT + RUNTIME_ENV_PER_RING_FIELD_GROUPS * RUNTIME_ENV_RING_COUNT;
 
+// Capacity of the per-func reference-duration table carried for the sim-only
+// trace-driven replay feature (see use_example_exec_time below). func_ids at or
+// above this are simply not eligible for the feature (run for real).
+inline constexpr int CALLCONFIG_MAX_EXAMPLE_FUNCS = 64;
+
 #pragma pack(push, 1)
 // Per-task runtime-environment overrides — the programmatic equivalent of the
 // `PTO2_RING_*` env vars, grouped under their own sub-struct so they read as a
@@ -126,7 +131,20 @@ struct CallConfig {
     int32_t enable_pmu = 0;  // 0 = disabled; >0 = enabled, value selects event type
     int32_t enable_dep_gen = 0;
     int32_t enable_scope_stats = 0;  // writes <output_prefix>/scope_stats/scope_stats.jsonl
-    RuntimeEnv runtime_env;          // per-task PTO2_RING_* overrides
+    // Sim-only trace-driven replay. ONLY fully_distributed_within_core implements
+    // it; every other runtime must reject use_example_exec_time != 0 (the
+    // scene-test layer enforces this so no other runtime needs to adapt). When on,
+    // execute_slot skips the real incore kernel and busy-waits
+    // example_exec_time_ns[func_id] instead, so a fast sim run reflects measured
+    // on-hardware kernel durations. 0 = off (kernels run for real, golden valid).
+    int32_t use_example_exec_time = 0;
+    RuntimeEnv runtime_env;  // per-task PTO2_RING_* overrides
+    // Per-func reference kernel duration in nanoseconds, indexed by func_id
+    // (int32 caps at ~2.1 s, ample for a kernel).
+    // 0 = unset: that func runs for real even under use_example_exec_time (so a
+    // partially-annotated CALLABLE still works). Consumed only when
+    // use_example_exec_time != 0.
+    int32_t example_exec_time_ns[CALLCONFIG_MAX_EXAMPLE_FUNCS] = {};
     char output_prefix[1024] = {};
 
     bool diagnostics_any() const noexcept {
@@ -154,6 +172,7 @@ struct CallConfig {
 #pragma pack(pop)
 static_assert(sizeof(RuntimeEnv) == RUNTIME_ENV_UINT64_FIELD_COUNT * sizeof(uint64_t), "RuntimeEnv wire layout drift");
 static_assert(
-    sizeof(CallConfig) == 7 * sizeof(int32_t) + RUNTIME_ENV_UINT64_FIELD_COUNT * sizeof(uint64_t) + 1024,
+    sizeof(CallConfig) ==
+        (8 + CALLCONFIG_MAX_EXAMPLE_FUNCS) * sizeof(int32_t) + RUNTIME_ENV_UINT64_FIELD_COUNT * sizeof(uint64_t) + 1024,
     "CallConfig wire layout drift"
 );
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 77b3a2dde..9dd3f6911 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -332,7 +332,8 @@ RunTiming ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args,
         device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, config.enable_l2_swimlane,
         config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.enable_scope_stats,
         config.runtime_env.ring_task_window, config.runtime_env.ring_heap, config.runtime_env.ring_dep_pool,
-        ring_task_windows, ring_heaps, ring_dep_pools, config.output_prefix, &timing
+        ring_task_windows, ring_heaps, ring_dep_pools, config.output_prefix, config.use_example_exec_time,
+        config.example_exec_time_ns, &timing
     );
     if (rc != 0) {
         throw std::runtime_error("run_prepared failed with code " + std::to_string(rc));
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index 6057960e2..901f68899 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -144,7 +144,7 @@ class ChipWorker {
     using PrepareCallableFn = int (*)(void *, int32_t, const void *);
     using RunPreparedFn = int (*)(
         void *, void *, int32_t, const void *, int, int, int, int, int, int, int, uint64_t, uint64_t, uint64_t,
-        const uint64_t *, const uint64_t *, const uint64_t *, const char *, PtoRunTiming *
+        const uint64_t *, const uint64_t *, const uint64_t *, const char *, int, const int32_t *, PtoRunTiming *
     );
     using UnregisterCallableFn = int (*)(void *, int32_t);
     using GetAicpuDlopenCountFn = size_t (*)(void *);
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index ca2ac3f07..d686d42d6 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -197,6 +197,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
  * Consumed by tensormap_and_ringbuffer only; other runtime variants accept
  * and ignore them.
  *
+ * `use_example_exec_time` + `example_exec_time_ns` drive the sim-only
+ * trace-driven replay feature: when nonzero, a supporting runtime busy-waits
+ * example_exec_time_ns[func_id] nanoseconds in place of the real kernel.
+ * Plumbed to the runtime via the weak runtime_apply_example_exec_time hook
+ * below; only fully_distributed_within_core implements it.
+ *
  * @return 0 on success, negative on error (no prep state, NULL ctx, etc.).
  */
 int run_prepared(
@@ -204,9 +210,21 @@ int run_prepared(
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
     int enable_scope_stats, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool,
     const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools,
-    const char *output_prefix, PtoRunTiming *out_timing
+    const char *output_prefix, int use_example_exec_time, const int32_t *example_exec_time_ns, PtoRunTiming *out_timing
 );
 
+/*
+ * Sim-only trace-driven replay hook. run_prepared calls this after binding the
+ * callable so a runtime that opts in can stash the per-func reference durations
+ * (example_exec_time_ns[func_id], nanoseconds) on its Runtime and busy-wait
+ * them instead of running the real kernel. example_exec_time_ns has
+ * CALLCONFIG_MAX_EXAMPLE_FUNCS entries. A weak no-op default lives in each
+ * platform's c_api_shared so runtimes that don't support the feature need no
+ * change; fully_distributed_within_core overrides it with a strong definition.
+ */
+extern "C" void
+runtime_apply_example_exec_time(void *runtime, int use_example_exec_time, const int32_t *example_exec_time_ns);
+
 /**
  * Drop the prepared state for `callable_id` and release the per-id share of
  * the device orch SO buffer. The buffer itself is freed only when its
diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp
new file mode 100644
index 000000000..9a3a1c337
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Matrix Multiplication Kernel (Cube Core)
+ *
+ * Computes: C = A @ B (TILE x TILE x TILE matmul)
+ * Uses TMATMUL instruction
+ *
+ * Args (Tensor*):
+ *   args[0] = A (INPUT)  - TILE x TILE
+ *   args[1] = B (INPUT)  - TILE x TILE
+ *   args[2] = C (OUTPUT) - TILE x TILE
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+#include <pto/common/pto_tile.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2) {
+    if (num_2 == 0) {
+        return 0;
+    }
+    return (num_1 + num_2 - 1) / num_2 * num_2;
+}
+
+static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) {
+    uint64_t total_elems = tensor->shapes[0];
+    return static_cast<int>(total_elems / tile_elems);
+}
+
+template <int TILE>
+static __aicore__ void matmul_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) {
+    constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float);
+    constexpr int M = CeilAlign<int>(TILE, 16);
+    constexpr int K = CeilAlign<int>(TILE, blockAlign);
+    constexpr int N = CeilAlign<int>(TILE, blockAlign);
+
+    using GlobalDataA =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataB =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataC =
+        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+
+    GlobalDataA src0Global(input_a);
+    GlobalDataB src1Global(input_b);
+    GlobalDataC dstGlobal(output);
+
+    using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<float, M, K, TILE, TILE>;
+    using RightTile = TileRight<float, K, N, TILE, TILE>;
+    using AccTile = TileAcc<float, M, N, TILE, TILE>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    TLOAD(aMatTile, src0Global);
+    TLOAD(bMatTile, src1Global);
+
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+    TMOV(aTile, aMatTile);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(dstGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    constexpr uint64_t TILE_ELEMS = 128 * 128;
+    int num_tiles = get_num_tiles(input_a, TILE_ELEMS);
+
+    __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset;
+    __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset;
+    __gm__ float *base_c = reinterpret_cast<__gm__ float *>(output->buffer.addr) + output->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *a_ptr = base_a + (tile_idx * TILE_ELEMS);
+        __gm__ float *b_ptr = base_b + (tile_idx * TILE_ELEMS);
+        __gm__ float *c_ptr = base_c + (tile_idx * TILE_ELEMS);
+
+        matmul_impl<128>(a_ptr, b_ptr, c_ptr);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp
new file mode 100644
index 000000000..d542c38b3
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Element-wise Tensor Addition Kernel
+ *
+ * Implements: out[i] = src0[i] + src1[i]
+ * Tile size: ROWS x COLS
+ *
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT)  - ROWS x COLS
+ *   args[1] = src1 (INPUT)  - ROWS x COLS
+ *   args[2] = out (OUTPUT)  - ROWS x COLS
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) {
+    uint64_t total_elems = tensor->shapes[0];
+    return static_cast<int>(total_elems / tile_elems);
+}
+
+template <int ROWS, int COLS>
+static __aicore__ void add_impl(__gm__ float *src0, __gm__ float *src1, __gm__ float *out) {
+    using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>;
+    using DynStridDim5 = Stride<1, 1, 1, COLS, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, ROWS, COLS, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(ROWS, COLS);
+    TileData src1Tile(ROWS, COLS);
+    TileData dstTile(ROWS, COLS);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    constexpr uint64_t TILE_ELEMS = 128 * 128;
+    int num_tiles = get_num_tiles(src0_tensor, TILE_ELEMS);
+
+    __gm__ float *base_src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *base_src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *src0_ptr = base_src0 + (tile_idx * TILE_ELEMS);
+        __gm__ float *src1_ptr = base_src1 + (tile_idx * TILE_ELEMS);
+        __gm__ float *out_ptr = base_out + (tile_idx * TILE_ELEMS);
+
+        add_impl<128, 128>(src0_ptr, src1_ptr, out_ptr);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
new file mode 100644
index 000000000..d08f7645b
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Alternating Matmul-Add Orchestration Function (tensormap_and_ringbuffer Runtime)
+ *
+ * Submits independent matmul and add tasks per batch.
+ *
+ * Configuration read from scalar args:
+ *   - batch: Number of batches
+ *   - M: Number of matmul tasks per batch
+ *   - N: Number of add tasks per batch
+ *   - matmul_batch: Number of matmul tiles per task group
+ *   - add_batch: Number of add tiles per task group
+ *
+ * Task pattern: interleaved [matmul_0, add_0, matmul_1, add_1, ...]
+ * All tasks are completely independent (no dependencies).
+ *
+ * Arg layout: [A, B, C, X, Y, Z, batch, M_val, N_val, matmul_batch, add_batch]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_MATMUL 0
+#define FUNC_ADD 1
+
+static constexpr uint64_t MATMUL_ELEMS = 128 * 128;
+static constexpr uint64_t ADD_ELEMS = 128 * 128;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 11,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // Tensor args
+    const Tensor &ext_A = orch_args.tensor(0).ref();
+    const Tensor &ext_B = orch_args.tensor(1).ref();
+    const Tensor &ext_C = orch_args.tensor(2).ref();
+    const Tensor &ext_X = orch_args.tensor(3).ref();
+    const Tensor &ext_Y = orch_args.tensor(4).ref();
+    const Tensor &ext_Z = orch_args.tensor(5).ref();
+
+    // Scalar config args
+    int batch = static_cast<int>(orch_args.scalar(0));
+    int M = static_cast<int>(orch_args.scalar(1));
+    int N = static_cast<int>(orch_args.scalar(2));
+    int matmul_batch = static_cast<int>(orch_args.scalar(3));
+    int add_batch = static_cast<int>(orch_args.scalar(4));
+
+    LOG_INFO_V0(
+        "[alternating_orch] Batch: %d, M: %d, N: %d, matmul_batch: %d, add_batch: %d", batch, M, N, matmul_batch,
+        add_batch
+    );
+
+    int total_matmul_tasks = batch * M;
+    int total_add_tasks = batch * N;
+    int num_matmul_groups = total_matmul_tasks / matmul_batch;
+    int num_add_groups = total_add_tasks / add_batch;
+
+    int total_matmul = 0;
+    int total_add = 0;
+
+    int max_groups = num_matmul_groups > num_add_groups ? num_matmul_groups : num_add_groups;
+
+    // Interleaved submit: matmul and add groups alternate
+    for (int group_idx = 0; group_idx < max_groups; group_idx++) {
+        if (group_idx < num_matmul_groups) {
+            int start_task_idx = group_idx * matmul_batch;
+            uint64_t offset = static_cast<uint64_t>(start_task_idx) * MATMUL_ELEMS;
+            uint64_t group_size = static_cast<uint64_t>(matmul_batch) * MATMUL_ELEMS;
+
+            uint32_t matmul_group_shapes[1] = {static_cast<uint32_t>(group_size)};
+            uint32_t view_offsets[1] = {static_cast<uint32_t>(offset)};
+
+            Tensor A_view = ext_A.view(matmul_group_shapes, view_offsets);
+            Tensor B_view = ext_B.view(matmul_group_shapes, view_offsets);
+            Tensor C_view = ext_C.view(matmul_group_shapes, view_offsets);
+
+            L0TaskArgs params_matmul;
+            params_matmul.add_input(A_view);
+            params_matmul.add_input(B_view);
+            params_matmul.add_output(C_view);
+            rt_submit_aic_task(FUNC_MATMUL, params_matmul);
+            total_matmul++;
+        }
+
+        if (group_idx < num_add_groups) {
+            int start_task_idx = group_idx * add_batch;
+            uint64_t offset = static_cast<uint64_t>(start_task_idx) * ADD_ELEMS;
+            uint64_t group_size = static_cast<uint64_t>(add_batch) * ADD_ELEMS;
+
+            uint32_t add_group_shapes[1] = {static_cast<uint32_t>(group_size)};
+            uint32_t view_offsets[1] = {static_cast<uint32_t>(offset)};
+
+            Tensor X_view = ext_X.view(add_group_shapes, view_offsets);
+            Tensor Y_view = ext_Y.view(add_group_shapes, view_offsets);
+            Tensor Z_view = ext_Z.view(add_group_shapes, view_offsets);
+
+            L0TaskArgs params_add;
+            params_add.add_input(X_view);
+            params_add.add_input(Y_view);
+            params_add.add_output(Z_view);
+            rt_submit_aiv_task(FUNC_ADD, params_add);
+            total_add++;
+        }
+    }
+
+    LOG_INFO_V9("[alternating_orch] Submitted %d matmul groups and %d add groups", total_matmul, total_add);
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py
new file mode 100644
index 000000000..70051d7ce
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Alternating matmul + add: interleaved AIC (matmul 128x128) and AIV (add 128x128) tasks.
+
+Tests AIC+AIV mixed execution with scalar parameters and batched task submission.
+C[b,m] = A[b,m] @ B[b,m], Z[b,n] = X[b,n] + Y[b,n].
+"""
+
+import ctypes
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestAlternatingMatmulAdd(SceneTestCase):
+    """Alternating matmul + add with scalar parameters."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/alternating_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/kernel_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"batch": 1, "M": 1, "N": 1, "matmul_batch": 1, "add_batch": 1},
+        },
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"batch": 500, "M": 4, "N": 4, "matmul_batch": 4, "add_batch": 4},
+            "manual": True,
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"batch": 512, "M": 2, "N": 5, "matmul_batch": 4, "add_batch": 5},
+            "manual": True,
+        },
+    ]
+
+    def generate_args(self, params):
+        batch = params["batch"]
+        M = params["M"]
+        N = params["N"]
+        matmul_batch = params.get("matmul_batch", 1)
+        add_batch = params.get("add_batch", 1)
+        matmul_size = 128
+        add_rows = 128
+        add_cols = 128
+
+        torch.manual_seed(42)
+        A = torch.randn(batch, M, matmul_size, matmul_size, dtype=torch.float32) * 0.01
+        B = torch.randn(batch, M, matmul_size, matmul_size, dtype=torch.float32) * 0.01
+        C = torch.zeros(batch, M, matmul_size, matmul_size, dtype=torch.float32)
+        X = torch.randn(batch, N, add_rows, add_cols, dtype=torch.float32) * 0.01
+        Y = torch.randn(batch, N, add_rows, add_cols, dtype=torch.float32) * 0.01
+        Z = torch.zeros(batch, N, add_rows, add_cols, dtype=torch.float32)
+
+        return TaskArgsBuilder(
+            Tensor("A", A.flatten()),
+            Tensor("B", B.flatten()),
+            Tensor("C", C.flatten()),
+            Tensor("X", X.flatten()),
+            Tensor("Y", Y.flatten()),
+            Tensor("Z", Z.flatten()),
+            Scalar("batch", ctypes.c_int64(batch)),
+            Scalar("M_val", ctypes.c_int64(M)),
+            Scalar("N_val", ctypes.c_int64(N)),
+            Scalar("matmul_batch", ctypes.c_int64(matmul_batch)),
+            Scalar("add_batch", ctypes.c_int64(add_batch)),
+        )
+
+    def compute_golden(self, args, params):
+        batch = params["batch"]
+        M = params["M"]
+        N = params["N"]
+        matmul_size = 128
+        add_rows = 128
+        add_cols = 128
+
+        A = args.A.reshape(batch, M, matmul_size, matmul_size)
+        B = args.B.reshape(batch, M, matmul_size, matmul_size)
+        C = args.C.reshape(batch, M, matmul_size, matmul_size)
+        X = args.X.reshape(batch, N, add_rows, add_cols)
+        Y = args.Y.reshape(batch, N, add_rows, add_cols)
+        Z = args.Z.reshape(batch, N, add_rows, add_cols)
+
+        for b in range(batch):
+            for m in range(M):
+                C[b, m] = torch.matmul(A[b, m], B[b, m])
+        for b in range(batch):
+            for n in range(N):
+                Z[b, n] = X[b, n] + Y[b, n]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..825665b70
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Batched PV Matmul Kernel: for each batch b, pij(M, K) @ vj(K, N) -> oi_new(M, N)
+//
+// Processes batch_count batches in a single kernel invocation.
+// Per-batch addresses are computed from global tensor bases + block_table lookup.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// Template: M=q_tile, K=block_size, N=head_dim
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_batch_impl(
+    __gm__ Tensor *pij_batch, __gm__ Tensor *value_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *oi_new_batch,
+    uint64_t batch_count, uint64_t block_idx, uint64_t block_num, uint64_t batch_start
+) {
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_batch->buffer.addr);
+    __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr);
+    __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new_batch->buffer.addr);
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        __gm__ bfloat16_t *pij_addr = pij_base + b * M * K;
+        int32_t phys_block = bt[(batch_start + b) * block_num + block_idx];
+        __gm__ bfloat16_t *vj_addr = val_base + static_cast<uint64_t>(phys_block) * K * N;
+        __gm__ float *oi_addr = oi_base + b * M * N;
+
+        GlobalA pijGlobal(pij_addr);
+        GlobalB vjGlobal(vj_addr);
+        GlobalOut oiGlobal(oi_addr);
+
+        TLOAD(aMatTile, pijGlobal);
+        TLOAD(bMatTile, vjGlobal);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(oiGlobal, cTile);
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij_batch = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *oi_new_batch = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t batch_count = static_cast<uint64_t>(args[4]);
+    uint64_t block_idx = static_cast<uint64_t>(args[5]);
+    uint64_t block_num = static_cast<uint64_t>(args[6]);
+    uint64_t batch_start = static_cast<uint64_t>(args[7]);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(pij_batch->shapes[0] / batch_count);
+    uint64_t block_size = static_cast<uint64_t>(pij_batch->shapes[1]);
+
+    if (q_tile_size == 16 && block_size <= 16) {
+        pv_matmul_batch_impl<16, 16, 16>(
+            pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
+        );
+    } else if (q_tile_size == 16) {
+        pv_matmul_batch_impl<16, 128, 128>(
+            pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
+        );
+    } else {
+        pv_matmul_batch_impl<64, 64, 128>(
+            pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
+        );
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..0bf394f93
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Batched QK Matmul Kernel: for each batch b, qi(M, K) @ kj.T(K, N) -> sij(M, N)
+//
+// Processes batch_count batches in a single kernel invocation.
+// Per-batch addresses are computed from global tensor bases + block_table lookup.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// Template: M=q_tile, K=head_dim, N=block_size
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_batch_impl(
+    __gm__ Tensor *query, __gm__ Tensor *key_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *sij_batch,
+    uint64_t batch_count, uint64_t block_idx, uint64_t q_offset, uint64_t block_num, uint64_t num_heads,
+    uint64_t batch_start
+) {
+    __gm__ bfloat16_t *query_base = reinterpret_cast<__gm__ bfloat16_t *>(query->buffer.addr);
+    __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr);
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_batch->buffer.addr);
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        __gm__ bfloat16_t *qi_addr = query_base + ((batch_start + b) * num_heads + q_offset) * K;
+        int32_t phys_block = bt[(batch_start + b) * block_num + block_idx];
+        __gm__ bfloat16_t *kj_addr = key_base + static_cast<uint64_t>(phys_block) * N * K;
+        __gm__ float *sij_addr = sij_base + b * M * N;
+
+        GlobalA qiGlobal(qi_addr);
+        GlobalB kjGlobal(kj_addr);
+        GlobalOut sijGlobal(sij_addr);
+
+        TLOAD(aMatTile, qiGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        TLOAD(bMatTile, kjGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        TMOV(aTile, aMatTile);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(sijGlobal, cTile);
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *query = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *sij_batch = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t batch_count = static_cast<uint64_t>(args[4]);
+    uint64_t block_idx = static_cast<uint64_t>(args[5]);
+    uint64_t q_offset = static_cast<uint64_t>(args[6]);
+    uint64_t block_num = static_cast<uint64_t>(args[7]);
+    uint64_t num_heads = static_cast<uint64_t>(args[8]);
+    uint64_t batch_start = static_cast<uint64_t>(args[9]);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_batch->shapes[0] / batch_count);
+    uint64_t block_size = static_cast<uint64_t>(sij_batch->shapes[1]);
+
+    if (q_tile_size == 16 && block_size <= 16) {
+        qk_matmul_batch_impl<16, 16, 16>(
+            query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads,
+            batch_start
+        );
+    } else if (q_tile_size == 16) {
+        qk_matmul_batch_impl<16, 128, 128>(
+            query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads,
+            batch_start
+        );
+    } else {
+        qk_matmul_batch_impl<64, 128, 64>(
+            query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads,
+            batch_start
+        );
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..b8955c3b5
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Batched Online Softmax Update + Normalize Kernel (AIV)
+//
+// Processes batch_count batches in a single kernel invocation.
+// For each batch b, updates accumulators mi/li/oi with new block's mij/lij/oi_new.
+// On is_last, normalizes and writes to the output tensor at the correct batch offset.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) -- q_tile=16, head_dim=128
+//   Case2: (64, 128) -- q_tile=64, head_dim=128
+//
+// Scalar layout strategy:
+//   M scalar floats stored contiguously in GM can be loaded as either:
+//   - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops
+//   - DN (kAlignedRows, 1) ColMajor for row-broadcast ops
+//   Conversion between layouts uses TRESHAPE (UB-internal, zero GM access).
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_batch_impl(
+    __gm__ Tensor *mij_batch, __gm__ Tensor *lij_batch, __gm__ Tensor *oi_new_batch, __gm__ Tensor *mi_batch,
+    __gm__ Tensor *li_batch, __gm__ Tensor *oi_batch, __gm__ Tensor *out, uint64_t is_first, uint64_t is_last,
+    uint64_t batch_count, uint64_t q_offset, uint64_t num_heads, uint64_t batch_start
+) {
+    __gm__ float *mij_base = reinterpret_cast<__gm__ float *>(mij_batch->buffer.addr);
+    __gm__ float *lij_base = reinterpret_cast<__gm__ float *>(lij_batch->buffer.addr);
+    __gm__ float *oi_new_base = reinterpret_cast<__gm__ float *>(oi_new_batch->buffer.addr);
+    __gm__ float *mi_base = reinterpret_cast<__gm__ float *>(mi_batch->buffer.addr);
+    __gm__ float *li_base = reinterpret_cast<__gm__ float *>(li_batch->buffer.addr);
+    __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_batch->buffer.addr);
+    __gm__ float *out_base = reinterpret_cast<__gm__ float *>(out->buffer.addr);
+
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float);
+
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    TileScalarND mijND, lijND, miND, liND;
+    TileScalarND miNewND, alphaND, betaND, tmpND;
+
+    TileScalarDN alphaDN, betaDN, liDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijND, 2 * kDataBytes);
+    TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes);
+    TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes);
+    TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes);
+    TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes);
+    TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes);
+    TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes);
+    TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes);
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        __gm__ float *mij_ptr = mij_base + b * M;
+        __gm__ float *lij_ptr = lij_base + b * M;
+        __gm__ float *oi_new_ptr = oi_new_base + b * M * N;
+        __gm__ float *mi_ptr = mi_base + b * M;
+        __gm__ float *li_ptr = li_base + b * M;
+        __gm__ float *oi_ptr = oi_base + b * M * N;
+        __gm__ float *dst_ptr = out_base + ((batch_start + b) * num_heads + q_offset) * N;
+
+        GlobalDataMxN oiNewGlobal(oi_new_ptr);
+        GlobalDataMxN oiGlobal(oi_ptr);
+        GlobalDataMxN dstGlobal(dst_ptr);
+
+        GlobalScalarND mijGlobalND(mij_ptr);
+        GlobalScalarND lijGlobalND(lij_ptr);
+        GlobalScalarND miGlobalND(mi_ptr);
+        GlobalScalarND liGlobalND(li_ptr);
+
+        if (is_first) {
+            TLOAD(oiNewTile, oiNewGlobal);
+            TLOAD(mijND, mijGlobalND);
+            TLOAD(lijND, lijGlobalND);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, mijND);
+            TSTORE(liGlobalND, lijND);
+            TSTORE(oiGlobal, oiNewTile);
+
+            if (is_last) {
+                TRESHAPE(liDN, lijND);
+                set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+                wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+                TROWEXPANDDIV(oiNewTile, oiNewTile, liDN);
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(dstGlobal, oiNewTile);
+            }
+        } else {
+            TLOAD(oiNewTile, oiNewGlobal);
+            TLOAD(oiTile, oiGlobal);
+            TLOAD(mijND, mijGlobalND);
+            TLOAD(lijND, lijGlobalND);
+            TLOAD(miND, miGlobalND);
+            TLOAD(liND, liGlobalND);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            TMAX(miNewND, miND, mijND);
+            pipe_barrier(PIPE_V);
+            TSUB(alphaND, miND, miNewND);
+            pipe_barrier(PIPE_V);
+            TEXP(alphaND, alphaND);
+            pipe_barrier(PIPE_V);
+            TSUB(betaND, mijND, miNewND);
+            pipe_barrier(PIPE_V);
+            TEXP(betaND, betaND);
+            pipe_barrier(PIPE_V);
+            TMUL(liND, alphaND, liND);
+            pipe_barrier(PIPE_V);
+            TMUL(tmpND, betaND, lijND);
+            pipe_barrier(PIPE_V);
+            TADD(liND, liND, tmpND);
+
+            TRESHAPE(alphaDN, alphaND);
+            TRESHAPE(betaDN, betaND);
+            if (is_last) {
+                TRESHAPE(liDN, liND);
+            }
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);
+            TSTORE(liGlobalND, liND);
+
+            TROWEXPANDMUL(oiTile, oiTile, alphaDN);
+            TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);
+            pipe_barrier(PIPE_V);
+            TADD(oiTile, oiTile, oiNewTile);
+
+            if (is_last) {
+                pipe_barrier(PIPE_V);
+                TROWEXPANDDIV(oiTile, oiTile, liDN);
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(dstGlobal, oiTile);
+            } else {
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(oiGlobal, oiTile);
+            }
+        }
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij_batch = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij_batch = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new_batch = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi_batch = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li_batch = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi_batch = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *out = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t batch_count = static_cast<uint64_t>(args[9]);
+    uint64_t q_offset = static_cast<uint64_t>(args[10]);
+    uint64_t num_heads = static_cast<uint64_t>(args[11]);
+    uint64_t batch_start = static_cast<uint64_t>(args[12]);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(mij_batch->shapes[0] / batch_count);
+    uint64_t head_dim = static_cast<uint64_t>(oi_new_batch->shapes[1]);
+
+    if (q_tile_size == 16 && head_dim <= 16) {
+        online_update_batch_impl<16, 16>(
+            mij_batch, lij_batch, oi_new_batch, mi_batch, li_batch, oi_batch, out, is_first, is_last, batch_count,
+            q_offset, num_heads, batch_start
+        );
+    } else if (q_tile_size == 16) {
+        online_update_batch_impl<16, 128>(
+            mij_batch, lij_batch, oi_new_batch, mi_batch, li_batch, oi_batch, out, is_first, is_last, batch_count,
+            q_offset, num_heads, batch_start
+        );
+    } else {
+        online_update_batch_impl<64, 128>(
+            mij_batch, lij_batch, oi_new_batch, mi_batch, li_batch, oi_batch, out, is_first, is_last, batch_count,
+            q_offset, num_heads, batch_start
+        );
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..3ce77eaa0
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Batched Softmax Preparation Kernel (AIV)
+//
+// Processes batch_count batches in a single kernel invocation.
+// For each batch b at block_idx bn:
+//   valid_len = min(N, context_lens[b] - bn * N)
+//   sij_masked = pad(sij[b], valid_len, -inf)
+//   sij_scale  = sij_masked * scale
+//   mij[b]     = row_max(sij_scale)
+//   pij[b]     = exp(sij_scale - mij[b])  (truncated to bf16 then back)
+//   lij[b]     = row_sum(pij[b])
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) -- q_tile=16, block_size=128
+//   Case2: (64, 64)  -- q_tile=64, block_size=64
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_batch_impl(
+    __gm__ Tensor *sij_batch, __gm__ Tensor *context_lens_t, __gm__ Tensor *pij_batch, __gm__ Tensor *mij_batch,
+    __gm__ Tensor *lij_batch, float scale_value, uint64_t batch_count, uint64_t block_idx, uint64_t batch_start
+) {
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_batch->buffer.addr);
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_batch->buffer.addr);
+    __gm__ float *mij_base = reinterpret_cast<__gm__ float *>(mij_batch->buffer.addr);
+    __gm__ float *lij_base = reinterpret_cast<__gm__ float *>(lij_batch->buffer.addr);
+    __gm__ int32_t *ctx_lens = reinterpret_cast<__gm__ int32_t *>(context_lens_t->buffer.addr);
+
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    TileVecMxN sijTile;
+    TileSijPad sijPadTile;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileScalarDN maxTile;
+    TileScalarDN sumTile;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    TASSIGN(sijTile, 0x0);
+    TASSIGN(sijPadTile, 0x0);
+    TASSIGN(pijTile, M * N * sizeof(float));
+    TASSIGN(tmpTile, 2 * M * N * sizeof(float));
+    TASSIGN(maxTile, 3 * M * N * sizeof(float));
+    TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
+    TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        int32_t cur_seq = ctx_lens[batch_start + b];
+        uint64_t start = block_idx * N;
+        uint64_t valid_len = 0;
+        if (start < static_cast<uint64_t>(cur_seq)) {
+            uint64_t remaining = static_cast<uint64_t>(cur_seq) - start;
+            valid_len = (remaining < N) ? remaining : N;
+        }
+
+        __gm__ float *sij_addr = sij_base + b * M * N;
+        __gm__ bfloat16_t *pij_addr = pij_base + b * M * N;
+        __gm__ float *mij_addr = mij_base + b * M;
+        __gm__ float *lij_addr = lij_base + b * M;
+
+        GlobalDataMxN sijGlobal(sij_addr);
+        GlobalDataMxN_bf16 pijGlobal(pij_addr);
+        GlobalScalarDN mijGlobal(mij_addr);
+        GlobalScalarDN lijGlobal(lij_addr);
+
+        if (valid_len == 0) {
+            // Block entirely beyond sequence: write mij=-1e30, lij=0, pij=0
+            // Use -1e30 instead of -inf to avoid NaN in online_update (exp(-inf - (-inf)) = NaN)
+            constexpr float NEG_LARGE = -1e30f;
+            for (int i = 0; i < kAlignedRows; i++) {
+                maxTile.SetValue(i, NEG_LARGE);
+                sumTile.SetValue(i, 0.0f);
+            }
+            for (int i = 0; i < M * N; i++) {
+                pijBf16Tile.SetValue(i, static_cast<bfloat16_t>(0.0f));
+            }
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(mijGlobal, maxTile);
+            TSTORE(lijGlobal, sumTile);
+            TSTORE(pijGlobal, pijBf16Tile);
+
+            if (b + 1 < batch_count) {
+                pipe_barrier(PIPE_ALL);
+            }
+            continue;
+        }
+
+        TLOAD(sijTile, sijGlobal);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        TileSijDyn sijDynTile(static_cast<size_t>(valid_len));
+        TASSIGN(sijDynTile, 0x0);
+        TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+        pipe_barrier(PIPE_V);
+
+        TMULS(sijTile, sijTile, scale_value);
+        pipe_barrier(PIPE_V);
+        TROWMAX(maxTile, sijTile, tmpTile);
+        pipe_barrier(PIPE_V);
+        TROWEXPANDSUB(pijTile, sijTile, maxTile);
+        pipe_barrier(PIPE_V);
+        TEXP(pijTile, pijTile);
+        pipe_barrier(PIPE_V);
+        // Truncate pij to bf16 first, then compute lij from truncated values (matches golden)
+        TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        pipe_barrier(PIPE_V);
+        TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+        pipe_barrier(PIPE_V);
+        TROWSUM(sumTile, pijTile, tmpTile);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(pijGlobal, pijBf16Tile);
+        TSTORE(mijGlobal, maxTile);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        TSTORE(lijGlobal, sumTile);
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij_batch = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *context_lens_t = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *pij_batch = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mij_batch = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *lij_batch = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[5]);
+    float scale_value = scale_conv.f;
+    uint64_t batch_count = static_cast<uint64_t>(args[6]);
+    uint64_t block_idx = static_cast<uint64_t>(args[7]);
+    uint64_t batch_start = static_cast<uint64_t>(args[8]);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_batch->shapes[0] / batch_count);
+    uint64_t block_size = static_cast<uint64_t>(pij_batch->shapes[1]);
+
+    if (q_tile_size == 16 && block_size <= 16) {
+        softmax_prepare_batch_impl<16, 16>(
+            sij_batch, context_lens_t, pij_batch, mij_batch, lij_batch, scale_value, batch_count, block_idx, batch_start
+        );
+    } else if (q_tile_size == 16) {
+        softmax_prepare_batch_impl<16, 128>(
+            sij_batch, context_lens_t, pij_batch, mij_batch, lij_batch, scale_value, batch_count, block_idx, batch_start
+        );
+    } else {
+        softmax_prepare_batch_impl<64, 64>(
+            sij_batch, context_lens_t, pij_batch, mij_batch, lij_batch, scale_value, batch_count, block_idx, batch_start
+        );
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..1717ebc48
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Batch Paged Attention Orchestration Function - Production Scale
+ *
+ * Chunked batched architecture: the full batch is split into chunks of
+ * IN_CORE_BATCH size. Each chunk's QK/SF/PV/UP tasks are independent
+ * and can be scheduled to different cores in parallel.
+ *
+ * Task count = num_chunks * (1 + max_bn * 4), where
+ *   num_chunks = ceil(batch / IN_CORE_BATCH)
+ *
+ * For batch <= IN_CORE_BATCH, behavior is identical to the non-chunked version.
+ *
+ * Memory Layout:
+ *   Query: (batch * num_heads, head_dim) bf16
+ *   Key:   (total_blocks, block_size, head_dim) bf16 (stored as K^T for QK)
+ *   Value: (total_blocks, block_size, head_dim) bf16
+ *
+ * Per-chunk intermediate tensors (contiguous across chunk_bc dimension):
+ *   sij:     (chunk_bc * q_tile, block_size)  fp32
+ *   pij:     (chunk_bc * q_tile, block_size)  bf16
+ *   mij/lij: (chunk_bc * q_tile)              fp32
+ *   oi_new:  (chunk_bc * q_tile, head_dim)    fp32
+ *   oi:      (chunk_bc * q_tile, head_dim)    fp32  accumulator
+ *   mi/li:   (chunk_bc * q_tile)              fp32  accumulator
+ *
+ * Kernels receive global tensors + scalar metadata (including batch_start)
+ * and compute per-batch addresses internally.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <cinttypes>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // Read dimensions from tensor metadata
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    uint64_t scale_value = orch_args.scalar(0);
+
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (num_heads + q_tile - 1) / q_tile;
+    uint64_t elem_size = get_element_size(data_type);
+
+    LOG_INFO_V0("batch_paged_attention: batch=%" PRIu64 ", num_heads=%" PRIu64, batch, num_heads);
+
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+    uint64_t max_bn = 0;
+    for (uint64_t b = 0; b < batch; b++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_b = (cur_seq + block_size - 1) / block_size;
+        if (bn_b > max_bn) max_bn = bn_b;
+    }
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+    uint64_t kv_total_rows = total_blocks_count * block_size;
+    uint32_t key_cache_shapes[2] = {static_cast<uint32_t>(kv_total_rows), static_cast<uint32_t>(head_dim)};
+    uint32_t value_cache_shapes[2] = {static_cast<uint32_t>(kv_total_rows), static_cast<uint32_t>(head_dim)};
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32, true);
+
+    constexpr uint64_t IN_CORE_BATCH = 16;
+    uint64_t num_chunks = (batch + IN_CORE_BATCH - 1) / IN_CORE_BATCH;
+
+    for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+        uint64_t q_offset = q_idx * q_tile;
+
+        for (uint64_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) {
+            uint64_t chunk_bc = batch - chunk_idx * IN_CORE_BATCH;
+            if (chunk_bc > IN_CORE_BATCH) chunk_bc = IN_CORE_BATCH;
+            uint64_t batch_start = chunk_idx * IN_CORE_BATCH;
+
+            PTO2_SCOPE() {
+                uint32_t oi_acc_shapes[2] = {static_cast<uint32_t>(chunk_bc * q_tile), static_cast<uint32_t>(head_dim)};
+                uint32_t scalar_acc_shapes[1] = {static_cast<uint32_t>(chunk_bc * q_tile)};
+                TensorCreateInfo oi_batch_ci(oi_acc_shapes, 2, DataType::FLOAT32);
+                TensorCreateInfo scalar_acc_ci(scalar_acc_shapes, 1, DataType::FLOAT32);
+                TaskOutputTensors alloc_outs = alloc_tensors(oi_batch_ci, scalar_acc_ci, scalar_acc_ci);
+                const Tensor &oi_batch = alloc_outs.get_ref(0);
+                const Tensor &li_batch = alloc_outs.get_ref(1);
+                const Tensor &mi_batch = alloc_outs.get_ref(2);
+
+                // Inner-loop create infos: shapes are loop-invariant, hoist out of bn loop
+                uint32_t sij_shapes[2] = {static_cast<uint32_t>(chunk_bc * q_tile), static_cast<uint32_t>(block_size)};
+                uint32_t vec_shapes[1] = {static_cast<uint32_t>(chunk_bc * q_tile)};
+                uint32_t oi_new_shapes[2] = {static_cast<uint32_t>(chunk_bc * q_tile), static_cast<uint32_t>(head_dim)};
+                TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32);
+                TensorCreateInfo pij_ci(sij_shapes, 2, data_type);
+                TensorCreateInfo vec_ci(vec_shapes, 1, DataType::FLOAT32);
+                TensorCreateInfo oi_new_ci(oi_new_shapes, 2, DataType::FLOAT32);
+
+                for (uint64_t bn = 0; bn < max_bn; bn++) {
+                    PTO2_SCOPE() {
+                        L0TaskArgs params_qk;
+                        params_qk.add_input(query);
+                        params_qk.add_input(key_cache);
+                        params_qk.add_input(block_table);
+                        params_qk.add_output(sij_ci);
+                        params_qk.add_scalar(chunk_bc);
+                        params_qk.add_scalar(bn);
+                        params_qk.add_scalar(q_offset);
+                        params_qk.add_scalar(block_num);
+                        params_qk.add_scalar(num_heads);
+                        params_qk.add_scalar(batch_start);
+                        TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+                        const Tensor &sij_b = qk_outs.get_ref(0);
+
+                        L0TaskArgs params_sf;
+                        params_sf.add_input(sij_b);
+                        params_sf.add_input(context_lens);
+                        params_sf.add_output(pij_ci);
+                        params_sf.add_output(vec_ci);
+                        params_sf.add_output(vec_ci);
+                        params_sf.add_scalar(scale_value);
+                        params_sf.add_scalar(chunk_bc);
+                        params_sf.add_scalar(bn);
+                        params_sf.add_scalar(batch_start);
+                        TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+                        const Tensor &pij_b = sf_outs.get_ref(0);
+                        const Tensor &mij_b = sf_outs.get_ref(1);
+                        const Tensor &lij_b = sf_outs.get_ref(2);
+
+                        L0TaskArgs params_pv;
+                        params_pv.add_input(pij_b);
+                        params_pv.add_input(value_cache);
+                        params_pv.add_input(block_table);
+                        params_pv.add_output(oi_new_ci);
+                        params_pv.add_scalar(chunk_bc);
+                        params_pv.add_scalar(bn);
+                        params_pv.add_scalar(block_num);
+                        params_pv.add_scalar(batch_start);
+                        TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+                        const Tensor &oi_new_b = pv_outs.get_ref(0);
+
+                        uint64_t is_first = (bn == 0) ? 1 : 0;
+                        uint64_t is_last = (bn == max_bn - 1) ? 1 : 0;
+                        L0TaskArgs params_up;
+                        params_up.add_input(mij_b);
+                        params_up.add_input(lij_b);
+                        params_up.add_input(oi_new_b);
+                        params_up.add_inout(mi_batch);
+                        params_up.add_inout(li_batch);
+                        params_up.add_inout(oi_batch);
+                        params_up.add_inout(out);
+                        params_up.add_scalar(is_first);
+                        params_up.add_scalar(is_last);
+                        params_up.add_scalar(chunk_bc);
+                        params_up.add_scalar(q_offset);
+                        params_up.add_scalar(num_heads);
+                        params_up.add_scalar(batch_start);
+                        rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                    }
+                }
+            }
+        }
+    }
+
+    LOG_INFO_V0(
+        "batch_paged_attention: %" PRIu64 " tasks (batch=%" PRIu64 ", max_bn=%" PRIu64 ", chunks=%" PRIu64
+        ", IN_CORE_BATCH=%" PRIu64 ")",
+        static_cast<uint64_t>(num_chunks * (1 + max_bn * 4)), batch, max_bn, num_chunks, IN_CORE_BATCH
+    );
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py
new file mode 100644
index 000000000..f36391d77
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Batch paged attention: batched online softmax with AIC/AIV subgraph splitting (bfloat16)."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestBatchPagedAttention(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 31,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseSmall3",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "manual": True,
+            "params": {
+                "batch": 2,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "context_lens_list": [33, 17],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq4",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 9},
+            "manual": True,
+            "params": {
+                "batch": 4,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "context_lens_list": [33, 64, 128, 15],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp
new file mode 100644
index 000000000..eb9340bf8
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Many-to-one barrier via explicit set_dependencies — exercises the dep_gen
+ * overflow chain wire format.
+ *
+ * Submits N producers each writing X[0] = 42.0, then a dummy_T whose only
+ * dependency surface is set_dependencies({all N producer ids}, N), then a
+ * consumer that explicit-depends on the barrier and copies X[0] -> Y[0].
+ *
+ * Picking N > DEP_GEN_MAX_EXPLICIT_DEPS (=64) forces the dep_gen capture to
+ * spill into one or more DepGenOverflowRecord slots; picking N to span the
+ * 64 + k*326 boundaries exercises both single- and multi-overflow chains.
+ *
+ * Args layout: [X, Y, scalar(N)]
+ *   - X: every producer writes it (tensormap auto-deps the chain so the
+ *        SENTINEL is preserved); consumer reads it.
+ *   - Y: consumer writes it; host checks Y[0] == SENTINEL.
+ *
+ * Scalar: N (1 .. MAX_PRODUCERS).
+ */
+
+#include <cstdint>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_WRITE_CONST 0
+#define FUNC_COPY_FIRST 1
+
+// Stack room for producer_ids[]. 500 covers everything we expect to test;
+// PTO2_DEP_LIST_POOL_SIZE (16384) is the real ceiling on a per-ring basis.
+static constexpr int32_t MAX_PRODUCERS = 500;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,  // X, Y, scalar(N)
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_X = orch_args.tensor(0).ref();
+    const Tensor &ext_Y = orch_args.tensor(1).ref();
+
+    uint64_t n_raw = orch_args.scalar(0);
+    int32_t n = static_cast<int32_t>(n_raw);
+    if (n < 1 || n > MAX_PRODUCERS) {
+        rt_report_fatal(PTO2_ERROR_INVALID_ARGS, "chain_barrier_orch: invalid n=%d", n);
+        return;
+    }
+
+    PTO2TaskId producer_ids[MAX_PRODUCERS];
+
+    // N producers each INOUT X. tensormap auto-deps them in a chain, so X[0]
+    // stays at SENTINEL through all of them — the host only checks the final
+    // value, which proves the barrier waited for every producer to finish.
+    for (int32_t i = 0; i < n; i++) {
+        L0TaskArgs args;
+        args.add_inout(ext_X);
+        producer_ids[i] = rt_submit_aic_task(FUNC_WRITE_CONST, args).task_id();
+    }
+
+    // Dummy barrier with explicit deps on ALL N producers. dc=n > 64 forces
+    // the dep_gen writer to emit base + overflow chain.
+    PTO2TaskId barrier_id;
+    {
+        L0TaskArgs args;
+        args.set_dependencies(producer_ids, n);
+        barrier_id = rt_submit_dummy_task(args).task_id();
+    }
+
+    // Consumer: explicit dep on barrier only, reads X, writes Y.
+    {
+        L0TaskArgs args;
+        PTO2TaskId consumer_deps[] = {barrier_id};
+        args.set_dependencies(consumer_deps, 1);
+        args.add_input(ext_X);
+        args.add_inout(ext_Y);
+        rt_submit_aic_task(FUNC_COPY_FIRST, args);
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py
new file mode 100644
index 000000000..7377b545c
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""dep_gen capture + replay sim test.
+
+Re-runs the ``vector_example`` orchestration with ``--enable-dep-gen``.
+Verifies the end-to-end dep_gen pipeline on a2a3sim:
+
+  ``<output_prefix>/deps.json`` is produced by the host replay
+  (PTO2TensorMap replay → JSON edge list), and contains exactly the
+  6 edges documented in example_orchestration.cpp. The capture path
+  (host collector drains the device ring buffer into memory and feeds
+  the replay directly — no submit_trace.bin on disk) is exercised
+  implicitly: if it broke, deps.json would be empty or wrong.
+
+deps.json is now the sole source of truth for fanout edges — the device
+hot path no longer records L2SwimlaneAicpuTaskRecord::fanout[], so there is no
+"fanout ⊆ deps" cross-check to run. swimlane_converter.py joins
+deps.json into the Perfetto trace at post-process time.
+
+Compute correctness is delegated to the upstream ``vector_example`` test —
+this case re-uses the same orchestration to keep coverage focused on the
+capture+replay+validation pipeline.
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+
+KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+
+
+def _task_id(ring: int, local: int) -> int:
+    """Encode (ring_id, local_id) → 64-bit raw matching ``PTO2TaskId::raw`` —
+    keeps the bit layout (``(ring << 32) | local``) in one place rather than
+    repeating ``1 << 32`` arithmetic at every call site.
+    """
+    return (ring << 32) | local
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestDepGen(SceneTestCase):
+    """Vector example, run with dep_gen enabled, then verify submit_trace.bin."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def test_run(self, st_platform, st_worker, request):
+        # Run the standard scene-test loop, then assert dep_gen output for the
+        # cases that actually ran on this platform. Without this override, the
+        # pytest path silently passes when dep_gen is disabled in the AICPU
+        # build (the trace ring stays empty and deps.json is just `{"edges":[]}`)
+        # — the bug that prompted #742. Use the framework helper so the
+        # rounds-guard stays consistent with SceneTestCase.test_run (super()
+        # already warned, so warn=False here).
+        super().test_run(st_platform, st_worker, request)
+        if not self._effective_enable_dep_gen(request):
+            return
+        for case in self.CASES:
+            if st_platform in case.get("platforms", []):
+                self._post_validate(case)
+
+    def _post_validate(self, case):
+        """Skips if no per-case output_prefix dir exists (e.g. selector
+        skipped this case at pytest level). When the dir + deps.json are
+        present, assert that deps.json contains the 6 edges documented in
+        example_orchestration.cpp.
+        """
+        case_name = case["name"]
+        safe_label = _sanitize_for_filename(f"TestDepGen_{case_name}")
+        outputs = _outputs_dir()
+        matches = sorted(outputs.glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+        if not matches:
+            # No output_prefix dir — dep_gen flag wasn't on for this run; nothing
+            # to validate. Don't fail the test (the case itself already passed).
+            return
+        out_dir = matches[-1]
+
+        # ---- deps.json (host replay output — sole dep_gen artifact on disk) ----
+        # We only reach here with --enable-dep-gen on and rounds<=1 (the
+        # test_run gate via _effective_enable_dep_gen) AND an output dir present
+        # (the case actually ran). deps.json MUST therefore have been produced;
+        # its absence means the capture->reconcile->replay pipeline silently
+        # produced nothing (reconcile drops or replay failure) — exactly the
+        # regression this test exists to catch (#742). Fail loudly, don't skip.
+        deps_path = out_dir / "deps.json"
+        assert deps_path.exists(), (
+            f"--enable-dep-gen is on and {out_dir} exists, but deps.json was not produced "
+            f"— capture/reconcile/replay pipeline regression"
+        )
+        with deps_path.open() as f:
+            deps = json.load(f)
+        # Strided-Tensor schema: annotated edges with tasks[] / tensors[]
+        # sidecars carrying strided slice descriptors (start_offset +
+        # stride[]). Project annotated edges down to a (pred, succ) set for
+        # the existing structural checks; the annotation sanity check below
+        # verifies the tensor metadata path.
+        raw_edges = deps.get("edges", [])
+        deps_edges = set()
+        for e in raw_edges:
+            assert isinstance(e, dict), f"deps.json edge must be an object, got {type(e).__name__}: {e!r}"
+            pred, succ = e.get("pred"), e.get("succ")
+            if pred is None or succ is None:
+                continue
+            deps_edges.add((int(pred), int(succ)))
+
+        # example_orchestration.cpp comment block (verified by tracing the source):
+        #   t0: ring 0, local 0
+        #   t1..t4: ring 1, local 0..3  (inner manual scope → ring 1)
+        # Edges: t0->t1, t0->t2, t1->t3, t2->t3, t0->t4, t3->t4
+        t0 = _task_id(0, 0)
+        t1 = _task_id(1, 0)
+        t2 = _task_id(1, 1)
+        t3 = _task_id(1, 2)
+        t4 = _task_id(1, 3)
+        expected_edges = {(t0, t1), (t0, t2), (t1, t3), (t2, t3), (t0, t4), (t3, t4)}
+        missing = expected_edges - deps_edges
+        assert not missing, f"deps.json missing expected edges: {missing} (got {deps_edges})"
+        # Allow extra edges (creator-retention may add owner edges that don't appear
+        # in the comment's logical-dep view), but flag anything outside the task set.
+        valid_ids = {t0, t1, t2, t3, t4}
+        bad = {e for e in deps_edges if e[0] not in valid_ids or e[1] not in valid_ids}
+        assert not bad, f"deps.json contains edges referencing unknown task ids: {bad}"
+
+        # ---- Annotated-edge sanity ----
+        # Replay always emits the tensor-info sidecar; the differential check
+        # inside the replay would have failed the run before we got here if
+        # the annotated pass disagreed with compute_task_fanin. These
+        # assertions just confirm the schema actually carries the expected
+        # blocks (so e.g. a future "always write empty arrays" bug would
+        # surface here, not silently in a downstream viewer).
+        tasks = deps.get("tasks", [])
+        tensors = deps.get("tensors", [])
+        task_ids = {int(t["task_id"]) for t in tasks if "task_id" in t}
+        assert valid_ids <= task_ids, f"tasks[] missing expected ids: {valid_ids - task_ids}"
+        # Every non-explicit edge should reference a tensor_id present in
+        # tensors[]. EXPLICIT edges legitimately omit it.
+        tensor_ids = {int(t["tensor_id"]) for t in tensors if "tensor_id" in t}
+        for e in raw_edges:
+            if not isinstance(e, dict):
+                continue
+            source = e.get("source")
+            if source == "explicit":
+                continue
+            tid = e.get("tensor_id")
+            assert tid is not None and int(tid) in tensor_ids, (
+                f"edge {e.get('pred')}->{e.get('succ')} (source={source}) "
+                f"references tensor_id {tid} absent from tensors[]"
+            )
+            # Annotated edges must carry consumer-side strided slice info.
+            assert "consumer_shape" in e and "consumer_start_offset" in e and "consumer_strides" in e, (
+                f"edge {e.get('pred')}->{e.get('succ')} (source={source}) missing consumer_shape/start_offset/strides"
+            )
+
+        # ---- Tool smoke: deps_viewer (text) ----
+        # scene_test auto-generates deps_viewer.txt via _graph_case_dep_gen;
+        # smoke verifies it was produced and has the expected sections.
+        out_txt = out_dir / "deps_viewer.txt"
+        assert out_txt.exists(), f"scene_test auto-hook did not produce {out_txt}"
+        text = out_txt.read_text()
+        assert "SUMMARY" in text and "TASK INDEX" in text, "text deps graph missing expected sections"
+
+        for extra in (["--direction", "LR"], ["--engine", "dot"]):
+            bad = subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "simpler_setup.tools.deps_viewer",
+                    str(deps_path),
+                    "--format",
+                    "text",
+                    *extra,
+                ],
+                check=False,
+                timeout=60,
+                capture_output=True,
+                text=True,
+            )
+            assert bad.returncode != 0, f"text mode should reject {' '.join(extra)}"
+            assert "only valid with --format html" in bad.stderr
+
+        if shutil.which("dot"):
+            out_html = out_dir / "_smoke_deps.html"
+            subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "simpler_setup.tools.deps_viewer",
+                    str(deps_path),
+                    "--format",
+                    "html",
+                    "-o",
+                    str(out_html),
+                ],
+                check=True,
+                timeout=60,
+            )
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py
new file mode 100644
index 000000000..774ca0470
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""dep_gen overflow chain regression — submits with >64 explicit deps.
+
+A submit with explicit_dep_count > DEP_GEN_MAX_EXPLICIT_DEPS (=64) spills the
+extra deps into one or more DepGenOverflowRecord slots that overlay the same
+buffer ring. Before the chain wire format, dep_gen would silently truncate
+the tail in deps.json; this test verifies every explicit dep edge survives
+the round-trip writer → host collector → replay → deps.json.
+
+Test shape (chain_barrier_orch.cpp): N producers each INOUT X, then a dummy
+barrier `set_dependencies({all N producer ids})`, then a consumer
+`set_dependencies({barrier_id})` reading X and writing Y. With N spanning
+the {64, 65, 390, 391} boundaries we exercise:
+
+  - n=64: base only (no chain) — sanity baseline
+  - n=65: base + 1 overflow record (1 dep in overflow)
+  - n=200: base + 1 overflow (136 deps in overflow)
+  - n=391: base + 2 overflow (326 + 1 deps across two overflows)
+
+Validation: the barrier task in deps.json must have exactly N predecessors,
+all of which are the producer ids. The consumer must have one explicit
+predecessor — the barrier.
+"""
+
+import json
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+
+# Path is relative to this file's directory (the SceneTestCase build helper
+# resolves CALLABLE sources from there). dummy_task already ships the two
+# kernels we need (write_const + copy_first), so we reuse those instead of
+# duplicating the source.
+DUMMY_KERNELS = "../../dummy_task/kernels"
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestDepGenChain(SceneTestCase):
+    """dep_gen overflow chain: many-to-one barrier with >64 explicit deps."""
+
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/chain_barrier_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT, D.INOUT],  # X, Y; N goes as scalar
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "WRITE_CONST",
+                "source": f"{DUMMY_KERNELS}/aic/kernel_write_const.cpp",
+                "core_type": "aic",
+                # Single-AIC task with one INOUT tensor (args[0]). Declared so
+                # the tensor dump's per-subtask sum matches the payload.
+                "signature": [D.INOUT],
+            },
+            {
+                "func_id": 1,
+                "name": "COPY_FIRST",
+                "source": f"{DUMMY_KERNELS}/aic/kernel_copy_first.cpp",
+                "core_type": "aic",
+                # Single-AIC task: copies args[0] -> args[1] (IN, INOUT).
+                "signature": [D.IN, D.INOUT],
+            },
+        ],
+    }
+
+    # Sentinel must match kernel_write_const (writes 42.0f).
+    SENTINEL = 42.0
+    INIT_VAL = -1.0
+
+    CASES = [
+        {
+            "name": "n_64_no_chain",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"n": 64},
+        },
+        {
+            "name": "n_65_single_overflow",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"n": 65},
+        },
+        {
+            "name": "n_200_single_overflow",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"n": 200},
+        },
+        {
+            "name": "n_391_two_overflow",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"n": 391},
+        },
+    ]
+
+    def generate_args(self, params):
+        # Single-element tensors are enough — kernel_write_const writes index 0
+        # and kernel_copy_first reads index 0.
+        x = torch.full((16,), self.INIT_VAL, dtype=torch.float32)
+        y = torch.full((16,), self.INIT_VAL, dtype=torch.float32)
+        return TaskArgsBuilder(
+            Tensor("x", x),
+            Tensor("y", y),
+            Scalar("n", int(params["n"])),
+        )
+
+    def compute_golden(self, args, params):
+        # Producers each write SENTINEL to X[0]; consumer copies X[0] -> Y[0].
+        # If the barrier didn't actually wait for all producers, the consumer
+        # could race ahead and copy INIT_VAL instead — making the host check
+        # a defacto sanity gate even before we look at deps.json.
+        args.x[0] = self.SENTINEL
+        args.y[0] = self.SENTINEL
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        if not self._effective_enable_dep_gen(request):
+            return
+        for case in self.CASES:
+            if st_platform in case.get("platforms", []):
+                self._post_validate(case)
+
+    def _post_validate(self, case):
+        """Verify every explicit dep edge survived the writer → replay round-trip.
+
+        With dep_gen on, deps.json must contain N edges from the producers to
+        the barrier task (one per `set_dependencies` entry the orchestration
+        emitted), plus the consumer's one explicit edge back from the barrier.
+        Pre-chain code would truncate the producer→barrier edge set to 16/64.
+        """
+        case_name = case["name"]
+        n = int(case["params"]["n"])
+        safe_label = _sanitize_for_filename(f"TestDepGenChain_{case_name}")
+        outputs = _outputs_dir()
+        matches = sorted(outputs.glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+        assert matches, f"no output dir for case {case_name!r} — scene didn't run on this platform?"
+        out_dir = matches[-1]
+        deps_path = out_dir / "deps.json"
+        # _post_validate is only invoked when dep_gen was effectively enabled;
+        # absence of deps.json means the host runner declined to emit it (most
+        # likely reconcile_counters failed). Surface that as a hard failure
+        # rather than silently passing — the whole point of this test is to
+        # catch chain-side reconciliation regressions.
+        assert deps_path.exists(), (
+            f"dep_gen was enabled but {deps_path} is missing. Likely cause: "
+            f"reconcile_counters() detected a count mismatch and suppressed deps.json emission. "
+            f"Check the run log for 'dep_gen reconcile' warnings."
+        )
+
+        with deps_path.open() as f:
+            deps = json.load(f)
+
+        raw_edges = deps.get("edges", [])
+        # Project annotated edges → (pred, succ) — we only care about graph
+        # structure here; the annot-vs-oracle agreement gate already ran
+        # inside the replay before deps.json was written.
+        edges = set()
+        explicit_edges = set()
+        for e in raw_edges:
+            if not isinstance(e, dict):
+                continue
+            pred, succ = e.get("pred"), e.get("succ")
+            if pred is None or succ is None:
+                continue
+            pair = (int(pred), int(succ))
+            edges.add(pair)
+            if e.get("source") == "explicit":
+                explicit_edges.add(pair)
+
+        # Identify the barrier task: it's the task with exactly n explicit-source
+        # incoming edges. (Producers have 0; consumer has 1 — the one to barrier.)
+        explicit_by_succ = {}
+        for pred, succ in explicit_edges:
+            explicit_by_succ.setdefault(succ, set()).add(pred)
+        barrier_candidates = [tid for tid, preds in explicit_by_succ.items() if len(preds) == n]
+        assert len(barrier_candidates) == 1, (
+            f"expected exactly one task with {n} explicit predecessors "
+            f"(the barrier), got {len(barrier_candidates)}: "
+            f"{[(tid, len(preds)) for tid, preds in explicit_by_succ.items()]}"
+        )
+        barrier_id = barrier_candidates[0]
+        barrier_preds = explicit_by_succ[barrier_id]
+
+        # All N producer→barrier edges must be present. This is the chain
+        # round-trip assertion: pre-chain code drops anything past index 63.
+        assert len(barrier_preds) == n, f"barrier has {len(barrier_preds)} preds, expected {n}"
+
+        # Consumer must explicit-depend on the barrier — exactly one outgoing
+        # explicit edge from the barrier.
+        outgoing_explicit_from_barrier = {succ for pred, succ in explicit_edges if pred == barrier_id}
+        assert len(outgoing_explicit_from_barrier) == 1, (
+            f"barrier {barrier_id} has {len(outgoing_explicit_from_barrier)} outgoing explicit edges, "
+            f"expected 1 (the consumer)"
+        )
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py
new file mode 100644
index 000000000..ad03ca31b
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py
new file mode 100644
index 000000000..13efeadd2
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py
@@ -0,0 +1,240 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Shared l2_swimlane post-case validation.
+
+The vector_example and paged_attention swimlane tests run the same capture →
+tool smoke → differential gate sequence; the only difference between them is
+the workload itself. The helpers below are workload-agnostic so each test
+file owns only its CALLABLE + cases.
+
+The differential gate is the load-bearing assertion: it parses the script's
+printed Pop / Fanout / Fanin totals and cross-checks them against an oracle
+computed straight from the raw artifacts. The paged_attention test exercises
+the per-task dedup branch in ``compute_dag_stats_from_deps`` because mixed
+AIC+AIV tasks produce multiple perf rows per ``task_id``.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+from simpler_setup.tools.swimlane_converter import read_perf_data
+
+_REQUIRED_TASK_FIELDS = (
+    "task_id",
+    "func_id",
+    "core_id",
+    "core_type",
+    "start_time_us",
+    "end_time_us",
+    # receive_time_us / local_setup_us are populated unconditionally by the
+    # AICore-side capture (v3 schema). propagation_us requires AICPU dispatch_ts
+    # and is therefore only present at level≥2 — not in this required-set.
+    "receive_time_us",
+    "local_setup_us",
+)
+
+
+def validate_perf_artifact(case_label: str, *, expected_task_count: int | None = None) -> None:
+    """Locate the latest output dir for ``case_label`` and run the full
+    capture-→-tools-→-differential sequence.
+
+    Args:
+        case_label: full SceneTest case label (``f"{cls_name}_{case_name}"``)
+            used to glob the per-case ``outputs/<label>_<ts>/`` directory.
+        expected_task_count: when provided, assert ``len(tasks) == N``.
+            Workloads whose task count varies with sim/onboard timing should
+            leave this ``None`` and rely on the differential gate.
+    """
+    safe_label = _sanitize_for_filename(case_label)
+    matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+    if not matches:
+        return
+    perf = matches[-1] / "l2_swimlane_records.json"
+    assert perf.exists(), f"l2_swimlane_records.json missing under {matches[-1]} — swimlane capture failed?"
+
+    # Read via the swimlane_converter loader so v2 host JSON gets joined into
+    # the v1-shaped dict the rest of this validator (and the differential
+    # oracle below) expects. Direct json.load(perf) would see only raw
+    # aicore_tasks / aicpu_tasks arrays under v2.
+    data = read_perf_data(perf)
+    assert data.get("l2_swimlane_level") in (1, 2, 3, 4), (
+        f"unexpected l2_swimlane_level: {data.get('l2_swimlane_level')}"
+    )
+    tasks = data.get("tasks")
+    assert isinstance(tasks, list), "tasks field missing or not a list"
+    assert len(tasks) > 0, f"perf records empty under {perf}"
+    if expected_task_count is not None:
+        assert len(tasks) == expected_task_count, (
+            f"got {len(tasks)} perf records, expected {expected_task_count} under {perf}"
+        )
+    # Spot-check a single record's required fields — guards against drift in
+    # the swimlane schema that swimlane_converter.py / deps_viewer.py rely on.
+    first = tasks[0]
+    for key in _REQUIRED_TASK_FIELDS:
+        assert key in first, f"perf record missing required field '{key}': {first}"
+
+    # ---- Tool smoke: swimlane_converter ----
+    # Exit-code-only check; we don't validate the Perfetto JSON content. A
+    # schema change that breaks the converter fires here in the same CI
+    # step that produced the artifact.
+    subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "simpler_setup.tools.swimlane_converter",
+            str(perf),
+            "-o",
+            str(matches[-1] / "_smoke_swimlane.json"),
+        ],
+        check=True,
+        timeout=60,
+    )
+
+    # ---- Tool smoke: sched_overhead_analysis ----
+    # pop_hit / pop_miss come from the dispatch-phase extras the runtime writes
+    # (l2_swimlane_collector.cpp). The differential block below cross-validates
+    # the script's printed numbers against an independent oracle computed
+    # straight from the raw artifacts — any regression in either the runtime
+    # capture path or the parser arithmetic fails here in the same CI step
+    # that produced the data.
+    # sched_overhead_analysis now REQUIRES the DAG (deps.json). The l2_swimlane
+    # CI smoke captures it alongside the perf JSON (--enable-dep-gen), so pass it
+    # explicitly. (For accurate user-facing timing, deps must be a SEPARATE
+    # capture — dep_gen perturbs timing — but the count-based differential below
+    # is timing-independent, so the co-run smoke is fine here.)
+    sched_cmd = [
+        sys.executable,
+        "-m",
+        "simpler_setup.tools.sched_overhead_analysis",
+        "--l2-swimlane-records-json",
+        str(perf),
+    ]
+    deps_sibling = Path(perf).parent / "deps.json"
+    if deps_sibling.exists():
+        sched_cmd += ["--deps-json", str(deps_sibling)]
+    result = subprocess.run(sched_cmd, check=True, timeout=120, capture_output=True, text=True)
+    for header in ("Part 1:", "Part 2:", "Part 5:", "Part 6:"):
+        assert header in result.stdout, f"sched_overhead missing section header '{header}'\nstdout:\n{result.stdout}"
+    # Bad pattern: AICPU didn't capture real cycle counters → tool "succeeds"
+    # but every metric is 0. Match the loop-iteration line printed unconditionally
+    # in Part 6 and assert its value is non-zero (reported in ns now).
+    m = re.search(r"Avg scheduler loop iteration:\s+([\d.]+)\s+ns", result.stdout)
+    assert m, f"sched_overhead stdout missing 'Avg scheduler loop iteration'\nstdout:\n{result.stdout}"
+    assert float(m.group(1)) > 0.0, (
+        f"sched_overhead reports zero loop iteration (avg_loop_us={m.group(1)}). "
+        f"AICPU likely didn't capture dispatch_time/finish_time cycle counters — "
+        f"the L2 perf collector path may have regressed.\nstdout:\n{result.stdout}"
+    )
+    verify_sched_overhead_differential(result.stdout, data, matches[-1])
+
+
+def verify_sched_overhead_differential(stdout: str, perf: dict, artifact_dir: Path) -> None:
+    """Cross-check the script's printed Pop / Fanout / Fanin totals against
+    an oracle computed independently from the raw artifacts. The script and
+    the oracle should agree exactly — if they don't, either the runtime
+    capture regressed or the parser arithmetic drifted, and the bug is
+    caught in the same CI step that produced the data.
+
+    Args:
+        stdout: captured ``sched_overhead_analysis`` stdout.
+        perf: parsed ``l2_swimlane_records.json`` dict — passed in by the caller
+            so we don't re-read multi-MB profiling artifacts here.
+        artifact_dir: per-case output directory. ``deps.json`` is looked up
+            beside the perf JSON; absent → fanout / fanin half is skipped.
+
+    The per-task dedup branch is exercised on mixed AIC+AIV workloads where
+    the perf JSON emits one row per subtask/core for a single ``task_id``.
+    """
+    # Oracle: pop_hit / pop_miss are the sum across all dispatch records.
+    # Compares against the "Pop: hit=N, miss=M" line the script prints.
+    phases = perf.get("aicpu_scheduler_phases", [])
+    oracle_pop_hit = sum(r.get("pop_hit", 0) for thr_recs in phases for r in thr_recs if r.get("phase") == "dispatch")
+    oracle_pop_miss = sum(r.get("pop_miss", 0) for thr_recs in phases for r in thr_recs if r.get("phase") == "dispatch")
+    pop_match = re.search(r"Pop:\s*hit=(\d+),\s*miss=(\d+)", stdout)
+    assert pop_match, f"sched_overhead stdout missing 'Pop: hit=N, miss=M' line\nstdout:\n{stdout}"
+    printed_pop_hit, printed_pop_miss = int(pop_match.group(1)), int(pop_match.group(2))
+    assert printed_pop_hit == oracle_pop_hit, (
+        f"Pop hit mismatch: printed={printed_pop_hit}, oracle={oracle_pop_hit} "
+        f"(summed from dispatch-record extras)\nstdout:\n{stdout}"
+    )
+    assert printed_pop_miss == oracle_pop_miss, (
+        f"Pop miss mismatch: printed={printed_pop_miss}, oracle={oracle_pop_miss}\nstdout:\n{stdout}"
+    )
+
+    # Fanout / fanin differential — only meaningful when deps.json is
+    # colocated (i.e. --enable-dep-gen was also on). When absent, skip.
+    deps_path = artifact_dir / "deps.json"
+    if not deps_path.exists():
+        return
+    with deps_path.open() as f:
+        deps = json.load(f)
+    unique_edges = set()
+    for e in deps.get("edges", []):
+        try:
+            pred, succ = int(e["pred"]), int(e["succ"])
+        except (TypeError, ValueError, KeyError):
+            continue
+        if pred < 0:
+            pred &= (1 << 64) - 1
+        if succ < 0:
+            succ &= (1 << 64) - 1
+        unique_edges.add((pred, succ))
+
+    # Per-thread oracle: a task's fanout is billed to the thread that
+    # retired it (core_to_thread[task.core_id]). Sum across threads ==
+    # total edges (modulo unattributed tasks, e.g. alloc-only with no
+    # core_id). The script prints the sum-across-threads total.
+    core_to_thread = perf.get("core_to_thread") or []
+    edges_by_pred: dict[int, set[int]] = {}
+    edges_by_succ: dict[int, set[int]] = {}
+    for pred, succ in unique_edges:
+        edges_by_pred.setdefault(pred, set()).add(succ)
+        edges_by_succ.setdefault(succ, set()).add(pred)
+    # Dedup by task_id: mixed tasks emit one perf row per subtask/core.
+    oracle_fanout = 0
+    oracle_fanin = 0
+    seen_tids: set[int] = set()
+    for task in perf.get("tasks", []):
+        cid = task.get("core_id")
+        if not isinstance(cid, int) or not (0 <= cid < len(core_to_thread)):
+            continue
+        if core_to_thread[cid] < 0:
+            continue
+        try:
+            tid = int(task["task_id"])
+        except (TypeError, ValueError, KeyError):
+            continue
+        if tid < 0:
+            tid &= (1 << 64) - 1
+        if tid in seen_tids:
+            continue
+        seen_tids.add(tid)
+        oracle_fanout += len(edges_by_pred.get(tid, ()))
+        oracle_fanin += len(edges_by_succ.get(tid, ()))
+
+    fanout_match = re.search(r"Fanout \(.*?\):\s*total edges=(\d+),\s*max_degree=(\d+)", stdout)
+    fanin_match = re.search(r"Fanin\s+\(.*?\):\s*total edges=(\d+),\s*max_degree=(\d+)", stdout)
+    assert fanout_match, f"sched_overhead stdout missing 'Fanout' line\nstdout:\n{stdout}"
+    assert fanin_match, f"sched_overhead stdout missing 'Fanin' line\nstdout:\n{stdout}"
+    printed_fanout = int(fanout_match.group(1))
+    printed_fanin = int(fanin_match.group(1))
+    assert printed_fanout == oracle_fanout, (
+        f"Fanout edges mismatch: printed={printed_fanout}, oracle={oracle_fanout} "
+        f"(derived from {len(unique_edges)} unique deps.json edges + core_to_thread)\n"
+        f"stdout:\n{stdout}"
+    )
+    assert printed_fanin == oracle_fanin, (
+        f"Fanin edges mismatch: printed={printed_fanin}, oracle={oracle_fanin}\nstdout:\n{stdout}"
+    )
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/kernels/orchestration/chained_mix_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/kernels/orchestration/chained_mix_orch.cpp
new file mode 100644
index 000000000..333cabcbe
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/kernels/orchestration/chained_mix_orch.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Chained MIX orchestration — three MIX tasks where each step reads the
+ * previous step's output. Purpose-built for the l2_swimlane differential
+ * gate: produces MIX tasks (multiple perf rows per task_id) AND non-zero
+ * deps.json edges, so the ``seen_tids`` dedup in
+ * ``compute_dag_stats_from_deps`` has an arithmetically observable effect.
+ *
+ * Each MIX task runs ``aic_matmul`` (kernel_matmul.cpp, 128x128 GEMM) and
+ * ``aiv_add`` (kernel_add.cpp, elementwise add). Inputs are reshaped from
+ * the flat 16384-element tensors the test allocates.
+ *
+ * Arg layout (8 args, all 1-D 16384 float32 except workspaces which are
+ * 32768):
+ *   [A, B, D, E, ws_aic, ws_aiv, aic_out, aiv_out]
+ *
+ * Chain (each line is one MIX task; AIC ↑ AIV ↓):
+ *   step 1:  ws_aic[0:T]      ← matmul(A, B)              edges: (none)
+ *            ws_aiv[0:T]      ← add(D, E)
+ *   step 2:  ws_aic[T:2T]     ← matmul(ws_aic[0:T], B)    edges: 1→2 (×2 tensors)
+ *            ws_aiv[T:2T]     ← add(ws_aiv[0:T], E)
+ *   step 3:  aic_out          ← matmul(ws_aic[T:2T], B)   edges: 2→3 (×2 tensors)
+ *            aiv_out          ← add(ws_aiv[T:2T], E)
+ *
+ * dep_gen collapses the per-tensor flows to unique (pred, succ) pairs,
+ * so deps.json reports 2 edges: (step1, step2) and (step2, step3).
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_MATMUL 0  // AIC kernel — reads first 3 args of the MIX bundle
+#define FUNC_ADD 1     // AIV kernel — reads next 3 args of the MIX bundle
+
+static constexpr uint32_t TILE_ELEMS = 128 * 128;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 8,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_A = orch_args.tensor(0).ref();
+    const Tensor &ext_B = orch_args.tensor(1).ref();
+    const Tensor &ext_D = orch_args.tensor(2).ref();
+    const Tensor &ext_E = orch_args.tensor(3).ref();
+    const Tensor &ext_ws_aic = orch_args.tensor(4).ref();
+    const Tensor &ext_ws_aiv = orch_args.tensor(5).ref();
+    const Tensor &ext_aic_out = orch_args.tensor(6).ref();
+    const Tensor &ext_aiv_out = orch_args.tensor(7).ref();
+
+    uint32_t slot_shape[1] = {TILE_ELEMS};
+    uint32_t off_slot0[1] = {0};
+    uint32_t off_slot1[1] = {TILE_ELEMS};
+
+    Tensor ws_aic_slot0 = ext_ws_aic.view(slot_shape, off_slot0);
+    Tensor ws_aic_slot1 = ext_ws_aic.view(slot_shape, off_slot1);
+    Tensor ws_aiv_slot0 = ext_ws_aiv.view(slot_shape, off_slot0);
+    Tensor ws_aiv_slot1 = ext_ws_aiv.view(slot_shape, off_slot1);
+
+    LOG_INFO_V0("[chained_mix_orch] launching 3-step chained MIX (AIC + AIV)");
+
+    // Step 1: heads of both chains read external inputs.
+    {
+        MixedKernels mk;
+        mk.aic_kernel_id = FUNC_MATMUL;
+        mk.aiv0_kernel_id = FUNC_ADD;
+        L0TaskArgs args;
+        args.add_input(ext_A);
+        args.add_input(ext_B);
+        args.add_output(ws_aic_slot0);
+        args.add_input(ext_D);
+        args.add_input(ext_E);
+        args.add_output(ws_aiv_slot0);
+        rt_submit_task(mk, args);
+    }
+
+    // Step 2: AIC reads ws_aic_slot0 (step 1 AIC output) and AIV reads
+    // ws_aiv_slot0 (step 1 AIV output). Two tensors flow from step 1 to
+    // step 2; dep_gen collapses to a single (step1, step2) edge.
+    {
+        MixedKernels mk;
+        mk.aic_kernel_id = FUNC_MATMUL;
+        mk.aiv0_kernel_id = FUNC_ADD;
+        L0TaskArgs args;
+        args.add_input(ws_aic_slot0);
+        args.add_input(ext_B);
+        args.add_output(ws_aic_slot1);
+        args.add_input(ws_aiv_slot0);
+        args.add_input(ext_E);
+        args.add_output(ws_aiv_slot1);
+        rt_submit_task(mk, args);
+    }
+
+    // Step 3: writes the final user-visible outputs.
+    {
+        MixedKernels mk;
+        mk.aic_kernel_id = FUNC_MATMUL;
+        mk.aiv0_kernel_id = FUNC_ADD;
+        L0TaskArgs args;
+        args.add_input(ws_aic_slot1);
+        args.add_input(ext_B);
+        args.add_output(ext_aic_out);
+        args.add_input(ws_aiv_slot1);
+        args.add_input(ext_E);
+        args.add_output(ext_aiv_out);
+        rt_submit_task(mk, args);
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane.py b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane.py
new file mode 100644
index 000000000..56f371b7a
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""L2 swimlane profiling smoke — capture pipeline produces a usable
+``l2_swimlane_records.json``.
+
+Re-uses ``vector_example`` as a known-good 5-task AIV-only workload. When the
+``--enable-l2-swimlane`` flag is on, the helper in :mod:`_swimlane_validate`
+asserts schema, runs the converter / sched_overhead tool smokes, and fires a
+differential gate over Pop / Fanout / Fanin. Without the flag the assertions
+are skipped — the test still runs the case so the default ``pytest tests/st``
+invocation doesn't pay an extra step.
+
+A mixed AIC+AIV companion lives in ``test_l2_swimlane_mixed.py`` —
+that variant exercises the per-task dedup branch in
+``compute_dag_stats_from_deps`` which this AIV-only workload doesn't.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+from ._swimlane_validate import validate_perf_artifact
+
+KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+# example_orchestration.cpp issues 5 submit_task calls.
+_EXPECTED_TASK_COUNT = 5
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestL2Swimlane(SceneTestCase):
+    """Vector example with --enable-l2-swimlane, then assert l2_swimlane_records.json."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        if not request.config.getoption("--enable-l2-swimlane", default=0):
+            return
+        for case in self.CASES:
+            if st_platform in case["platforms"]:
+                validate_perf_artifact(f"TestL2Swimlane_{case['name']}", expected_task_count=_EXPECTED_TASK_COUNT)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane_mixed.py b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane_mixed.py
new file mode 100644
index 000000000..29014d0df
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane_mixed.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""L2 swimlane profiling on a chained MIX-task workload.
+
+Companion to ``test_l2_swimlane.py``. vector_example is AIV-only and emits
+one perf row per ``task_id`` — the dedup branch in
+``compute_dag_stats_from_deps`` (and the matching dedup in the oracle
+inside :mod:`_swimlane_validate`) sits idle. ``chained_mix_orch.cpp`` runs
+3 MIX tasks where each step's output feeds the next step's input, so
+the workload produces *both*:
+
+  - MIX rows: each MIX task_id emits one perf row per subtask/core
+  - deps.json edges: 2 unique (pred, succ) pairs from the chain
+
+That combination is what makes the dedup arithmetically observable. Without
+``seen_tids`` the oracle would compute fanout = 4 instead of 2 (each MIX
+task's fanout being counted once per perf row), and the differential gate
+would fire.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+from ._swimlane_validate import validate_perf_artifact
+
+_MATMUL_SIZE = 128
+_TILE_ELEMS = _MATMUL_SIZE * _MATMUL_SIZE
+# ws_aic / ws_aiv hold two intermediate slots — step 1's output (slot 0)
+# is read by step 2, step 2's output (slot 1) is read by step 3.
+_WS_SLOTS = 2
+_WS_ELEMS = _WS_SLOTS * _TILE_ELEMS
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestL2SwimlaneMixed(SceneTestCase):
+    """Chained MIX workload (3 steps, each step is AIC matmul + AIV add).
+
+    Step N reads workspace slot N-1 and writes workspace slot N. Step 3
+    writes the user-visible outputs. dep_gen collapses the multi-tensor
+    flow between adjacent steps into a single (pred, succ) edge per pair,
+    giving 2 unique edges across 3 MIX task_ids.
+    """
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/chained_mix_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.OUT, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "MATMUL",
+                "source": "../../mixed_example/kernels/aic/kernel_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "ADD",
+                "source": "../../mixed_example/kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        torch.manual_seed(42)
+        A = torch.randn(_MATMUL_SIZE, _MATMUL_SIZE, dtype=torch.float32) * 0.01
+        B = torch.randn(_MATMUL_SIZE, _MATMUL_SIZE, dtype=torch.float32) * 0.01
+        D_t = torch.randn(_TILE_ELEMS, dtype=torch.float32) * 0.01
+        E = torch.randn(_TILE_ELEMS, dtype=torch.float32) * 0.01
+
+        return TaskArgsBuilder(
+            Tensor("A", A.flatten()),
+            Tensor("B", B.flatten()),
+            Tensor("D", D_t),
+            Tensor("E", E),
+            Tensor("ws_aic", torch.zeros(_WS_ELEMS, dtype=torch.float32)),
+            Tensor("ws_aiv", torch.zeros(_WS_ELEMS, dtype=torch.float32)),
+            Tensor("aic_out", torch.zeros(_TILE_ELEMS, dtype=torch.float32)),
+            Tensor("aiv_out", torch.zeros(_TILE_ELEMS, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        A_mat = args.A.reshape(_MATMUL_SIZE, _MATMUL_SIZE)
+        B_mat = args.B.reshape(_MATMUL_SIZE, _MATMUL_SIZE)
+        # AIC chain: B applied three times via matmul.
+        s1_aic = torch.matmul(A_mat, B_mat)
+        s2_aic = torch.matmul(s1_aic, B_mat)
+        s3_aic = torch.matmul(s2_aic, B_mat)
+        # AIV chain: E added three times → D + 3E.
+        s1_aiv = args.D + args.E
+        s2_aiv = s1_aiv + args.E
+        s3_aiv = s2_aiv + args.E
+
+        args.aic_out[:] = s3_aic.flatten()
+        args.aiv_out[:] = s3_aiv
+        # Final workspace state — slot 0 holds step 1's output, slot 1
+        # holds step 2's output.
+        args.ws_aic[0:_TILE_ELEMS] = s1_aic.flatten()
+        args.ws_aic[_TILE_ELEMS:_WS_ELEMS] = s2_aic.flatten()
+        args.ws_aiv[0:_TILE_ELEMS] = s1_aiv
+        args.ws_aiv[_TILE_ELEMS:_WS_ELEMS] = s2_aiv
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        if not request.config.getoption("--enable-l2-swimlane", default=False):
+            return
+        for case in self.CASES:
+            if st_platform in case["platforms"]:
+                # Rely on the differential gate (Pop / Fanout / Fanin) —
+                # the chain produces 3 MIX task_ids × 2 subtask rows = 6
+                # perf rows and 2 deps.json edges, so the dedup branch in
+                # the oracle has an arithmetically observable effect.
+                validate_perf_artifact(f"TestL2SwimlaneMixed_{case['name']}")
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/pmu/test_pmu.py b/tests/st/a2a3/fully_distributed_within_core/dfx/pmu/test_pmu.py
new file mode 100644
index 000000000..e16e90ce3
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/pmu/test_pmu.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""PMU profiling smoke — capture pipeline produces a usable ``pmu.csv``.
+
+Re-uses ``vector_example`` (5 submit_task calls). With ``--enable-pmu N``
+the AICore counters land in ``<output_prefix>/pmu.csv``, one data row per
+task. The schema is fixed (see docs/dfx/pmu-profiling.md and
+src/a2a3/platform/shared/host/pmu_collector.cpp's "Build CSV header"
+block). Smoke asserts: file exists, header starts with the documented
+prefix, at least one data row present.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+
+KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+# Required leading columns — keep in sync with build_csv_header() in
+# pmu_collector.cpp. Counter columns follow these and vary per event_type.
+_REQUIRED_HEADER_PREFIX = ("thread_id", "core_id", "task_id", "func_id", "core_type", "pmu_total_cycles")
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPmu(SceneTestCase):
+    """Vector example with --enable-pmu, then assert pmu.csv."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        if not request.config.getoption("--enable-pmu", default=0):
+            return
+        for case in self.CASES:
+            if st_platform in case["platforms"]:
+                self._validate_pmu_artifact(case)
+
+    def _validate_pmu_artifact(self, case):
+        safe_label = _sanitize_for_filename(f"TestPmu_{case['name']}")
+        matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+        if not matches:
+            return
+        csv = matches[-1] / "pmu.csv"
+        assert csv.exists(), f"pmu.csv missing under {matches[-1]} — PMU capture failed?"
+        lines = csv.read_text().splitlines()
+        assert lines, "pmu.csv is empty"
+        header_cols = lines[0].split(",")
+        for col in _REQUIRED_HEADER_PREFIX:
+            assert col in header_cols, f"header missing required column '{col}': {header_cols}"
+        # At least one data row — sim runs all 5 vector_example tasks; expect ≥1
+        # to keep the assertion robust if a future scheduler change collapses
+        # / batches per-task PMU sampling.
+        assert len(lines) >= 2, f"pmu.csv has no data rows (only header): {lines}"
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/scope_stats/test_scope_stats.py b/tests/st/a2a3/fully_distributed_within_core/dfx/scope_stats/test_scope_stats.py
new file mode 100644
index 000000000..6968aa682
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/scope_stats/test_scope_stats.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""scope_stats smoke — capture pipeline produces a usable ``scope_stats.jsonl``.
+
+Re-uses ``vector_example`` (outer executor scope + one inner ``PTO2_SCOPE()``).
+With ``--enable-scope-stats`` the platform collector
+(``scope_stats_collector_aicpu.h``) appends one record per scope boundary
+(begin and end) into a pooled buffer that streams to the host, which writes
+NDJSON. Enabling the flag is the entire user surface for the new API — the
+runtime takes care of the ``set_pending_site`` / ``scope_stats_begin`` /
+``scope_stats_end`` calls. Schema lives in ``docs/dfx/scope-stats.md`` §3.
+
+Output (``scope_stats.jsonl``): line 1 is run metadata
+(``{"version":6,"fatal":bool,"dropped":uint,"total":uint}``); each subsequent
+line is one scope-boundary record carrying task/heap/dep_pool start-end.
+"""
+
+import json
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+
+KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+_REQUIRED_RECORD_FIELDS = {
+    "site",
+    "phase",
+    "depth",
+    "ring",
+    "task_window_start",
+    "task_window_end",
+    "heap_start",
+    "heap_end",
+    "dep_pool_start",
+    "dep_pool_end",
+    "tensormap",
+}
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestScopeStats(SceneTestCase):
+    """Vector example with --enable-scope-stats, then assert scope_stats.jsonl."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        if not request.config.getoption("--enable-scope-stats", default=False):
+            return
+        for case in self.CASES:
+            if st_platform in case["platforms"]:
+                self._validate_scope_stats_artifact(case)
+
+    def _validate_scope_stats_artifact(self, case):
+        safe_label = _sanitize_for_filename(f"TestScopeStats_{case['name']}")
+        matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+        assert matches, (
+            f"no output directory under {_outputs_dir()} matching {safe_label}_* — "
+            f"--enable-scope-stats was on but the run produced no per-case output dir"
+        )
+        path = matches[-1] / "scope_stats" / "scope_stats.jsonl"
+        assert path.exists(), f"scope_stats.jsonl missing under {matches[-1]} — collector finalize failed?"
+        lines = [ln for ln in path.read_text().splitlines() if ln.strip()]
+        assert lines, f"scope_stats.jsonl empty under {matches[-1]}"
+        meta = json.loads(lines[0])
+        assert meta.get("version") == 6, f"unexpected schema version: {meta!r}"
+        assert meta.get("fatal") is False, f"run latched fatal: {meta!r}"
+        assert meta.get("dropped", 0) == 0, f"records dropped on device: {meta!r}"
+        assert "dep_pool_max" in meta, f"metadata missing dep_pool_max: {meta!r}"
+        records = [json.loads(ln) for ln in lines[1:]]
+        # outer (executor) + inner PTO2_SCOPE, each emitting a begin and an end
+        # record → ≥4 records.
+        assert len(records) >= 4, f"expected ≥4 begin/end records, got {records!r}"
+        for rec in records:
+            assert _REQUIRED_RECORD_FIELDS <= rec.keys(), f"record missing fields: {rec!r}"
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp
new file mode 100644
index 000000000..d6131194c
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_a = orch_args.tensor(0).ref();
+    const Tensor &ext_b = orch_args.tensor(1).ref();
+    const Tensor &ext_f = orch_args.tensor(2).ref();
+
+    uint32_t size = orch_args.tensor(0).ref().shapes[0];
+    uint32_t inter_shapes[1] = {size};
+    TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32);
+
+    L0TaskArgs params_t0;
+    params_t0.add_input(ext_a);
+    params_t0.add_input(ext_b);
+    params_t0.add_output(inter_ci);
+    TaskOutputTensors outs_t0 = rt_submit_aiv_task(0, params_t0);
+    const Tensor &c = outs_t0.get_ref(0);
+
+    PTO2_SCOPE() {
+        L0TaskArgs params_t1;
+        params_t1.add_input(c);
+        params_t1.add_output(inter_ci);
+        float t1_addend = 1.0f;
+        uint32_t t1_count = 3u;
+        params_t1.add_scalar(t1_addend, t1_count);
+        // Partial dump, task granularity: no-arg dump() selects every tensor
+        // and scalar arg on this Arg.
+        params_t1.dump();
+        TaskOutputTensors outs_t1 = rt_submit_aiv_task(1, params_t1);
+        const Tensor &d = outs_t1.get_ref(0);
+
+        L0TaskArgs params_t2;
+        params_t2.add_input(c);
+        params_t2.add_output(inter_ci);
+        float t2_addend = 2.0f;
+        uint32_t t2_count = 3u;
+        params_t2.add_scalar(t2_addend, t2_count);
+        // Scalar-only selection: t2_count has the same value as t1_count
+        // but is left unmarked, so only t2_addend should be dumped.
+        params_t2.dump(t2_addend);
+        TaskOutputTensors outs_t2 = rt_submit_aiv_task(1, params_t2);
+        const Tensor &e = outs_t2.get_ref(0);
+
+        L0TaskArgs params_t3;
+        params_t3.add_input(d);
+        params_t3.add_input(e);
+        params_t3.add_output(inter_ci);
+        uint32_t t3_count = 3u;
+        params_t3.add_scalar(t3_count, t3_count);
+        // Mixed selection: input d + the output + one scalar. The scalar lvalue
+        // is added twice, so dump(t3_count) selects the first matching scalar
+        // arg and marks its JSON arg_index as ambiguous. Input e is left
+        // unmarked.
+        params_t3.dump(d, inter_ci, t3_count);
+        TaskOutputTensors outs_t3 = rt_submit_aiv_task(2, params_t3);
+        const Tensor &g = outs_t3.get_ref(0);
+
+        L0TaskArgs params_t4;
+        params_t4.add_input(g);
+        params_t4.add_input(c);
+        params_t4.add_output(ext_f);
+        // Tensor-only task granularity: no-arg dump() still selects every
+        // tensor arg on this Arg.
+        params_t4.dump();
+        rt_submit_aiv_task(0, params_t4);
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/test_tensor_dump.py b/tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/test_tensor_dump.py
new file mode 100644
index 000000000..ab6f40767
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/test_tensor_dump.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""args_dump profiling smoke — capture pipeline produces a usable
+``args_dump/`` directory.
+
+Re-uses ``vector_example`` (5 submit_task calls). With ``--dump-args`` the
+AICPU writer captures task dump records into a unified manifest + raw-byte
+payload pair under ``<output_prefix>/args_dump/``. Smoke asserts:
+manifest exists + parses, the ``bin_file`` field it names exists, entries
+use the unified schema, and no legacy args-only manifest is emitted.
+"""
+
+import json
+import subprocess
+import sys
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+
+KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestTensorDump(SceneTestCase):
+    """args_dump capture smoke, level-aware on the ``--dump-args`` level.
+
+    Uses ``partial_dump_orch`` (5 tasks; four carry ``dump(...)`` markers) so a
+    single orchestration exercises both modes:
+
+    - ``--dump-args 1`` (partial): only marked args are captured — task
+      ``0x..00`` via no-arg ``dump()`` (all tensor + scalar args), task
+      ``0x..01`` via ``dump(t2_addend)`` (scalar-only), task ``0x..02`` via
+      ``dump(d, inter_ci, t3_count)`` (mixed tensor + scalar, input ``e``
+      excluded), and task ``0x..03`` via no-arg ``dump()`` (all tensor args).
+      Mode is latched host-side before dispatch, so it is race-free regardless
+      of submission order.
+    - ``--dump-args 2`` (full): markers are ignored, every task is dumped.
+
+    The dump level comes straight from the CLI ``--dump-args`` value
+    (no per-case override).
+    """
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/partial_dump_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        level = int(request.config.getoption("--dump-args", default=0))
+        if not level:
+            return
+        safe_label = _sanitize_for_filename("TestTensorDump_default")
+        matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+        assert matches, "args dump output directory missing"
+        dump_dir = matches[-1] / "args_dump"
+        assert dump_dir.is_dir(), f"args_dump/ missing under {matches[-1]} — dump capture failed?"
+        manifest = dump_dir / "args_dump.json"
+        assert manifest.exists(), f"args_dump.json missing under {dump_dir} — collector finalize failed?"
+        with manifest.open() as f:
+            data = json.load(f)
+        bin_name = data.get("bin_file")
+        tensors = data.get("args", [])
+        assert tensors, f"args_dump.json has no entries: {data}"
+        if level == 3:
+            # full_json_only: metadata only, no payload and no .bin file.
+            assert bin_name is None, f"level 3 manifest should have bin_file=null: {data}"
+            assert not (dump_dir / "args.bin").exists(), "level 3 must not write args.bin"
+            assert all(t.get("bin_size") == 0 for t in tensors), tensors
+        else:
+            assert bin_name, f"manifest missing bin_file field: {data}"
+            bin_path = dump_dir / bin_name
+            assert bin_path.exists(), f"manifest names bin_file={bin_name!r} but {bin_path} not found"
+            assert bin_path.stat().st_size > 0, "args.bin is empty"
+
+        # Unified manifest (#792): tensors and scalar args share one
+        # args_dump.json keyed by a "kind" field; no separate legacy sidecar files.
+        assert not (dump_dir / "tensor_dump.json").exists(), "tensor_dump.json should not be emitted"
+        assert not (dump_dir / "kernel_args_dump.json").exists(), "kernel_args_dump.json should not be emitted"
+        assert all("kind" in t for t in tensors), tensors
+        scalar_entries = [t for t in tensors if t.get("kind") == "scalar"]
+        assert all(t.get("stage") == "before_dispatch" for t in scalar_entries), scalar_entries
+        assert all(t.get("bin_size") == 0 for t in scalar_entries), scalar_entries
+        assert all("value" in t for t in scalar_entries), scalar_entries
+
+        # Level-aware checks operate on the tensor entries.
+        tensor_entries = [t for t in tensors if t.get("kind") == "tensor"]
+        task_ids = {t["task_id"] for t in tensor_entries}
+        if level == 1:
+            # Partial: only the selected tensor/scalar args, race-free (host-latched).
+            assert len(tensor_entries) == 7, f"partial expected 7 tensor entries, got {len(tensor_entries)}"
+            assert task_ids == {
+                "0x0000000100000000",
+                "0x0000000100000002",
+                "0x0000000100000003",
+            }
+            # Task granularity: dump() captured all tensor args on task 0.
+            t00 = sorted(t["arg_index"] for t in tensor_entries if t["task_id"] == "0x0000000100000000")
+            assert t00 == [0, 1]
+            # Mixed granularity: dump(d, inter_ci, t3_count) captured tensor args
+            # 0 + 2, not arg 1 (e).
+            t02 = sorted(t["arg_index"] for t in tensor_entries if t["task_id"] == "0x0000000100000002")
+            assert t02 == [0, 2]
+            # Tensor-only task granularity: dump() captured all three tensor args.
+            t03 = sorted(t["arg_index"] for t in tensor_entries if t["task_id"] == "0x0000000100000003")
+            assert t03 == [0, 1, 2]
+            scalar_by_task = {
+                task_id: sorted(t["arg_index"] for t in scalar_entries if t["task_id"] == task_id)
+                for task_id in {t["task_id"] for t in scalar_entries}
+            }
+            assert scalar_by_task == {
+                "0x0000000100000000": [2, 3],
+                "0x0000000100000001": [2],
+                "0x0000000100000002": [3],
+            }
+            ambiguous_scalars = [
+                (t["task_id"], t["arg_index"]) for t in scalar_entries if t.get("arg_index_ambiguous", False)
+            ]
+            assert ambiguous_scalars == [("0x0000000100000002", 3)]
+        else:
+            # Full (level 2 or 3): markers ignored — every one of the 5 tasks is dumped.
+            assert len(task_ids) >= 5, f"full dump should cover all 5 tasks, got {sorted(task_ids)}"
+
+        # ---- Tool smoke: dump_viewer ----
+        # Exit-code-only check; the no-filter default lists every captured
+        # arg without exporting. A schema change that breaks the viewer
+        # fires here in the same CI step that produced the dump.
+        subprocess.run(
+            [sys.executable, "-m", "simpler_setup.tools.dump_viewer", str(dump_dir)],
+            check=True,
+            timeout=60,
+        )
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_copy_first.cpp b/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_copy_first.cpp
new file mode 100644
index 000000000..2e6887874
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_copy_first.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Consumer kernel for the dummy_task scene tests: copies args[0][0] -> args[1][0].
+ *
+ * If the dummy_task in the middle of the chain has correctly waited on the
+ * producer and notified this consumer, args[1][0] will equal the producer's
+ * sentinel (42.0f). If the dependency chain broke or the dummy somehow ran a
+ * kernel that clobbered args[0], the value will differ.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *in_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+
+    __gm__ float *in = reinterpret_cast<__gm__ float *>(in_tensor->buffer.addr) + in_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    out[0] = in[0];
+    dcci(&out[0], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_write_const.cpp b/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_write_const.cpp
new file mode 100644
index 000000000..734cc9569
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_write_const.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Writes a fixed sentinel (42.0f) to args[0][0]. Used as the producer in the
+ * dummy_task scene tests so a downstream consumer can verify the dependency
+ * chain (producer -> ... -> dummy -> consumer) propagates the value intact.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    out[0] = 42.0f;
+    dcci(&out[0], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/orchestration/dummy_task_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/orchestration/dummy_task_orch.cpp
new file mode 100644
index 000000000..fc1ff8d02
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/orchestration/dummy_task_orch.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * dummy_task orchestration scenes.
+ *
+ * Each case is selected via params["case"] in the orchestration scalar slot.
+ *
+ *   case=1: Single dummy via auto tensormap dep.
+ *     producer (kernel_write_const) writes X[0] = 42.0
+ *     dummy_T INOUTs X (no kernel)        // becomes new producer in tensormap
+ *     consumer (kernel_copy_first) X -> Y
+ *     expect Y[0] = 42.0
+ *
+ *   case=2: Long dummy chain (N dummies between producer and consumer).
+ *     producer writes X[0] = 42.0
+ *     dummy_T1 .. dummy_TN each INOUT X    // chained through tensormap
+ *     consumer copies X -> Y
+ *     expect Y[0] = 42.0 (no dummy runs a kernel; X must be undisturbed)
+ *
+ *   case=3: Dummy as many-to-one barrier via explicit set_dependencies.
+ *     producer_A writes X[0] = 42.0
+ *     producer_B writes W[0] = 7.0
+ *     dummy_T explicit set_dependencies({A.id, B.id}, 2)  // pure barrier
+ *     consumer explicit set_dependencies({dummy.id}, 1), copies X -> Y
+ *     expect Y[0] = 42.0 (consumer waits on dummy which waits on A+B)
+ *
+ * Args layout: [X, Y, W]
+ *   - X: producer A writes; consumer reads
+ *   - Y: consumer writes; host checks
+ *   - W: producer B writes (case 3 only); ignored by consumer
+ *
+ * Scalar:  case selector
+ */
+
+#include <cstdint>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_WRITE_CONST 0
+#define FUNC_COPY_FIRST 1
+
+static constexpr int32_t LONG_CHAIN_DUMMIES = 4;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 4,  // 3 tensors + 1 case scalar
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_X = orch_args.tensor(0).ref();
+    const Tensor &ext_Y = orch_args.tensor(1).ref();
+    const Tensor &ext_W = orch_args.tensor(2).ref();
+
+    uint64_t case_id = orch_args.scalar(0);
+    LOG_INFO_V0("[dummy_task_orch] case_id=%llu", static_cast<unsigned long long>(case_id));
+
+    if (case_id == 1) {
+        // producer writes X
+        {
+            L0TaskArgs args;
+            args.add_inout(ext_X);
+            rt_submit_aic_task(FUNC_WRITE_CONST, args);
+        }
+        // dummy_T INOUTs X (becomes new producer)
+        {
+            L0TaskArgs args;
+            args.add_inout(ext_X);
+            rt_submit_dummy_task(args);
+        }
+        // consumer reads X -> writes Y
+        {
+            L0TaskArgs args;
+            args.add_input(ext_X);
+            args.add_inout(ext_Y);
+            rt_submit_aic_task(FUNC_COPY_FIRST, args);
+        }
+    } else if (case_id == 2) {
+        // producer writes X
+        {
+            L0TaskArgs args;
+            args.add_inout(ext_X);
+            rt_submit_aic_task(FUNC_WRITE_CONST, args);
+        }
+        // long dummy chain
+        for (int32_t i = 0; i < LONG_CHAIN_DUMMIES; i++) {
+            L0TaskArgs args;
+            args.add_inout(ext_X);
+            rt_submit_dummy_task(args);
+        }
+        // consumer
+        {
+            L0TaskArgs args;
+            args.add_input(ext_X);
+            args.add_inout(ext_Y);
+            rt_submit_aic_task(FUNC_COPY_FIRST, args);
+        }
+    } else if (case_id == 3) {
+        // producer A writes X, producer B writes W
+        PTO2TaskId a_id;
+        PTO2TaskId b_id;
+        {
+            L0TaskArgs args;
+            args.add_inout(ext_X);
+            a_id = rt_submit_aic_task(FUNC_WRITE_CONST, args).task_id();
+        }
+        {
+            L0TaskArgs args;
+            args.add_inout(ext_W);
+            b_id = rt_submit_aic_task(FUNC_WRITE_CONST, args).task_id();
+        }
+        // dummy barrier on A + B (no tensor args, only explicit deps)
+        PTO2TaskId dummy_id;
+        {
+            L0TaskArgs args;
+            PTO2TaskId barrier_deps[] = {a_id, b_id};
+            args.set_dependencies(barrier_deps, 2);
+            dummy_id = rt_submit_dummy_task(args).task_id();
+        }
+        // consumer: explicit dep on dummy, reads X
+        {
+            L0TaskArgs args;
+            PTO2TaskId consumer_deps[] = {dummy_id};
+            args.set_dependencies(consumer_deps, 1);
+            args.add_input(ext_X);
+            args.add_inout(ext_Y);
+            rt_submit_aic_task(FUNC_COPY_FIRST, args);
+        }
+    } else {
+        rt_report_fatal(PTO2_ERROR_INVALID_ARGS, "unsupported case_id=%llu", static_cast<unsigned long long>(case_id));
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/dummy_task/test_dummy_task.py b/tests/st/a2a3/fully_distributed_within_core/dummy_task/test_dummy_task.py
new file mode 100644
index 000000000..8d783b71d
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dummy_task/test_dummy_task.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""dummy_task: verify dep-only tasks block consumers and never run a kernel.
+
+The orchestration submits one of three scenes, controlled by params["case"]:
+
+  case=1 (single dummy via tensormap INOUT):
+    producer writes X[0]=42.0 -> dummy_T INOUTs X -> consumer copies X to Y.
+    Y[0] must equal 42.0. If dummy somehow ran a kernel it would zero or
+    corrupt the buffer; the value 42.0 in Y proves both ordering and the
+    no-op nature of dummy_task.
+
+  case=2 (long dummy chain):
+    Same as case 1, but with LONG_CHAIN_DUMMIES dummies between producer
+    and consumer. Looks after the dummy_ready_queue + dispatch-loop drain
+    when several dummies sit on the critical path back-to-back.
+
+  case=3 (explicit set_dependencies barrier):
+    Two independent producers (writing X and W); a dummy_T uses
+    set_dependencies({A, B}, 2) as a many-to-one barrier; the consumer
+    set_dependencies({dummy}, 1) and reads X. Verifies dummy_task
+    participates in explicit_dep wiring.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+SENTINEL = 42.0
+INIT_VAL = -1.0  # so unmodified Y is distinguishable from the sentinel
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestDummyTask(SceneTestCase):
+    """dummy_task: dep-only tasks must block consumers and never run a kernel."""
+
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/dummy_task_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT, D.INOUT, D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "WRITE_CONST",
+                "source": "kernels/aic/kernel_write_const.cpp",
+                "core_type": "aic",
+                # Single-AIC task with one INOUT tensor (args[0]). Declared so
+                # the tensor dump's per-subtask sum matches the payload.
+                "signature": [D.INOUT],
+            },
+            {
+                "func_id": 1,
+                "name": "COPY_FIRST",
+                "source": "kernels/aic/kernel_copy_first.cpp",
+                "core_type": "aic",
+                # Single-AIC task: copies args[0] -> args[1] (IN, INOUT).
+                "signature": [D.IN, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "SingleDummyAutoDep",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"case": 1},
+        },
+        {
+            "name": "LongDummyChain",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"case": 2},
+        },
+        {
+            "name": "DummyExplicitDepBarrier",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 2, "block_dim": 1},
+            "params": {"case": 3},
+        },
+    ]
+
+    def generate_args(self, params):
+        x = torch.full((16,), INIT_VAL, dtype=torch.float32)
+        y = torch.full((16,), INIT_VAL, dtype=torch.float32)
+        w = torch.full((16,), INIT_VAL, dtype=torch.float32)
+        return TaskArgsBuilder(
+            Tensor("x", x),
+            Tensor("y", y),
+            Tensor("w", w),
+            Scalar("case", int(params["case"])),
+        )
+
+    def compute_golden(self, args, params):
+        # The producer (kernel_write_const) writes 42.0 to X[0]; the consumer
+        # (kernel_copy_first) copies X[0] -> Y[0]. Any dummy_task in the chain
+        # is a pure barrier and does NOT touch the buffer, so X[0] / Y[0]
+        # must equal SENTINEL on the host side regardless of case.
+        args.x[0] = SENTINEL
+        args.y[0] = SENTINEL
+        if params["case"] == 3:
+            # case 3 has a second producer writing W
+            args.w[0] = SENTINEL
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/dynamic_register/test_dynamic_register.py b/tests/st/a2a3/fully_distributed_within_core/dynamic_register/test_dynamic_register.py
new file mode 100644
index 000000000..6c7353036
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/dynamic_register/test_dynamic_register.py
@@ -0,0 +1,445 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""End-to-end ST for post-start Worker.register(ChipCallable) at L3.
+
+Exercises the _CTRL_REGISTER IPC path end-to-end: parent stages a
+ChipCallable in shared memory after child startup, broadcasts CTRL_REGISTER to
+every chip child, the child mmaps + prepares, and the resulting
+CallableHandle is indistinguishable from a pre-start preparation when used
+in run().
+
+The UT suite (tests/ut/py/test_worker/test_host_worker.py) already covers
+the facade-level paths (lock guard, capacity overflow, lambda rejection, run
+race detection, shm name generator). This file's job is to prove the
+bytes actually traverse shm to the chip child and prepare succeeds —
+which only a real (sim or device) chip child can confirm.
+"""
+
+import os
+
+import pytest
+import torch
+from _task_interface import MAX_REGISTERED_CALLABLE_IDS  # pyright: ignore[reportMissingImports]
+from simpler.task_interface import ArgDirection as D
+from simpler.task_interface import CallConfig, ChipCallable
+from simpler.worker import Worker
+
+from simpler_setup import TaskArgsBuilder, Tensor
+from simpler_setup.kernel_compiler import KernelCompiler
+from simpler_setup.scene_test import _build_l3_task_args
+
+_RUNTIME = "tensormap_and_ringbuffer"
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_KERNELS = os.path.join(
+    _HERE,
+    "..",
+    "..",
+    "..",
+    "..",
+    "..",
+    "examples",
+    "a2a3",
+    "tensormap_and_ringbuffer",
+    "vector_example",
+    "kernels",
+)
+_ORCH_SRC = os.path.join(_KERNELS, "orchestration", "example_orchestration.cpp")
+_AIV_ADD = os.path.join(_KERNELS, "aiv", "kernel_add.cpp")
+_AIV_ADD_SCALAR = os.path.join(_KERNELS, "aiv", "kernel_add_scalar.cpp")
+_AIV_MUL = os.path.join(_KERNELS, "aiv", "kernel_mul.cpp")
+
+_ORCH_SIG = [D.IN, D.IN, D.OUT]
+
+
+def _build_vector_callable(platform: str, *, extra_unused_child: bool = False) -> ChipCallable:
+    """Compile the vector_example orchestration + 3 AIV kernels.
+
+    Mirrors how SceneTestCase._compile_chip_callable_from_spec assembles
+    a ChipCallable, but inline so the test can call prepare_callable() on it both
+    before and after init().
+    """
+    from simpler.task_interface import CoreCallable  # noqa: PLC0415
+
+    from simpler_setup.elf_parser import extract_text_section  # noqa: PLC0415
+    from simpler_setup.pto_isa import ensure_pto_isa_root  # noqa: PLC0415
+
+    kc = KernelCompiler(platform=platform)
+    pto_isa_root = ensure_pto_isa_root()
+    inc_dirs = kc.get_orchestration_include_dirs(_RUNTIME)
+
+    orch_bytes = kc.compile_orchestration(runtime_name=_RUNTIME, source_path=_ORCH_SRC)
+
+    def _aiv(path: str) -> bytes:
+        raw = kc.compile_incore(path, core_type="aiv", pto_isa_root=pto_isa_root, extra_include_dirs=inc_dirs)
+        return raw if platform.endswith("sim") else extract_text_section(raw)
+
+    add = CoreCallable.build(signature=[D.IN, D.IN, D.OUT], binary=_aiv(_AIV_ADD))
+    add_scalar = CoreCallable.build(signature=[D.IN, D.OUT], binary=_aiv(_AIV_ADD_SCALAR))
+    mul = CoreCallable.build(signature=[D.IN, D.IN, D.OUT], binary=_aiv(_AIV_MUL))
+
+    children = [(0, add), (1, add_scalar), (2, mul)]
+    if extra_unused_child:
+        children.append((99, add))
+
+    return ChipCallable.build(
+        signature=_ORCH_SIG,
+        func_name="aicpu_orchestration_entry",
+        binary=orch_bytes,
+        children=children,
+    )
+
+
+def _unique_py_callable(index: int):
+    def fn(args, _index=index):
+        return _index
+
+    return fn
+
+
+def _make_args(a: float, b: float) -> TaskArgsBuilder:
+    size = 128 * 128
+    return TaskArgsBuilder(
+        Tensor("a", torch.full((size,), a, dtype=torch.float32).share_memory_()),
+        Tensor("b", torch.full((size,), b, dtype=torch.float32).share_memory_()),
+        Tensor("f", torch.zeros(size, dtype=torch.float32).share_memory_()),
+    )
+
+
+def _golden(a: float, b: float) -> float:
+    # Matches the orchestration: f = (a+b+1) * (a+b+2) + (a+b)
+    s = a + b
+    return (s + 1) * (s + 2) + s
+
+
+@pytest.mark.platforms(["a2a3sim"])
+@pytest.mark.device_count(1)
+@pytest.mark.runtime(_RUNTIME)
+def test_prepare_new_identity_after_start_then_run(st_platform, st_device_ids):
+    """Happy path: prepare one identity pre-start and another post-start.
+
+    Proves the post-start control path delivers a usable handle for a
+    previously unseen hashid. Both identities execute equivalent kernels and
+    must produce numerically identical outputs.
+    """
+    chip_callable = _build_vector_callable(st_platform)
+    post_callable = _build_vector_callable(st_platform, extra_unused_child=True)
+
+    worker = Worker(
+        level=3,
+        device_ids=[int(st_device_ids[0])],
+        num_sub_workers=0,
+        platform=st_platform,
+        runtime=_RUNTIME,
+    )
+    pre_handle = worker.register(chip_callable)
+
+    # Pre-allocate both runs' tensors BEFORE Worker.init() so the
+    # share_memory_() mappings are inherited by the forked chip child.
+    # share_memory_ regions created after fork in the parent are not visible
+    # to the chip child, so dispatch on those would segfault.
+    a, b = 2.0, 3.0
+    expected = _golden(a, b)
+    args_pre = _make_args(a, b)
+    args_post = _make_args(a, b)
+    chip_args_pre, output_names_pre = _build_l3_task_args(args_pre, _ORCH_SIG)
+    chip_args_post, output_names_post = _build_l3_task_args(args_post, _ORCH_SIG)
+    assert output_names_pre == ["f"] and output_names_post == ["f"]
+
+    worker.init()
+    try:
+        config = CallConfig()
+        config.block_dim = 3
+        config.aicpu_thread_num = 4
+
+        # 1. Run pre_handle once to force _start_hierarchical (forks chip
+        #    children, runs the CTRL_PREPARE prewarm loop). This puts the
+        #    chip child into _run_chip_main_loop, the only state in which
+        #    a CTRL_REGISTER broadcast can be ACKed.
+        def orch_pre(o, _args, _cfg):
+            o.submit_next_level(pre_handle, chip_args_pre, config)
+
+        worker.run(orch_pre)
+        got_pre = args_pre.f
+        assert torch.allclose(got_pre, torch.full_like(got_pre, expected), rtol=1e-5, atol=1e-5), (
+            f"pre_handle={pre_handle.hashid}: expected {expected}, got {got_pre[:4].tolist()}..."
+        )
+
+        # 2. Now do the post-start dynamic prepare. The parent stages bytes
+        #    in shm and broadcasts CTRL_REGISTER; the child mmaps and calls
+        #    prepare_callable_from_blob. post_handle is unknown to the
+        #    CoW-inherited registry on the child side — only the IPC path
+        #    can deliver it.
+        post_handle = worker.register(post_callable)
+        assert post_handle.hashid != pre_handle.hashid
+
+        # 3. Run with post_handle. If CTRL_REGISTER delivered correctly, the
+        #    child has the identity prepared; otherwise dispatch will fail.
+        def orch_post(o, _args, _cfg):
+            o.submit_next_level(post_handle, chip_args_post, config)
+
+        worker.run(orch_post)
+        got_post = args_post.f
+        assert torch.allclose(got_post, torch.full_like(got_post, expected), rtol=1e-5, atol=1e-5), (
+            f"post_handle={post_handle.hashid}: expected {expected}, got {got_post[:4].tolist()}..."
+        )
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3sim"])
+@pytest.mark.device_count(2)
+@pytest.mark.runtime(_RUNTIME)
+def test_prepare_new_identity_after_start_parallel_broadcast(st_platform, st_device_ids):
+    """Two chip children, post-start prepare broadcasts to both in parallel.
+
+    Asserts that the prepared handle runs successfully on each chip — proving
+    the C++ broadcast (one std::thread per WorkerThread) delivers the bytes
+    to every chip's mailbox and each prepare_callable_from_blob runs without
+    racing against the others.
+    """
+    chip_callable = _build_vector_callable(st_platform)
+    post_callable = _build_vector_callable(st_platform, extra_unused_child=True)
+    device_ids = [int(d) for d in st_device_ids[:2]]
+    worker = Worker(
+        level=3,
+        device_ids=device_ids,
+        num_sub_workers=0,
+        platform=st_platform,
+        runtime=_RUNTIME,
+    )
+    pre_handle = worker.register(chip_callable)
+    a, b = 2.0, 3.0
+    expected = _golden(a, b)
+    # Pre-allocate args for each chip (chip_id = block group). The
+    # vector_example orchestration partitions the input across cores, so a
+    # single args bundle works for both chips' first-run trigger; the
+    # second-run uses the post-start prepared handle.
+    args_pre = _make_args(a, b)
+    args_post = _make_args(a, b)
+    chip_args_pre, _ = _build_l3_task_args(args_pre, _ORCH_SIG)
+    chip_args_post, _ = _build_l3_task_args(args_post, _ORCH_SIG)
+
+    worker.init()
+    try:
+        config = CallConfig()
+        config.block_dim = 3
+        config.aicpu_thread_num = 4
+
+        def orch_pre(o, _a, _c):
+            o.submit_next_level(pre_handle, chip_args_pre, config)
+
+        worker.run(orch_pre)
+        assert torch.allclose(args_pre.f, torch.full_like(args_pre.f, expected), rtol=1e-5, atol=1e-5)
+
+        # Now broadcast CTRL_REGISTER to BOTH chip mailboxes in parallel.
+        post_handle = worker.register(post_callable)
+
+        def orch_post(o, _a, _c):
+            o.submit_next_level(post_handle, chip_args_post, config)
+
+        worker.run(orch_post)
+        assert torch.allclose(args_post.f, torch.full_like(args_post.f, expected), rtol=1e-5, atol=1e-5)
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3sim"])
+@pytest.mark.device_count(1)
+@pytest.mark.runtime(_RUNTIME)
+def test_prepare_capacity_overflow_post_start(st_platform, st_device_ids):
+    """Saturate callable capacity pre-start, then verify post-start prepare hits
+    the same ``MAX_REGISTERED_CALLABLE_IDS`` ceiling for a new hashid.
+
+    Confirms the public capacity guard is shared between pre-start preparation
+    and the post-start control path (and that the error message is
+    protocol-aware so the operator sees the same diagnostic in both paths).
+    """
+    chip_callable = _build_vector_callable(st_platform)
+    worker = Worker(
+        level=3,
+        device_ids=[int(st_device_ids[0])],
+        num_sub_workers=0,
+        platform=st_platform,
+        runtime=_RUNTIME,
+    )
+    # Fill the registry pre-start with distinct sub fn identities (cheap, no
+    # device cost).
+    for i in range(MAX_REGISTERED_CALLABLE_IDS - 1):
+        worker.register(_unique_py_callable(i))
+    chip_handle = worker.register(chip_callable)  # final capacity entry
+
+    a, b = 2.0, 3.0
+    args_pre = _make_args(a, b)
+    chip_args_pre, _ = _build_l3_task_args(args_pre, _ORCH_SIG)
+
+    worker.init()
+    try:
+        config = CallConfig()
+        config.block_dim = 3
+        config.aicpu_thread_num = 4
+
+        def orch_pre(o, _a, _c):
+            o.submit_next_level(chip_handle, chip_args_pre, config)
+
+        worker.run(orch_pre)
+
+        # The very next dynamic prepare of a new identity hits the capacity
+        # ceiling. Re-preparing ``chip_callable`` itself would only create
+        # another handle to the existing identity.
+        with pytest.raises(RuntimeError, match="MAX_REGISTERED_CALLABLE_IDS"):
+            worker.register(_build_vector_callable(st_platform, extra_unused_child=True))
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3sim"])
+@pytest.mark.device_count(1)
+@pytest.mark.runtime(_RUNTIME)
+def test_duplicate_prepare_same_hashid_survives_one_unregister(st_platform, st_device_ids):
+    """prepare same hashid twice, unregister one handle, run the other.
+
+    This is the hashid-specific post-start path: the second
+    ``prepare_callable(same_chip_callable)`` must return a distinct handle for
+    the same hashid. Unregistering one handle must only drop that public
+    handle; the remaining handle must still dispatch successfully.
+    """
+    chip_callable = _build_vector_callable(st_platform)
+    worker = Worker(
+        level=3,
+        device_ids=[int(st_device_ids[0])],
+        num_sub_workers=0,
+        platform=st_platform,
+        runtime=_RUNTIME,
+    )
+    pre_handle = worker.register(chip_callable)
+
+    a, b = 2.0, 3.0
+    expected = _golden(a, b)
+    # Two runs total — preallocate both args bundles BEFORE init() so
+    # the share_memory_ mappings are inherited by the forked chip child.
+    args_one = _make_args(a, b)
+    args_two = _make_args(a, b)
+    chip_args_one, _ = _build_l3_task_args(args_one, _ORCH_SIG)
+    chip_args_two, _ = _build_l3_task_args(args_two, _ORCH_SIG)
+
+    worker.init()
+    try:
+        config = CallConfig()
+        config.block_dim = 3
+        config.aicpu_thread_num = 4
+
+        # 1. Trigger fork via pre_handle to put the chip child into the main loop.
+        def orch_one(o, _args, _cfg):
+            o.submit_next_level(pre_handle, chip_args_one, config)
+
+        worker.run(orch_one)
+        assert torch.allclose(args_one.f, torch.full_like(args_one.f, expected), rtol=1e-5, atol=1e-5)
+
+        # 2. Prepare the same callable after start. This returns another
+        # public handle for the same hashid, not a new identity.
+        duplicate_handle = worker.register(chip_callable)
+        assert duplicate_handle.hashid == pre_handle.hashid
+        assert duplicate_handle.digest == pre_handle.digest
+        assert duplicate_handle._handle_id != pre_handle._handle_id
+
+        # 3. Drop the first handle. The child must keep the prepared identity
+        # alive for duplicate_handle.
+        worker.unregister(pre_handle)
+
+        with pytest.raises(KeyError, match="not live"):
+            worker.run(lambda o, _args, _cfg: o.submit_next_level(pre_handle, chip_args_one, config))
+
+        def orch_two(o, _args, _cfg):
+            o.submit_next_level(duplicate_handle, chip_args_two, config)
+
+        worker.run(orch_two)
+        assert torch.allclose(args_two.f, torch.full_like(args_two.f, expected), rtol=1e-5, atol=1e-5)
+
+        # 4. Dropping the final handle invalidates it through the public API.
+        worker.unregister(duplicate_handle)
+        with pytest.raises(KeyError, match="not live"):
+            worker.run(lambda o, _args, _cfg: o.submit_next_level(duplicate_handle, chip_args_two, config))
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3sim"])
+@pytest.mark.device_count(1)
+@pytest.mark.runtime(_RUNTIME)
+def test_unregister_last_handle_allows_reprepare_same_hashid(st_platform, st_device_ids):
+    """prepare → run → unregister final handle → prepare same identity again.
+
+    Proves the IPC unregister path works end-to-end: after CTRL_UNREGISTER
+    propagates to the chip child, the old handle is invalid and a subsequent
+    post-start prepare of that identity materializes a usable handle again.
+    """
+    chip_callable = _build_vector_callable(st_platform)
+    post_callable = _build_vector_callable(st_platform, extra_unused_child=True)
+    worker = Worker(
+        level=3,
+        device_ids=[int(st_device_ids[0])],
+        num_sub_workers=0,
+        platform=st_platform,
+        runtime=_RUNTIME,
+    )
+    pre_handle = worker.register(chip_callable)
+
+    a, b = 2.0, 3.0
+    expected = _golden(a, b)
+    args_one = _make_args(a, b)
+    args_two = _make_args(a, b)
+    args_three = _make_args(a, b)
+    chip_args_one, _ = _build_l3_task_args(args_one, _ORCH_SIG)
+    chip_args_two, _ = _build_l3_task_args(args_two, _ORCH_SIG)
+    chip_args_three, _ = _build_l3_task_args(args_three, _ORCH_SIG)
+
+    worker.init()
+    try:
+        config = CallConfig()
+        config.block_dim = 3
+        config.aicpu_thread_num = 4
+
+        def orch_one(o, _args, _cfg):
+            o.submit_next_level(pre_handle, chip_args_one, config)
+
+        worker.run(orch_one)
+        assert torch.allclose(args_one.f, torch.full_like(args_one.f, expected), rtol=1e-5, atol=1e-5)
+
+        dyn_handle = worker.register(post_callable)
+
+        def orch_two(o, _args, _cfg):
+            o.submit_next_level(dyn_handle, chip_args_two, config)
+
+        worker.run(orch_two)
+        assert torch.allclose(args_two.f, torch.full_like(args_two.f, expected), rtol=1e-5, atol=1e-5)
+
+        worker.unregister(dyn_handle)
+        with pytest.raises(KeyError, match="not live"):
+            worker.run(lambda o, _args, _cfg: o.submit_next_level(dyn_handle, chip_args_two, config))
+
+        # Re-prepare the same hashid after its final handle was dropped.
+        again_handle = worker.register(post_callable)
+        assert again_handle.hashid == dyn_handle.hashid
+        assert again_handle.digest == dyn_handle.digest
+        assert again_handle._handle_id != dyn_handle._handle_id
+
+        def orch_three(o, _args, _cfg):
+            o.submit_next_level(again_handle, chip_args_three, config)
+
+        worker.run(orch_three)
+        assert torch.allclose(args_three.f, torch.full_like(args_three.f, expected), rtol=1e-5, atol=1e-5)
+    finally:
+        worker.close()
+
+
+if __name__ == "__main__":
+    import sys
+
+    sys.exit(pytest.main([__file__, "-v"]))
diff --git a/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/aic/kernel_write_const_visible.cpp b/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/aic/kernel_write_const_visible.cpp
new file mode 100644
index 000000000..543cb8361
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/aic/kernel_write_const_visible.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // Keep the swimlane bars visible. Lookup-only timing uses dummy tasks, so
+    // this spin does not affect the fanin lookup cost measurement.
+    volatile uint32_t spin = 0;
+    for (uint32_t i = 0; i < 4096; i++) {
+        spin += i;
+    }
+
+    out[0] = 42.0f;
+    if (spin == 0xffffffffu) {
+        out[0] = 43.0f;
+    }
+    dcci(&out[0], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/orchestration/fanin_lookup_perf_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/orchestration/fanin_lookup_perf_orch.cpp
new file mode 100644
index 000000000..566e213cd
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/orchestration/fanin_lookup_perf_orch.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Explicit 64x64 fanin DAG validation scene.
+ *
+ * Args layout:
+ *   tensor[0]: producer outputs, split into disjoint producer slices
+ *   tensor[1]: consumer outputs, split into disjoint consumer slices
+ *   scalar[0]: producer_count
+ *   scalar[1]: consumer_count
+ *   scalar[2]: use_real_kernels
+ *
+ * The scene submits producer_count independent producers, then consumer_count
+ * independent consumers where every consumer explicitly depends on every
+ * producer. Real-kernel mode writes disjoint tensor slices so tensormap
+ * auto-deps do not add producer chains or consumer chains.
+ *
+ * When use_real_kernels is false, the same dependency shape is submitted with
+ * dummy tasks to isolate orchestrator fanin lookup cost.
+ */
+
+#include <cstdint>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+static constexpr int32_t MAX_PRODUCERS = 64;
+static constexpr int32_t MAX_CONSUMERS = 64;
+static constexpr uint32_t SLOT_ELEMS = 16;
+
+#define FUNC_WRITE_CONST 0
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 5,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &producer_outputs = orch_args.tensor(0).ref();
+    const Tensor &consumer_outputs = orch_args.tensor(1).ref();
+    int32_t producer_count = static_cast<int32_t>(orch_args.scalar(0));
+    int32_t consumer_count = static_cast<int32_t>(orch_args.scalar(1));
+    bool use_real_kernels = orch_args.scalar(2) != 0;
+    if (producer_count < 1 || producer_count > MAX_PRODUCERS || consumer_count < 1 || consumer_count > MAX_CONSUMERS) {
+        rt_report_fatal(
+            PTO2_ERROR_INVALID_ARGS,
+            "producer_count=%d consumer_count=%d exceed supported range producers=[1, %d] consumers=[1, %d]",
+            producer_count, consumer_count, MAX_PRODUCERS, MAX_CONSUMERS
+        );
+        return;
+    }
+
+    PTO2TaskId producer_ids[MAX_PRODUCERS];
+    uint32_t slot_shape[1] = {SLOT_ELEMS};
+    for (int32_t i = 0; i < producer_count; i++) {
+        L0TaskArgs args;
+        if (use_real_kernels) {
+            uint32_t offset[1] = {static_cast<uint32_t>(i) * SLOT_ELEMS};
+            Tensor producer_out = producer_outputs.view(slot_shape, offset);
+            args.add_inout(producer_out);
+            producer_ids[i] = rt_submit_aic_task(FUNC_WRITE_CONST, args).task_id();
+        } else {
+            producer_ids[i] = rt_submit_dummy_task(args).task_id();
+        }
+    }
+
+    for (int32_t c = 0; c < consumer_count; c++) {
+        L0TaskArgs args;
+        args.set_dependencies(producer_ids, static_cast<uint32_t>(producer_count));
+        if (use_real_kernels) {
+            uint32_t offset[1] = {static_cast<uint32_t>(c) * SLOT_ELEMS};
+            Tensor consumer_out = consumer_outputs.view(slot_shape, offset);
+            args.add_inout(consumer_out);
+            rt_submit_aic_task(FUNC_WRITE_CONST, args);
+        } else {
+            rt_submit_dummy_task(args);
+        }
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/test_fanin_lookup_perf.py b/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/test_fanin_lookup_perf.py
new file mode 100644
index 000000000..02763e433
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/test_fanin_lookup_perf.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""fanin_lookup_perf: validate a 64x64 explicit fanin DAG.
+
+The orchestration submits 64 independent producers and 64 independent
+consumers. Each consumer explicitly depends on all 64 producers. The real
+kernel case uses disjoint tensor slices so tensormap auto-deps do not add
+producer chains or consumer chains.
+"""
+
+import ctypes
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestFaninLookupPerf(SceneTestCase):
+    """Validate a wide 64-producer/64-consumer explicit fanin DAG."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/fanin_lookup_perf_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT, D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "WRITE_CONST",
+                "source": "kernels/aic/kernel_write_const_visible.cpp",
+                "core_type": "aic",
+                # Single-AIC task with one INOUT tensor (args[0]). Declared so
+                # the tensor dump's per-subtask sum matches the payload.
+                "signature": [D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "LookupOnlyProducers64Consumers64",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"producer_count": 64, "consumer_count": 64, "use_real_kernels": 0},
+        },
+        {
+            "name": "SwimlaneProducers64Consumers64",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"producer_count": 64, "consumer_count": 64, "use_real_kernels": 1},
+        },
+    ]
+
+    def generate_args(self, params):
+        slot_elems = 16
+        producer_count = int(params["producer_count"])
+        consumer_count = int(params["consumer_count"])
+        return TaskArgsBuilder(
+            Tensor("producer_out", torch.full((producer_count * slot_elems,), -1.0, dtype=torch.float32)),
+            Tensor("consumer_out", torch.full((consumer_count * slot_elems,), -1.0, dtype=torch.float32)),
+            Scalar("producer_count", ctypes.c_int64(producer_count)),
+            Scalar("consumer_count", ctypes.c_int64(consumer_count)),
+            Scalar("use_real_kernels", ctypes.c_int64(int(params["use_real_kernels"]))),
+        )
+
+    def compute_golden(self, args, params):
+        if not params["use_real_kernels"]:
+            return
+        slot_elems = 16
+        for i in range(int(params["producer_count"])):
+            args.producer_out[i * slot_elems] = 42.0
+        for c in range(int(params["consumer_count"])):
+            args.consumer_out[c * slot_elems] = 42.0
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aic/kernel_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aic/kernel_matmul.cpp
new file mode 100644
index 000000000..607e5d657
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aic/kernel_matmul.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Matrix Multiplication Kernel (Cube Core)
+ *
+ * Computes: C = A @ B (TILE x TILE x TILE matmul)
+ * Uses TMATMUL instruction
+ *
+ * Args (Tensor*):
+ *   args[0] = A (INPUT)  - TILE x TILE
+ *   args[1] = B (INPUT)  - TILE x TILE
+ *   args[2] = C (OUTPUT) - TILE x TILE
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+#include <pto/common/constants.hpp>
+#include <pto/common/pto_tile.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2) {
+    if (num_2 == 0) {
+        return 0;
+    }
+    return (num_1 + num_2 - 1) / num_2 * num_2;
+}
+
+static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) {
+    uint64_t total_elems = tensor->shapes[0];
+    return static_cast<int>(total_elems / tile_elems);
+}
+
+template <int TILE>
+static __aicore__ void matmul_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) {
+    constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float);
+    constexpr int M = CeilAlign<int>(TILE, 16);
+    constexpr int K = CeilAlign<int>(TILE, blockAlign);
+    constexpr int N = CeilAlign<int>(TILE, blockAlign);
+
+    using GlobalDataA = GlobalTensor<
+        float, Shape<1, 1, 1, TILE, TILE>, pto::Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataB = GlobalTensor<
+        float, Shape<1, 1, 1, TILE, TILE>, pto::Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+    using GlobalDataC = GlobalTensor<
+        float, Shape<1, 1, 1, TILE, TILE>, pto::Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
+
+    GlobalDataA src0Global(input_a);
+    GlobalDataB src1Global(input_b);
+    GlobalDataC dstGlobal(output);
+
+    using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<float, M, K, TILE, TILE>;
+    using RightTile = TileRight<float, K, N, TILE, TILE>;
+    using AccTile = TileAcc<float, M, N, TILE, TILE>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    TLOAD(aMatTile, src0Global);
+    TLOAD(bMatTile, src1Global);
+
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+    TMOV(aTile, aMatTile);
+    TMOV(bTile, bMatTile);
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    TMATMUL(cTile, aTile, bTile);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    TSTORE(dstGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    constexpr uint64_t TILE_ELEMS = 128 * 128;
+    int num_tiles = get_num_tiles(input_a, TILE_ELEMS);
+
+    __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset;
+    __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset;
+    __gm__ float *base_c = reinterpret_cast<__gm__ float *>(output->buffer.addr) + output->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *a_ptr = base_a + (tile_idx * TILE_ELEMS);
+        __gm__ float *b_ptr = base_b + (tile_idx * TILE_ELEMS);
+        __gm__ float *c_ptr = base_c + (tile_idx * TILE_ELEMS);
+
+        matmul_impl<128>(a_ptr, b_ptr, c_ptr);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add.cpp
new file mode 100644
index 000000000..ddd9e94d4
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Element-wise Tensor Addition Kernel (for mixed task)
+ *
+ * Implements: out[i] = src0[i] + src1[i]
+ * Tile size: 128 x 128
+ *
+ * In the mixed task, this kernel shares the param list with the matmul kernel.
+ * Matmul uses args[0..2], this kernel uses args[3..5].
+ *
+ * Args (Tensor*):
+ *   args[3] = src0 (INPUT)  - 128 x 128
+ *   args[4] = src1 (INPUT)  - 128 x 128
+ *   args[5] = out (OUTPUT)  - 128 x 128
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) {
+    uint64_t total_elems = tensor->shapes[0];
+    return static_cast<int>(total_elems / tile_elems);
+}
+
+template <int ROWS, int COLS>
+static __aicore__ void add_impl(__gm__ float *src0, __gm__ float *src1, __gm__ float *out) {
+    using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>;
+    using DynStridDim5 = Stride<1, 1, 1, COLS, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, ROWS, COLS, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(ROWS, COLS);
+    TileData src1Tile(ROWS, COLS);
+    TileData dstTile(ROWS, COLS);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]);
+
+    constexpr uint64_t TILE_ELEMS = 128 * 128;
+    int num_tiles = get_num_tiles(src0_tensor, TILE_ELEMS);
+
+    __gm__ float *base_src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *base_src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *src0_ptr = base_src0 + (tile_idx * TILE_ELEMS);
+        __gm__ float *src1_ptr = base_src1 + (tile_idx * TILE_ELEMS);
+        __gm__ float *out_ptr = base_out + (tile_idx * TILE_ELEMS);
+
+        add_impl<128, 128>(src0_ptr, src1_ptr, out_ptr);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add_standalone.cpp
new file mode 100644
index 000000000..02568f395
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add_standalone.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Standalone Element-wise Addition Kernel
+ *
+ * Implements: out[i] = src0[i] + src1[i]
+ * Tile size: 128 x 128
+ *
+ * Reads args[0..2] — for standalone AIV_X1 tasks or AIV0 slot in AIV_X2.
+ *
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT)  - 128 x 128
+ *   args[1] = src1 (INPUT)  - 128 x 128
+ *   args[2] = out (OUTPUT)  - 128 x 128
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int ROWS, int COLS>
+static __aicore__ void add_impl(__gm__ float *src0, __gm__ float *src1, __gm__ float *out) {
+    using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>;
+    using DynStridDim5 = Stride<1, 1, 1, COLS, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, ROWS, COLS, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(ROWS, COLS);
+    TileData src1Tile(ROWS, COLS);
+    TileData dstTile(ROWS, COLS);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADD(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    add_impl<128, 128>(src0, src1, out);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul.cpp b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul.cpp
new file mode 100644
index 000000000..40bbe7058
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Element-wise Tensor Multiplication Kernel (for mixed task, AIV1 slot)
+ *
+ * Implements: out[i] = src0[i] * src1[i]
+ * Tile size: 128 x 128
+ *
+ * In the mixed task, this kernel occupies the AIV1 slot and shares the param
+ * list with the matmul kernel (args[0..2]) and add kernel (args[3..5]).
+ * This kernel uses args[6..8].
+ *
+ * Args (Tensor*):
+ *   args[6] = src0 (INPUT)  - 128 x 128
+ *   args[7] = src1 (INPUT)  - 128 x 128
+ *   args[8] = out (OUTPUT)  - 128 x 128
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) {
+    uint64_t total_elems = tensor->shapes[0];
+    return static_cast<int>(total_elems / tile_elems);
+}
+
+template <int ROWS, int COLS>
+static __aicore__ void mul_impl(__gm__ float *src0, __gm__ float *src1, __gm__ float *out) {
+    using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>;
+    using DynStridDim5 = Stride<1, 1, 1, COLS, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, ROWS, COLS, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(ROWS, COLS);
+    TileData src1Tile(ROWS, COLS);
+    TileData dstTile(ROWS, COLS);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TMUL(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[7]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[8]);
+
+    constexpr uint64_t TILE_ELEMS = 128 * 128;
+    int num_tiles = get_num_tiles(src0_tensor, TILE_ELEMS);
+
+    __gm__ float *base_src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *base_src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+        __gm__ float *src0_ptr = base_src0 + (tile_idx * TILE_ELEMS);
+        __gm__ float *src1_ptr = base_src1 + (tile_idx * TILE_ELEMS);
+        __gm__ float *out_ptr = base_out + (tile_idx * TILE_ELEMS);
+
+        mul_impl<128, 128>(src0_ptr, src1_ptr, out_ptr);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul_standalone.cpp
new file mode 100644
index 000000000..81899cd4b
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul_standalone.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Standalone Element-wise Multiplication Kernel (AIV1 slot)
+ *
+ * Implements: out[i] = src0[i] * src1[i]
+ * Tile size: 128 x 128
+ *
+ * Reads args[3..5] — for AIV1 slot in AIV_X2 tasks where AIV0 uses args[0..2].
+ *
+ * Args (Tensor*):
+ *   args[3] = src0 (INPUT)  - 128 x 128
+ *   args[4] = src1 (INPUT)  - 128 x 128
+ *   args[5] = out (OUTPUT)  - 128 x 128
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int ROWS, int COLS>
+static __aicore__ void mul_impl(__gm__ float *src0, __gm__ float *src1, __gm__ float *out) {
+    using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>;
+    using DynStridDim5 = Stride<1, 1, 1, COLS, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, ROWS, COLS, BLayout::RowMajor, -1, -1>;
+
+    TileData src0Tile(ROWS, COLS);
+    TileData src1Tile(ROWS, COLS);
+    TileData dstTile(ROWS, COLS);
+    TASSIGN(src0Tile, 0x0);
+    TASSIGN(src1Tile, 0x10000);
+    TASSIGN(dstTile, 0x20000);
+
+    GlobalData src0Global(src0);
+    GlobalData src1Global(src1);
+    GlobalData dstGlobal(out);
+
+    TLOAD(src0Tile, src0Global);
+    TLOAD(src1Tile, src1Global);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TMUL(dstTile, src0Tile, src1Tile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, dstTile);
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]);
+
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    mul_impl<128, 128>(src0, src1, out);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/orchestration/mixed_orch.cpp
new file mode 100644
index 000000000..6e3d53054
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/orchestration/mixed_orch.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Mixed AIC+AIV Orchestration Function (tensormap_and_ringbuffer Runtime)
+ *
+ * Covers all 5 resource shapes per iteration:
+ *   1. AIC_AIV_X2: AIC matmul(A,B->C) + AIV0 add(D,E->F) + AIV1 mul(G,H->I)
+ *   2. AIC_ONLY:   matmul(A,B->J)
+ *   3. AIV_X1:     add(D,E->K)
+ *   4. AIV_X2:     AIV0 add(D,E->L) + AIV1 mul(G,H->M)
+ *   5. AIC_AIV_X1: AIC matmul(A,B->N) + AIV0 add(D,E->O)
+ *
+ * Arg layout (15 args):
+ *   [A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]
+ *   Shape/dtype/size in tensor metadata.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+// Mixed-task kernels (args offset matches param position in mixed param list)
+#define FUNC_MATMUL 0  // AIC: reads args[0..2]
+#define FUNC_ADD 1     // AIV0 in mixed: reads args[3..5]
+#define FUNC_MUL 2     // AIV1 in mixed: reads args[6..8]
+// Standalone kernels (read args[0..2] or args[3..5])
+#define FUNC_ADD_STANDALONE 3  // AIV: reads args[0..2]
+#define FUNC_MUL_STANDALONE 4  // AIV1 in AIV_X2: reads args[3..5]
+
+static constexpr uint32_t TILE_ELEMS = 128 * 128;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 15,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // Input tensors use orch_args.tensor(i).ref() — golden shape = kernel shape
+    const Tensor &ext_A = orch_args.tensor(0).ref();
+    const Tensor &ext_B = orch_args.tensor(1).ref();
+    const Tensor &ext_D = orch_args.tensor(3).ref();
+    const Tensor &ext_E = orch_args.tensor(4).ref();
+    const Tensor &ext_G = orch_args.tensor(6).ref();
+    const Tensor &ext_H = orch_args.tensor(7).ref();
+
+    // Output tensors — full buffers
+    const Tensor &ext_C = orch_args.tensor(2).ref();
+    const Tensor &ext_F = orch_args.tensor(5).ref();
+    const Tensor &ext_I = orch_args.tensor(8).ref();
+    const Tensor &ext_J = orch_args.tensor(9).ref();
+    const Tensor &ext_K = orch_args.tensor(10).ref();
+    const Tensor &ext_L = orch_args.tensor(11).ref();
+    const Tensor &ext_M = orch_args.tensor(12).ref();
+    const Tensor &ext_N = orch_args.tensor(13).ref();
+    const Tensor &ext_O = orch_args.tensor(14).ref();
+
+    // Derive num_iters from output tensor size
+    uint32_t total_elems = orch_args.tensor(2).ref().shapes[0];
+    int num_iters = static_cast<int>(total_elems / TILE_ELEMS);
+
+    LOG_INFO_V0("[mixed_orch] num_iters=%d", num_iters);
+
+    for (int i = 0; i < num_iters; i++) {
+        PTO2_SCOPE() {
+            uint32_t view_shapes[1] = {TILE_ELEMS};
+            uint32_t view_offsets[1] = {static_cast<uint32_t>(i) * TILE_ELEMS};
+
+            Tensor C_view = ext_C.view(view_shapes, view_offsets);
+            Tensor F_view = ext_F.view(view_shapes, view_offsets);
+            Tensor I_view = ext_I.view(view_shapes, view_offsets);
+            Tensor J_view = ext_J.view(view_shapes, view_offsets);
+            Tensor K_view = ext_K.view(view_shapes, view_offsets);
+            Tensor L_view = ext_L.view(view_shapes, view_offsets);
+            Tensor M_view = ext_M.view(view_shapes, view_offsets);
+            Tensor N_view = ext_N.view(view_shapes, view_offsets);
+            Tensor O_view = ext_O.view(view_shapes, view_offsets);
+
+            // 1. AIC_AIV_X2: matmul + add + mul
+            {
+                MixedKernels mk;
+                mk.aic_kernel_id = FUNC_MATMUL;
+                mk.aiv0_kernel_id = FUNC_ADD;
+                mk.aiv1_kernel_id = FUNC_MUL;
+                L0TaskArgs args;
+                args.add_input(ext_A);
+                args.add_input(ext_B);
+                args.add_output(C_view);
+                args.add_input(ext_D);
+                args.add_input(ext_E);
+                args.add_output(F_view);
+                args.add_input(ext_G);
+                args.add_input(ext_H);
+                args.add_output(I_view);
+                rt_submit_task(mk, args);
+            }
+
+            // 2. AIC_ONLY: standalone matmul
+            {
+                L0TaskArgs args;
+                args.add_input(ext_A);
+                args.add_input(ext_B);
+                args.add_output(J_view);
+                rt_submit_aic_task(FUNC_MATMUL, args);
+            }
+
+            // 3. AIV_X1: standalone add
+            {
+                L0TaskArgs args;
+                args.add_input(ext_D);
+                args.add_input(ext_E);
+                args.add_output(K_view);
+                rt_submit_aiv_task(FUNC_ADD_STANDALONE, args);
+            }
+
+            // 4. AIV_X2: add (AIV0) + mul (AIV1)
+            {
+                MixedKernels mk;
+                mk.aiv0_kernel_id = FUNC_ADD_STANDALONE;
+                mk.aiv1_kernel_id = FUNC_MUL_STANDALONE;
+                L0TaskArgs args;
+                args.add_input(ext_D);
+                args.add_input(ext_E);
+                args.add_output(L_view);
+                args.add_input(ext_G);
+                args.add_input(ext_H);
+                args.add_output(M_view);
+                rt_submit_task(mk, args);
+            }
+
+            // 5. AIC_AIV_X1: matmul (AIC) + add (AIV0)
+            {
+                MixedKernels mk;
+                mk.aic_kernel_id = FUNC_MATMUL;
+                mk.aiv0_kernel_id = FUNC_ADD;
+                L0TaskArgs args;
+                args.add_input(ext_A);
+                args.add_input(ext_B);
+                args.add_output(N_view);
+                args.add_input(ext_D);
+                args.add_input(ext_E);
+                args.add_output(O_view);
+                rt_submit_task(mk, args);
+            }
+        }
+    }
+
+    LOG_INFO_V0("[mixed_orch] Submitted %d iterations x 5 shapes = %d tasks", num_iters, num_iters * 5);
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/mixed_example/test_mixed_example.py b/tests/st/a2a3/fully_distributed_within_core/mixed_example/test_mixed_example.py
new file mode 100644
index 000000000..da21e903c
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/mixed_example/test_mixed_example.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Mixed AIC+AIV example: all 5 resource shapes per iteration.
+
+Args layout (15 tensors): [A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+MATMUL_SIZE = 128
+TILE_ELEMS = 128 * 128
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestMixedExample(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/mixed_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+            ],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "MATMUL",
+                "source": "kernels/aic/kernel_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "ADD",
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "MUL",
+                "source": "kernels/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "ADD_STANDALONE",
+                "source": "kernels/aiv/kernel_add_standalone.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 4,
+                "name": "MUL_STANDALONE",
+                "source": "kernels/aiv/kernel_mul_standalone.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {"num_iters": 4},
+        },
+        {
+            "name": "case2",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {"num_iters": 1},
+        },
+    ]
+
+    def generate_args(self, params):
+        num_iters = params["num_iters"]
+        torch.manual_seed(42)
+        A = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01
+        B = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01
+        D_t = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
+        E = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
+        G = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
+        H = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
+
+        def z():
+            return torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
+
+        return TaskArgsBuilder(
+            Tensor("A", A.flatten()),
+            Tensor("B", B.flatten()),
+            Tensor("C", z()),
+            Tensor("D", D_t),
+            Tensor("E", E),
+            Tensor("F", z()),
+            Tensor("G", G),
+            Tensor("H", H),
+            Tensor("I", z()),
+            Tensor("J", z()),
+            Tensor("K", z()),
+            Tensor("L", z()),
+            Tensor("M", z()),
+            Tensor("N", z()),
+            Tensor("O", z()),
+        )
+
+    def compute_golden(self, args, params):
+        num_iters = params["num_iters"]
+        golden_matmul = torch.matmul(
+            args.A.reshape(MATMUL_SIZE, MATMUL_SIZE), args.B.reshape(MATMUL_SIZE, MATMUL_SIZE)
+        ).flatten()
+        golden_add = args.D + args.E
+        golden_mul = args.G * args.H
+        for t in [args.C, args.J, args.N]:
+            out = t.reshape(num_iters, TILE_ELEMS)
+            for i in range(num_iters):
+                out[i] = golden_matmul
+        for t in [args.F, args.K, args.L, args.O]:
+            out = t.reshape(num_iters, TILE_ELEMS)
+            for i in range(num_iters):
+                out[i] = golden_add
+        for t in [args.I, args.M]:
+            out = t.reshape(num_iters, TILE_ELEMS)
+            for i in range(num_iters):
+                out[i] = golden_mul
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/multi_round_paged_attention/test_multi_round_paged_attention.py b/tests/st/a2a3/fully_distributed_within_core/multi_round_paged_attention/test_multi_round_paged_attention.py
new file mode 100644
index 000000000..521cf7ef2
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/multi_round_paged_attention/test_multi_round_paged_attention.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Multi-round paged attention: benchmark multi-round execution (default 10 rounds).
+
+Run with --rounds 10 --skip-golden for benchmarking.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+_PA_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels"
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestMultiRoundPagedAttention(SceneTestCase):
+    RTOL = 1e-2
+    ATOL = 1e-2
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{_PA_KERNELS}/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": f"{_PA_KERNELS}/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": f"{_PA_KERNELS}/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": f"{_PA_KERNELS}/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": f"{_PA_KERNELS}/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "manual": True,
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "manual": True,
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 2,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "context_lens_list": [33, 17],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "CaseVarSeq4",
+            "platforms": ["a2a3sim", "a2a3"],
+            "manual": True,
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 4,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "context_lens_list": [33, 64, 128, 15],
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/orch_so_cache/test_orch_so_cache.py b/tests/st/a2a3/fully_distributed_within_core/orch_so_cache/test_orch_so_cache.py
new file mode 100644
index 000000000..b94820795
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/orch_so_cache/test_orch_so_cache.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""End-to-end coverage for the orchestration SO host-side cache.
+
+The host hashes the orchestration SO's GNU Build-ID, skips re-uploading bytes
+that already live on device, and tells AICPU to reuse the cached `dlopen`
+handle. The framework reuses one `Worker` (and therefore one `DeviceRunner`)
+across cases inside a `SceneTestCase`, so running multiple cases against the
+same `CALLABLE` exercises the cache-hit path on every case after the first.
+
+This test deliberately:
+  - Reuses the vector_example orchestration & AIV kernels (no new C++ to maintain).
+  - Spans three cases with different (a, b) inputs — proves cache hit doesn't
+    leak any per-run state across iterations.
+  - Uses the same tensor size (128*128) across all cases because the AIV
+    kernels have hardcoded tile shapes and do not accept a runtime size.
+  - Runs on both sim and hardware (sim DeviceRunner uses the same code path,
+    just with `mem_alloc_` returning host memory).
+
+Verification is purely outcome-based: every case must produce the correct
+result. A regression in cache logic (stale handle, wrong device buffer,
+missing dlopen on first run) shows up as wrong output or a runtime failure.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+_VECTOR_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestOrchSoCache(SceneTestCase):
+    """Same callable, three cases — case 0 misses the cache, cases 1-2 hit it."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    # Three cases sharing one callable. The framework iterates them on a
+    # single Worker; cases after the first land on cache-hit. Sizes vary so
+    # a stale handle would manifest as wrong output, not "happens to pass".
+    _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3}
+    _PLATFORMS = ["a2a3sim", "a2a3"]
+
+    # All cases use the same size (128*128) because the AIV kernels have
+    # hardcoded tile shapes (kTRows_=128, kTCols_=128) and do not read a
+    # runtime size argument — running with a smaller tensor would cause an
+    # out-of-bounds access.  Different (a, b) values are enough to verify
+    # that no per-run state leaks across cache-hit iterations.
+    CASES = [
+        {
+            "name": "first_miss",
+            "platforms": _PLATFORMS,
+            "config": _COMMON_CONFIG,
+            "params": {"size": 128 * 128, "a": 2.0, "b": 3.0},
+        },
+        {
+            "name": "second_hit",
+            "platforms": _PLATFORMS,
+            "config": _COMMON_CONFIG,
+            "params": {"size": 128 * 128, "a": 1.0, "b": 4.0},
+        },
+        {
+            "name": "third_hit",
+            "platforms": _PLATFORMS,
+            "config": _COMMON_CONFIG,
+            "params": {"size": 128 * 128, "a": 0.5, "b": 0.5},
+        },
+    ]
+
+    def generate_args(self, params):
+        size = params["size"]
+        a = params["a"]
+        b = params["b"]
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((size,), a, dtype=torch.float32)),
+            Tensor("b", torch.full((size,), b, dtype=torch.float32)),
+            Tensor("f", torch.zeros(size, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        # f = (a+b+1) * (a+b+2) + (a+b) — same formula as vector_example.
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..ec55f0377
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks
+//
+// Processes n_blocks blocks using SplitK accumulation pattern:
+//   Block 0: TMATMUL(C, A, B)       — initialize accumulator
+//   Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C
+//
+// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K)
+// Per-block vj addresses: value_cache base + block_indices lookup
+// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks
+//
+// Optimizations:
+//   - Double-buffered L1 tiles (ping/pong for A and B via MTE2)
+//   - Double-buffered L0 tiles (ping/pong for L0A and L0B via MTE1)
+//   - TLOAD(next) overlaps with TMATMUL(current) via MTE2/M-pipe parallelism
+//   - Canonical 3-stage pipeline: TLOAD(MTE2) → TMOV(MTE1) → TMATMUL(M)
+//   - Reverse-dependency events ensure buffer safety across iterations
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// pij is bfloat16 (from softmax_prepare TCVT).
+// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_n_impl(
+    __gm__ bfloat16_t *pij_base, __gm__ bfloat16_t *val_base, __gm__ float *oi_base, uint64_t n_blocks,
+    __gm__ int32_t *bt, uint64_t bt_offset
+) {
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // L1 memory layout: double-buffered A and B tiles (tightly packed)
+    constexpr int kATileBytes = M * K * static_cast<int>(sizeof(bfloat16_t));
+    constexpr int kBTileBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+
+    TileMatA aMatTile[2];
+    TileMatB bMatTile[2];
+    TASSIGN(aMatTile[0], 0x0);
+    TASSIGN(aMatTile[1], kATileBytes);
+    TASSIGN(bMatTile[0], 2 * kATileBytes);
+    TASSIGN(bMatTile[1], 2 * kATileBytes + kBTileBytes);
+
+    // L0 memory layout: double-buffered L0A and L0B, single accumulator L0C
+    LeftTile aTile[2];
+    RightTile bTile[2];
+    AccTile cTile;
+    TASSIGN(aTile[0], 0x0);
+    TASSIGN(aTile[1], kATileBytes);
+    TASSIGN(bTile[0], 0x0);
+    TASSIGN(bTile[1], kBTileBytes);
+    TASSIGN(cTile, 0x0);
+
+    GlobalOut oiGlobal(oi_base);
+
+    // Seed reverse-dependency flags: all ping/pong buffers initially free
+    //   PIPE_MTE1 → PIPE_MTE2: L1 buffer [0/1] safe for TLOAD to overwrite
+    //   PIPE_M    → PIPE_MTE1: L0 buffer [0/1] safe for TMOV to overwrite
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        int cur = static_cast<int>(i % 2);
+        GlobalA pijGlobal(pij_base + i * M * K);
+        GlobalB vjGlobal(val_base + bt[bt_offset + i] * K * N);
+
+        // Stage 1: TLOAD (MTE2: GM → L1[cur])
+        // Wait for MTE1 to release L1[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));
+        TLOAD(aMatTile[cur], pijGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: A in L1 ready
+        TLOAD(bMatTile[cur], vjGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: B in L1 ready
+
+        // Stage 2: TMOV (MTE1: L1[cur] → L0[cur])
+        // Wait for M-pipe to release L0[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: wait A loaded
+        TMOV(aTile[cur], aMatTile[cur]);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: wait B loaded
+        TMOV(bTile[cur], bMatTile[cur]);
+        set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));  // reverse: release L1[cur]
+
+        // Stage 3: TMATMUL (M-pipe: L0A[cur] × L0B[cur] → L0C)
+        set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));  // forward: L0[cur] ready
+        wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));
+        if (i == 0) {
+            TMATMUL(cTile, aTile[cur], bTile[cur]);
+        } else {
+            TMATMUL_ACC(cTile, cTile, aTile[cur], bTile[cur]);
+        }
+        set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));  // reverse: release L0[cur]
+    }
+
+    // Drain outstanding reverse-dependency flags
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(oiGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr);
+    __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(pij_buf->shapes[0]);
+
+    if (q_tile_size == 16) {
+        pv_matmul_n_impl<16, 128, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset);
+    } else {
+        pv_matmul_n_impl<64, 64, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset);
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto)
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..20ec20e73
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block
+//
+// Processes n_blocks blocks in a single kernel invocation.
+// Per-block kj addresses computed from key_cache base + block_indices lookup.
+// qi is shared across all blocks (same query head against different key blocks).
+//
+// Output layout: n_blocks contiguous (M, N) tiles stacked vertically.
+// Block i occupies sij[i*M : (i+1)*M, 0:N].
+//
+// Optimizations:
+//   - qi TLOAD hoisted before the loop (constant across all iterations)
+//   - Double-buffered L1 B tiles: prefetch next kj during current TMATMUL+TSTORE
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// Template: M=q_tile, K=head_dim, N=block_size
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+// NOLINTNEXTLINE(build/namespaces)
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_n_impl(
+    __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ float *sij_base, uint64_t n_blocks,
+    __gm__ int32_t *bt, uint64_t bt_offset
+) {
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // Double-buffered L1 B tiles for kj prefetching
+    constexpr int kBBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+    TileMatA aMatTile;
+    TileMatB bMatTile_A;
+    TileMatB bMatTile_B;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile_A, 0x20000);
+    TASSIGN(bMatTile_B, 0x20000 + kBBytes);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Hoist qi TLOAD before the loop (qi is constant across all blocks)
+    GlobalA qiGlobal(qi_base);
+    TLOAD(aMatTile, qiGlobal);
+
+    // Pre-load first kj into buffer A
+    GlobalB kjGlobal_0(key_base + bt[bt_offset + 0] * N * K);
+    TLOAD(bMatTile_A, kjGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalOut sijGlobal(sij_base + i * M * N);
+
+        // Wait for current kj TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        // TMOV qi L1→L0A and kj L1→L0B from current buffer
+        TMOV(aTile, aMatTile);
+        if (i % 2 == 0) {
+            TMOV(bTile, bMatTile_A);
+        } else {
+            TMOV(bTile, bMatTile_B);
+        }
+
+        // Prefetch next kj into alternate L1 buffer (overlaps with MTE1→M→FIX)
+        if (i + 1 < n_blocks) {
+            GlobalB kjGlobal_next(key_base + bt[bt_offset + i + 1] * N * K);
+            if (i % 2 == 0) {
+                TLOAD(bMatTile_B, kjGlobal_next);
+            } else {
+                TLOAD(bMatTile_A, kjGlobal_next);
+            }
+        }
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(sijGlobal, cTile);
+
+        if (i + 1 < n_blocks) {
+            // Drain all pipes before next iteration:
+            //   - FIX/MTE3: ensures TSTORE data path (L0C→UB→GM) fully completes
+            //   - MTE2: prefetch TLOAD likely already done (ran during TMATMUL+TSTORE)
+            // The prefetch TLOAD overlaps with compute, so barrier cost is minimal.
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset;
+    __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr);
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
+
+    if (q_tile_size == 16) {
+        qk_matmul_n_impl<16, 128, 128>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset);
+    } else {
+        qk_matmul_n_impl<64, 128, 64>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset);
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..df4b5a726
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Online Softmax Update + Normalize Kernel (AIV)
+//
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+//
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_impl(
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
+) {
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
+    // Aligned rows for ColMajor DN tiles (32-byte alignment)
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    // --- GlobalTensor types ---
+
+    // Data (M, N) RowMajor
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- GlobalTensor instances ---
+
+    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
+    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
+
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
+
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
+    // --- Tile types ---
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+
+    // --- UB memory layout ---
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Data tiles
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
+
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+    if (is_first) {
+        // --- First block: copy inputs to accumulators ---
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to same UB as DN tiles for ND-format store
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(miGlobalND, mijND);    // mi = mij
+        TSTORE(liGlobalND, lijND);    // li = lij
+        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
+
+        if (is_last) {
+            // Single block: normalize dst = oi_new / lij
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TSTORE(dstGlobal, oiNewTile);
+        }
+    } else {
+        // --- Subsequent blocks: accumulate ---
+
+        // Load all inputs as DN (ColMajor)
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(oiTile, oiGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
+        pipe_barrier(PIPE_V);
+        // alphaRow and betaRow write to independent UB addresses; both only read miNewRow
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
+        pipe_barrier(PIPE_V);
+        // TEXP on independent UB addresses
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
+        TEXP(betaRow, betaRow);    // beta = exp(mij - mi_new)
+        pipe_barrier(PIPE_V);
+        // tmpRow and liNewRow write to independent UB addresses
+        TMUL(tmpRow, alphaRow, liRow);    // alpha * li
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
+        pipe_barrier(PIPE_V);
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        pipe_barrier(PIPE_V);
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
+        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
+        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
+        pipe_barrier(PIPE_V);
+        TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
+
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
+        if (is_last) {
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(dstGlobal, oiTile);
+        } else {
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(oiGlobal, oiTile);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+    // args[10] = head_dim (128)
+
+    if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..aa221fa5c
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Two-Pass Softmax Kernel (AIV) for n_blocks tiles
+//
+// Input:  sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically
+// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block
+//         mij (M,) fp32 — global row max across all blocks
+//         lij (M,) fp32 — total row sum across all blocks
+//
+// Pass 1: Iterate over n_blocks tiles, mask last block,
+//         find global m = scale * max over all blocks of rowmax(S_i)
+//         Defers scale to after the loop (single M-element TMULS vs n_blocks M×N).
+//         Uses double-buffered sij tiles and TRESHAPE for DN↔Row conversion.
+// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16,
+//         accumulate l = sum over all blocks of rowsum(P_i)
+//         Uses double-buffered sij tiles to overlap TLOAD with computation.
+//
+// Two-pass ensures all P_i tiles share the same scale (global max),
+// enabling direct TMATMUL_ACC accumulation in the PV kernel.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: M=16, N=128 (q_tile=16, block_size=128)
+//   Case2: M=64, N=64  (q_tile=64, block_size=64)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_n_impl(
+    __gm__ float *sij_base, float scale_value, __gm__ bfloat16_t *pij_base, __gm__ float *mij_addr,
+    __gm__ float *lij_addr, uint64_t n_blocks, uint64_t valid_len_last
+) {
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+
+    // --- GlobalTensor types ---
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- Tile types ---
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // --- UB memory layout (double-buffered sij) ---
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Double-buffered sij tiles
+    TileVecMxN sijTile_A;
+    TileSijPad sijPadTile_A;
+    TileVecMxN sijTile_B;
+    TileSijPad sijPadTile_B;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileVecMxN sumAccTile;
+    TileScalarDN localMaxDN;
+    TileScalarDN globalMaxDN;
+    TileScalarDN sumDN;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    // TRESHAPE aliases (same UB address as their DN counterparts)
+    TileScalarRow localMaxRow;
+    TileScalarRow globalMaxRow;
+
+    // ND alias for storing globalMax to GM
+    TileScalarND globalMaxND;
+
+    TASSIGN(sijTile_A, 0x0);
+    TASSIGN(sijPadTile_A, 0x0);
+    TASSIGN(sijTile_B, kDataBytes);
+    TASSIGN(sijPadTile_B, kDataBytes);
+    TASSIGN(pijTile, 2 * kDataBytes);
+    TASSIGN(tmpTile, 3 * kDataBytes);
+    TASSIGN(sumAccTile, 4 * kDataBytes);
+    int scalarBase = 5 * kDataBytes;
+    TASSIGN(localMaxDN, scalarBase);
+    TASSIGN(localMaxRow, scalarBase);  // alias: same UB as localMaxDN
+    TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes);
+    TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes);  // alias: same UB as globalMaxDN
+    TASSIGN(globalMaxND, scalarBase + kScalarDNBytes);   // alias: same UB as globalMaxDN
+    TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes);
+    TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes);
+
+    // GM aliases (mij/lij output buffers)
+    GlobalScalarND mijGlobalND(mij_addr);
+    GlobalScalarDN lijGlobalDN(lij_addr);
+
+    // ======== Pass 1: Find global row max (unscaled) with double-buffered sij ========
+    // rowmax(S*scale) = scale * rowmax(S) since scale > 0, so defer scale to after loop.
+    GlobalDataMxN sijGlobal_p1_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_p1_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn sijDynTile(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(sijDynTile, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, sijDynTile);
+            } else {
+                TASSIGN(sijDynTile, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, sijDynTile);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute unscaled TROWMAX on current buffer
+        if (i % 2 == 0) {
+            TROWMAX(localMaxDN, sijTile_A, tmpTile);
+        } else {
+            TROWMAX(localMaxDN, sijTile_B, tmpTile);
+        }
+        pipe_barrier(PIPE_V);
+
+        // Prefetch next sij into alternate buffer (overlaps with V pipe scalar ops)
+        if (i + 1 < n_blocks) {
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise TMAX
+        TRESHAPE(localMaxRow, localMaxDN);
+        if (i == 0) {
+            TMAX(globalMaxRow, localMaxRow, localMaxRow);
+        } else {
+            TMAX(globalMaxRow, globalMaxRow, localMaxRow);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+    // Apply scale once to the global max vector (M elements, not n_blocks × M × N)
+    TMULS(globalMaxRow, globalMaxRow, scale_value);
+    pipe_barrier(PIPE_V);
+
+    // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB
+    TRESHAPE(globalMaxDN, globalMaxRow);
+
+    // Store final global max to mij for online_update to consume
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(mijGlobalND, globalMaxND);
+
+    // ======== Pass 2: Compute softmax with double-buffered sij ========
+    // globalMaxDN is already in UB from TRESHAPE — no reload needed.
+    // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD.
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+
+    // Pre-load first sij tile into buffer A
+    GlobalDataMxN sijGlobal_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N);
+
+        // Wait for current tile's TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TFILLPAD on current buffer if last block with partial valid length
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn curSijDyn(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(curSijDyn, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, curSijDyn);
+            } else {
+                TASSIGN(curSijDyn, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, curSijDyn);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute on current buffer (select A or B based on iteration parity)
+        if (i % 2 == 0) {
+            TMULS(sijTile_A, sijTile_A, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN);
+        } else {
+            TMULS(sijTile_B, sijTile_B, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN);
+        }
+        pipe_barrier(PIPE_V);
+        TEXP(pijTile, pijTile);
+        pipe_barrier(PIPE_V);
+        TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+        pipe_barrier(PIPE_V);
+        TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+
+        pipe_barrier(PIPE_V);
+        if (i == 0) {
+            TMULS(sumAccTile, pijTile, 1.0f);
+        } else {
+            TADD(sumAccTile, sumAccTile, pijTile);
+        }
+
+        // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile)
+        pipe_barrier(PIPE_V);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(pijGlobal, pijBf16Tile);
+
+        // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race)
+        if (i + 1 < n_blocks) {
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+    }
+
+    // Compute final row sum from accumulated pij values
+    pipe_barrier(PIPE_V);
+    TROWSUM(sumDN, sumAccTile, tmpTile);
+
+    // Store lij (total sum). mij already stored after Pass 1.
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(lijGlobalDN, sumDN);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t n_blocks = static_cast<uint64_t>(args[5]);
+    uint64_t valid_len_last = static_cast<uint64_t>(args[6]);
+
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset;
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset;
+
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_buf->shapes[0]);
+
+    if (q_tile_size == 16) {
+        softmax_prepare_n_impl<16, 128>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last);
+    } else {
+        softmax_prepare_n_impl<64, 64>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..0978073d9
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Paged Attention Orchestration Function V2 - N_UNROLL=8, 4 Tasks Per Group
+ *
+ * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks:
+ *   1. QK matmul:  qi @ K^T for n_blocks → sij_buf (q_tile, n_blocks * block_size)
+ *   2. Softmax:    two-pass over sij_buf → pij_buf, mi, li
+ *   3. PV matmul:  SplitK accumulated P @ V → oi_new (q_tile, head_dim)
+ *   4. Update:     online softmax accumulation with group-level mi, li, oi_new
+ *
+ * Memory Layout:
+ *   Query: (batch * num_heads, head_dim) bf16
+ *   Key:   (total_blocks, block_size, head_dim) bf16 (stored as K^T for QK)
+ *   Value: (total_blocks, block_size, head_dim) bf16
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+#define N_UNROLL 64
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
+
+inline double cycles_to_us(uint64_t cycles) {
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
+}
+
+inline uint64_t get_sys_cnt_aicpu() {
+    uint64_t ticks;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+    return ticks;
+}
+
+#ifdef ENABLE_PROFILING
+struct ProfCounters {
+    uint64_t param_extract = 0;
+    uint64_t ext_tensor = 0;
+    uint64_t make_tensor = 0;
+    uint64_t tensor_view = 0;
+    uint64_t param_setup = 0;
+    uint64_t submit_task = 0;
+    uint64_t scope_and_loop = 0;
+    int submit_count = 0;
+    int make_count = 0;
+    int view_count = 0;
+    // Running lap timestamps. File-global so the lap timeline stays continuous
+    // across the entry/process_qtile_scope() boundary — orchestration runs on a
+    // single thread, so a shared counter needs no synchronization.
+    uint64_t t0 = 0;
+    uint64_t t1 = 0;
+};
+static ProfCounters g_prof;
+#define CYCLE_COUNT_START() (g_prof.t0 = get_sys_cnt_aicpu())
+#define CYCLE_COUNT_LAP(acc)              \
+    do {                                  \
+        g_prof.t1 = get_sys_cnt_aicpu();  \
+        (acc) += (g_prof.t1 - g_prof.t0); \
+        g_prof.t0 = g_prof.t1;            \
+    } while (0)
+#else
+#define CYCLE_COUNT_START() (void)0
+#define CYCLE_COUNT_LAP(acc) (void)0
+#endif
+
+/**
+ * Submit the QK -> softmax -> PV -> update task chain for one (batch, q-tile) unit.
+ *
+ * All context is passed positionally through a transport `Arg` (built by the
+ * caller, never submitted — only its slots are read back here). Every tensor
+ * slot is a materialized Tensor; the Arg carries no TensorCreateInfo (the
+ * scope's create-infos are rebuilt locally from the q_tile/head_dim scalars):
+ *   tensors: 0 query, 1 key_cache, 2 value_cache, 3 block_table (inputs),
+ *            4 out (output buffer the update task writes — add_output(Tensor))
+ *   scalars: 0 b_idx, 1 q_idx, 2 q_head_num, 3 q_tile, 4 head_dim,
+ *            5 block_size, 6 block_num, 7 scale_value, 8 bn_this_batch,
+ *            9 cur_seq, 10 data_type
+ * Adding/removing a slot here must be mirrored at the caller's build site.
+ *
+ * Must run inside a PTO2_SCOPE: the alloc'd / submitted tensors it references
+ * do not outlive that scope.
+ */
+static void process_qtile_scope(const L0TaskArgs &ctx) {
+    const Tensor &query = ctx.tensor(0).ref();
+    const Tensor &key_cache = ctx.tensor(1).ref();
+    const Tensor &value_cache = ctx.tensor(2).ref();
+    const Tensor &block_table = ctx.tensor(3).ref();
+    const Tensor &out = ctx.tensor(4).ref();
+    uint64_t b_idx = ctx.scalar(0);
+    uint64_t q_idx = ctx.scalar(1);
+    uint64_t q_head_num = ctx.scalar(2);
+    uint64_t q_tile = ctx.scalar(3);
+    uint64_t head_dim = ctx.scalar(4);
+    uint64_t block_size = ctx.scalar(5);
+    uint64_t block_num = ctx.scalar(6);
+    uint64_t scale_value = ctx.scalar(7);
+    uint64_t bn_this_batch = ctx.scalar(8);
+    uint64_t cur_seq = ctx.scalar(9);
+    DataType data_type = static_cast<DataType>(ctx.scalar(10));
+
+    CYCLE_COUNT_START();
+
+    // Create infos for the per-scope accumulators — shapes depend only on
+    // q_tile/head_dim, so build once before the block loop. Kept out of the
+    // transport Arg, which carries only materialized Tensors.
+    uint32_t oi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t li_shapes[1] = {static_cast<uint32_t>(q_tile)};
+    TensorCreateInfo tile2d_ci(oi_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(li_shapes, 1, DataType::FLOAT32);
+#ifdef ENABLE_PROFILING
+    g_prof.make_count += 2;
+    CYCLE_COUNT_LAP(g_prof.make_tensor);
+#endif
+
+    uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
+
+    uint32_t qi_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+    Tensor qi = query.view(qi_shapes, qi_offsets);
+    uint32_t out_view_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+    Tensor out_view = out.view(out_view_shapes, out_view_offsets, true);
+#ifdef ENABLE_PROFILING
+    g_prof.view_count += 2;
+    CYCLE_COUNT_LAP(g_prof.tensor_view);
+#endif
+    CYCLE_COUNT_LAP(g_prof.param_setup);
+    TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
+    const Tensor &oi = alloc_outs.get_ref(0);
+    const Tensor &li_update = alloc_outs.get_ref(1);
+    const Tensor &mi_update = alloc_outs.get_ref(2);
+#ifdef ENABLE_PROFILING
+    g_prof.submit_count++;
+    CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+    // Reusable Arg objects — reset() before each use avoids
+    // repeated stack-frame construction in the inner loop.
+    L0TaskArgs params_qk, params_sf, params_pv, params_up;
+
+    for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) {
+        uint64_t n_blocks = std::min(static_cast<uint64_t>(N_UNROLL), bn_this_batch - bn);
+
+        // Valid length for last block in this group
+        uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size;
+        uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start);
+        CYCLE_COUNT_LAP(g_prof.param_extract);
+
+        // === Task 1: Batched QK matmul ===
+        uint32_t sij_buf_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)};
+        TensorCreateInfo sij_buf_ci(sij_buf_shapes, 2, DataType::FLOAT32);
+#ifdef ENABLE_PROFILING
+        g_prof.make_count += 1;
+        CYCLE_COUNT_LAP(g_prof.make_tensor);
+#endif
+
+        params_qk.reset();
+        params_qk.add_input(qi, key_cache, block_table);
+        params_qk.add_output(sij_buf_ci);
+        params_qk.add_scalar(n_blocks, b_idx * block_num + bn);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+        const Tensor &sij_buf = qk_outs.get_ref(0);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+        // === Task 2: Two-pass softmax over all blocks in group ===
+        uint32_t pij_buf_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)};
+        TensorCreateInfo pij_buf_ci(pij_buf_shapes, 2, data_type);
+#ifdef ENABLE_PROFILING
+        g_prof.make_count += 1;
+        CYCLE_COUNT_LAP(g_prof.make_tensor);
+#endif
+
+        params_sf.reset();
+        params_sf.add_input(sij_buf);
+        params_sf.add_output(pij_buf_ci, scalar_ci, scalar_ci);
+        params_sf.add_scalar(scale_value, n_blocks, valid_len_last);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+        const Tensor &pij_buf = sf_outs.get_ref(0);
+        const Tensor &mi = sf_outs.get_ref(1);
+        const Tensor &li = sf_outs.get_ref(2);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+        // === Task 3: SplitK PV matmul (accumulated P @ V) ===
+        params_pv.reset();
+        params_pv.add_input(pij_buf, value_cache, block_table);
+        params_pv.add_output(tile2d_ci);
+        params_pv.add_scalar(n_blocks, b_idx * block_num + bn);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+        const Tensor &oi_new = pv_outs.get_ref(0);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+
+        // === Task 4: Online update (per-group) ===
+        uint64_t is_first = (bn == 0) ? 1 : 0;
+        uint64_t is_last = (bn + n_blocks >= bn_this_batch) ? 1 : 0;
+
+        params_up.reset();
+        params_up.add_input(mi, li, oi_new);
+        params_up.add_inout(mi_update, li_update, oi, out_view);
+        params_up.add_scalar(is_first, is_last);
+        CYCLE_COUNT_LAP(g_prof.param_setup);
+        rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+#ifdef ENABLE_PROFILING
+        g_prof.submit_count++;
+        CYCLE_COUNT_LAP(g_prof.submit_task);
+#endif
+    }
+}
+
+extern "C" {
+/**
+ * Orchestration config — the executor reads these values to set up
+ * shared memory and runtime before calling aicpu_orchestration_entry.
+ */
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+#ifdef ENABLE_PROFILING
+    g_prof = ProfCounters{};  // reset per entry — single-threaded orchestration
+#endif
+
+    CYCLE_COUNT_START();
+
+    // Read dimensions from tensor metadata
+    // query: shape=[batch, num_heads, head_dim]
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim]
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+
+    // block_table: shape=[batch, max_num_blocks_per_req]
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    // scale from scalar arg
+    uint64_t scale_value = orch_args.scalar(0);
+    uint64_t q_head_num = num_heads;
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
+    CYCLE_COUNT_LAP(g_prof.param_extract);
+
+    // Reshape tensors for kernel consumption (2D flattened)
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+#ifdef ENABLE_PROFILING
+    CYCLE_COUNT_LAP(g_prof.ext_tensor);
+#endif
+
+    // Transport Arg reused across iterations — packs the scope's context for
+    // process_qtile_scope(); see that function for the positional slot layout.
+    // It carries only materialized Tensors (no TensorCreateInfo); the scope's
+    // create-infos are rebuilt inside the helper from the q_tile/head_dim scalars.
+    L0TaskArgs ctx;
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            CYCLE_COUNT_LAP(g_prof.scope_and_loop);
+
+            ctx.reset();
+            ctx.add_input(query, key_cache, value_cache, block_table);
+            ctx.add_output(out);
+            ctx.add_scalar(
+                b_idx, q_idx, q_head_num, q_tile, head_dim, block_size, block_num, scale_value, bn_this_batch, cur_seq,
+                static_cast<uint64_t>(data_type)
+            );
+
+            PTO2_SCOPE() { process_qtile_scope(ctx); }
+        }
+    }
+    CYCLE_COUNT_LAP(g_prof.scope_and_loop);
+
+#ifdef ENABLE_PROFILING
+    uint64_t total = g_prof.param_extract + g_prof.ext_tensor + g_prof.make_tensor + g_prof.tensor_view +
+                     g_prof.param_setup + g_prof.submit_task + g_prof.scope_and_loop;
+    LOG_INFO_V9(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", g_prof.submit_count,
+        g_prof.make_count, g_prof.view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_INFO_V9(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.param_extract),
+            g_prof.param_extract * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.ext_tensor), g_prof.ext_tensor * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", g_prof.make_count, cycles_to_us(g_prof.make_tensor),
+            g_prof.make_tensor * 100.0 / total,
+            g_prof.make_count > 0 ? cycles_to_us(g_prof.make_tensor) / g_prof.make_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", g_prof.view_count, cycles_to_us(g_prof.tensor_view),
+            g_prof.tensor_view * 100.0 / total,
+            g_prof.view_count > 0 ? cycles_to_us(g_prof.tensor_view) / g_prof.view_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.param_setup),
+            g_prof.param_setup * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", g_prof.submit_count, cycles_to_us(g_prof.submit_task),
+            g_prof.submit_task * 100.0 / total,
+            g_prof.submit_count > 0 ? cycles_to_us(g_prof.submit_task) / g_prof.submit_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  scope_and_loop   : %7.3fus (%5.1f%%)", cycles_to_us(g_prof.scope_and_loop),
+            g_prof.scope_and_loop * 100.0 / total
+        );
+    }
+#endif
+
+#undef CYCLE_COUNT_START
+#undef CYCLE_COUNT_LAP
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py
new file mode 100644
index 000000000..5135a2ed2
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention unroll: production-scale with unrolled orchestration."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttentionUnroll(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "QK",
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SF",
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "name": "PV",
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "name": "UP",
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..779a986c3
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks
+//
+// Processes n_blocks blocks using SplitK accumulation pattern:
+//   Block 0: TMATMUL(C, A, B)       — initialize accumulator
+//   Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C
+//
+// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K)
+// Per-block vj addresses: value_cache base + block_indices lookup
+// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks
+//
+// Optimizations:
+//   - Double-buffered L1 tiles (ping/pong for A and B via MTE2)
+//   - Double-buffered L0 tiles (ping/pong for L0A and L0B via MTE1)
+//   - TLOAD(next) overlaps with TMATMUL(current) via MTE2/M-pipe parallelism
+//   - Canonical 3-stage pipeline: TLOAD(MTE2) → TMOV(MTE1) → TMATMUL(M)
+//   - Reverse-dependency events ensure buffer safety across iterations
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
+//
+// pij is bfloat16 (from softmax_prepare TCVT).
+// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_n_impl(
+    __gm__ Tensor *pij_buf, __gm__ Tensor *value_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *oi_new,
+    uint64_t n_blocks, uint64_t bt_offset
+) {
+    // Decode 4D semantic: batch/q_len are constexpr 1.
+    static constexpr int BATCH = 1;
+    static constexpr int Q_LEN = 1;
+
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr);
+    __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, BATCH, Q_LEN, M, K>, Stride<1, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, BATCH, Q_LEN, M, N>, Stride<1, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    // L1 memory layout: double-buffered A and B tiles (tightly packed)
+    constexpr int kATileBytes = M * K * static_cast<int>(sizeof(bfloat16_t));
+    constexpr int kBTileBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+
+    TileMatA aMatTile[2];
+    TileMatB bMatTile[2];
+    TASSIGN(aMatTile[0], 0x0);
+    TASSIGN(aMatTile[1], kATileBytes);
+    TASSIGN(bMatTile[0], 2 * kATileBytes);
+    TASSIGN(bMatTile[1], 2 * kATileBytes + kBTileBytes);
+
+    // L0 memory layout: double-buffered L0A and L0B, single accumulator L0C
+    LeftTile aTile[2];
+    RightTile bTile[2];
+    AccTile cTile;
+    TASSIGN(aTile[0], 0x0);
+    TASSIGN(aTile[1], kATileBytes);
+    TASSIGN(bTile[0], 0x0);
+    TASSIGN(bTile[1], kBTileBytes);
+    TASSIGN(cTile, 0x0);
+
+    GlobalOut oiGlobal(oi_base);
+
+    // Seed reverse-dependency flags: all ping/pong buffers initially free
+    //   PIPE_MTE1 → PIPE_MTE2: L1 buffer [0/1] safe for TLOAD to overwrite
+    //   PIPE_M    → PIPE_MTE1: L0 buffer [0/1] safe for TMOV to overwrite
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        int cur = static_cast<int>(i % 2);
+        GlobalA pijGlobal(pij_base + i * M * K);
+        GlobalB vjGlobal(val_base + bt[bt_offset + i] * K * N);
+
+        // Stage 1: TLOAD (MTE2: GM → L1[cur])
+        // Wait for MTE1 to release L1[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));
+        TLOAD(aMatTile[cur], pijGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: A in L1 ready
+        TLOAD(bMatTile[cur], vjGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: B in L1 ready
+
+        // Stage 2: TMOV (MTE1: L1[cur] → L0[cur])
+        // Wait for M-pipe to release L0[cur] (reverse dep from previous iteration)
+        wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // forward: wait A loaded
+        TMOV(aTile[cur], aMatTile[cur]);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // forward: wait B loaded
+        TMOV(bTile[cur], bMatTile[cur]);
+        set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur));  // reverse: release L1[cur]
+
+        // Stage 3: TMATMUL (M-pipe: L0A[cur] × L0B[cur] → L0C)
+        set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));  // forward: L0[cur] ready
+        wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur));
+        if (i == 0) {
+            TMATMUL(cTile, aTile[cur], bTile[cur]);
+        } else {
+            TMATMUL_ACC(cTile, cTile, aTile[cur], bTile[cur]);
+        }
+        set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur));  // reverse: release L0[cur]
+    }
+
+    // Drain outstanding reverse-dependency flags
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(oiGlobal, cTile);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    // pij_buf is 4D (1, 1, q_tile, n_blocks*block_size) to match qk's 4D output.
+    uint64_t q_tile_size = static_cast<uint64_t>(pij_buf->shapes[2]);
+
+    if (q_tile_size == 16) {
+        pv_matmul_n_impl<16, 128, 128>(pij_buf, value_cache, block_table_t, oi_new, n_blocks, bt_offset);
+    } else {
+        pv_matmul_n_impl<64, 64, 128>(pij_buf, value_cache, block_table_t, oi_new, n_blocks, bt_offset);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..eb2bac50f
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block
+//
+// Processes n_blocks blocks in a single kernel invocation.
+// Per-block kj addresses computed from key_cache base + block_indices lookup.
+// qi is shared across all blocks (same query head against different key blocks).
+//
+// Output layout: n_blocks contiguous (M, N) tiles stacked vertically.
+// Block i occupies sij[i*M : (i+1)*M, 0:N].
+//
+// Optimizations:
+//   - qi TLOAD hoisted before the loop (constant across all iterations)
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
+//
+// Template: M=q_tile, K=head_dim, N=block_size
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_n_impl(
+    __gm__ Tensor *qi, __gm__ Tensor *key_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *sij_buf,
+    uint64_t n_blocks, uint64_t bt_offset
+) {
+    // Decode 4D query view: batch/q_len are constexpr 1.
+    static constexpr int BATCH = 1;
+    static constexpr int Q_LEN = 1;
+
+    __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset;
+    __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr);
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
+
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, BATCH, Q_LEN, M, K>, Stride<1, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, BATCH, Q_LEN, M, N>, Stride<1, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    // Hoist qi TLOAD before the loop (qi is constant across all blocks)
+    GlobalA qiGlobal(qi_base);
+    TLOAD(aMatTile, qiGlobal);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalB kjGlobal(key_base + bt[bt_offset + i] * N * K);
+        GlobalOut sijGlobal(sij_base + i * M * N);
+
+        // Load only B each iteration (qi already in L1 from hoist)
+        TLOAD(bMatTile, kjGlobal);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        // TMOV qi from L1→L0A (re-copy since TMATMUL consumed L0A) and kj from L1→L0B
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(sijGlobal, cTile);
+
+        if (i + 1 < n_blocks) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    uint64_t n_blocks = static_cast<uint64_t>(args[4]);
+    uint64_t bt_offset = static_cast<uint64_t>(args[5]);
+
+    // qi is a 4D view (batch, q_len, num_heads_tile, head_dim); decode fixes batch=q_len=1.
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[2]);
+
+    if (q_tile_size == 16) {
+        qk_matmul_n_impl<16, 128, 128>(qi, key_cache, block_table_t, sij_buf, n_blocks, bt_offset);
+    } else {
+        qk_matmul_n_impl<64, 128, 64>(qi, key_cache, block_table_t, sij_buf, n_blocks, bt_offset);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..b1e8110d2
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Online Softmax Update + Normalize Kernel (AIV)
+//
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+//
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_impl(
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
+) {
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
+    // Aligned rows for ColMajor DN tiles (32-byte alignment)
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    // --- GlobalTensor types ---
+
+    // Decode 4D semantic: batch/q_len are constexpr 1.
+    static constexpr int BATCH = 1;
+    static constexpr int Q_LEN = 1;
+
+    // 4D data views (1, 1, q_tile, head_dim) — oi, oi_new, dst.
+    using GlobalData4D = GlobalTensor<float, Shape<1, BATCH, Q_LEN, M, N>, Stride<1, M * N, M * N, N, 1>>;
+
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- GlobalTensor instances ---
+
+    GlobalData4D oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalData4D oiGlobal(oi_ptr + oi->start_offset);
+    GlobalData4D dstGlobal(dst_ptr + dst->start_offset);
+
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
+
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
+    // --- Tile types ---
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+
+    // --- UB memory layout ---
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Data tiles
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
+
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+    if (is_first) {
+        // --- First block: copy inputs to accumulators ---
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to same UB as DN tiles for ND-format store
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(miGlobalND, mijND);    // mi = mij
+        TSTORE(liGlobalND, lijND);    // li = lij
+        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
+
+        if (is_last) {
+            // Single block: normalize dst = oi_new / lij
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TSTORE(dstGlobal, oiNewTile);
+        }
+    } else {
+        // --- Subsequent blocks: accumulate ---
+
+        // Load all inputs as DN (ColMajor)
+        TLOAD(oiNewTile, oiNewGlobal);
+        TLOAD(oiTile, oiGlobal);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
+        pipe_barrier(PIPE_V);
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
+        pipe_barrier(PIPE_V);
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
+        pipe_barrier(PIPE_V);
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
+        pipe_barrier(PIPE_V);
+        TEXP(betaRow, betaRow);  // beta = exp(mij - mi_new)
+        pipe_barrier(PIPE_V);
+        TMUL(tmpRow, alphaRow, liRow);  // alpha * li
+        pipe_barrier(PIPE_V);
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
+        pipe_barrier(PIPE_V);
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        pipe_barrier(PIPE_V);
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
+        TROWEXPANDMUL(oiTile, oiTile, alphaDN);  // oi *= alpha
+        pipe_barrier(PIPE_V);
+        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
+        pipe_barrier(PIPE_V);
+        TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
+
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
+        if (is_last) {
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(dstGlobal, oiTile);
+        } else {
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
+            TSTORE(oiGlobal, oiTile);
+        }
+    }
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    // mij is 3D (1, 1, q_tile) to match softmax's 3D scalar output.
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[2]);
+
+    if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..a74ca3577
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Two-Pass Softmax Kernel (AIV) for n_blocks tiles
+//
+// Input:  sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically
+// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block
+//         mij (M,) fp32 — global row max across all blocks
+//         lij (M,) fp32 — total row sum across all blocks
+//
+// Pass 1: Iterate over n_blocks tiles, apply scale, mask last block,
+//         find global m = max over all blocks of rowmax(S_i * scale)
+//         Uses TRESHAPE for DN↔Row conversion to keep globalMax in UB
+//         (eliminates 63 × 4 GM round-trip operations).
+// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16,
+//         accumulate l = sum over all blocks of rowsum(P_i)
+//         Uses double-buffered sij tiles to overlap TLOAD with computation.
+//
+// Two-pass ensures all P_i tiles share the same scale (global max),
+// enabling direct TMATMUL_ACC accumulation in the PV kernel.
+//
+// Supports two tile configurations via runtime dispatch:
+//   Case1: M=16, N=128 (q_tile=16, block_size=128)
+//   Case2: M=64, N=64  (q_tile=64, block_size=64)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_n_impl(
+    __gm__ Tensor *sij_buf, __gm__ Tensor *pij_buf, __gm__ Tensor *mij, __gm__ Tensor *lij, float scale_value,
+    uint64_t n_blocks, uint64_t valid_len_last
+) {
+    __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset;
+    __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset;
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset;
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset;
+
+    // Decode 4D semantic: batch/q_len are constexpr 1.
+    static constexpr int BATCH = 1;
+    static constexpr int Q_LEN = 1;
+
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+
+    // --- GlobalTensor types ---
+    // 4D data views (1, 1, q_tile, n_blocks*block_size) for sij/pij.
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, BATCH, Q_LEN, M, N>, Stride<1, M * N, M * N, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, BATCH, Q_LEN, M, N>, Stride<1, M * N, M * N, N, 1>>;
+    // DN/ND scalar globals stay 2D: scalar vectors only need per-element layout.
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+
+    // --- Tile types ---
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // --- UB memory layout (double-buffered sij) ---
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    // Double-buffered sij tiles
+    TileVecMxN sijTile_A;
+    TileSijPad sijPadTile_A;
+    TileVecMxN sijTile_B;
+    TileSijPad sijPadTile_B;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileVecMxN sumAccTile;
+    TileScalarDN localMaxDN;
+    TileScalarDN globalMaxDN;
+    TileScalarDN sumDN;
+    TileVecMxN_bf16 pijBf16Tile;
+
+    // TRESHAPE aliases (same UB address as their DN counterparts)
+    TileScalarRow localMaxRow;
+    TileScalarRow globalMaxRow;
+
+    // ND alias for storing globalMax to GM
+    TileScalarND globalMaxND;
+
+    TASSIGN(sijTile_A, 0x0);
+    TASSIGN(sijPadTile_A, 0x0);
+    TASSIGN(sijTile_B, kDataBytes);
+    TASSIGN(sijPadTile_B, kDataBytes);
+    TASSIGN(pijTile, 2 * kDataBytes);
+    TASSIGN(tmpTile, 3 * kDataBytes);
+    TASSIGN(sumAccTile, 4 * kDataBytes);
+    int scalarBase = 5 * kDataBytes;
+    TASSIGN(localMaxDN, scalarBase);
+    TASSIGN(localMaxRow, scalarBase);  // alias: same UB as localMaxDN
+    TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes);
+    TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes);  // alias: same UB as globalMaxDN
+    TASSIGN(globalMaxND, scalarBase + kScalarDNBytes);   // alias: same UB as globalMaxDN
+    TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes);
+    TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes);
+
+    // GM aliases (mij/lij output buffers)
+    GlobalScalarND mijGlobalND(mij_addr);
+    GlobalScalarDN lijGlobalDN(lij_addr);
+
+    // ======== Pass 1: Find global row max via TRESHAPE (no GM round-trip) ========
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalDataMxN sijGlobal(sij_base + i * M * N);
+        TLOAD(sijTile_A, sijGlobal);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn sijDynTile(static_cast<size_t>(valid_len_last));
+            TASSIGN(sijDynTile, 0x0);
+            TFILLPAD_INPLACE(sijPadTile_A, sijDynTile);
+            pipe_barrier(PIPE_V);
+        }
+
+        TMULS(sijTile_A, sijTile_A, scale_value);
+        pipe_barrier(PIPE_V);
+        TROWMAX(localMaxDN, sijTile_A, tmpTile);
+        pipe_barrier(PIPE_V);
+
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise TMAX
+        TRESHAPE(localMaxRow, localMaxDN);
+        if (i == 0) {
+            TMAX(globalMaxRow, localMaxRow, localMaxRow);
+        } else {
+            TMAX(globalMaxRow, globalMaxRow, localMaxRow);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+    // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB
+    TRESHAPE(globalMaxDN, globalMaxRow);
+
+    // Store final global max to mij for online_update to consume
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(mijGlobalND, globalMaxND);
+
+    // ======== Pass 2: Compute softmax with double-buffered sij ========
+    // globalMaxDN is already in UB from TRESHAPE — no reload needed.
+    // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD.
+    set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+
+    // Pre-load first sij tile into buffer A
+    GlobalDataMxN sijGlobal_0(sij_base);
+    TLOAD(sijTile_A, sijGlobal_0);
+
+    for (uint64_t i = 0; i < n_blocks; i++) {
+        GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N);
+
+        // Wait for current tile's TLOAD to complete
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // TFILLPAD on current buffer if last block with partial valid length
+        if (i == n_blocks - 1 && valid_len_last < static_cast<uint64_t>(N)) {
+            TileSijDyn curSijDyn(static_cast<size_t>(valid_len_last));
+            if (i % 2 == 0) {
+                TASSIGN(curSijDyn, 0x0);
+                TFILLPAD_INPLACE(sijPadTile_A, curSijDyn);
+            } else {
+                TASSIGN(curSijDyn, static_cast<int>(kDataBytes));
+                TFILLPAD_INPLACE(sijPadTile_B, curSijDyn);
+            }
+            pipe_barrier(PIPE_V);
+        }
+
+        // Compute on current buffer (select A or B based on iteration parity)
+        if (i % 2 == 0) {
+            TMULS(sijTile_A, sijTile_A, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN);
+        } else {
+            TMULS(sijTile_B, sijTile_B, scale_value);
+            pipe_barrier(PIPE_V);
+            TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN);
+        }
+        pipe_barrier(PIPE_V);
+        TEXP(pijTile, pijTile);
+        pipe_barrier(PIPE_V);
+        TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+        pipe_barrier(PIPE_V);
+        TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+
+        pipe_barrier(PIPE_V);
+        if (i == 0) {
+            TMULS(sumAccTile, pijTile, 1.0f);
+        } else {
+            TADD(sumAccTile, sumAccTile, pijTile);
+        }
+
+        // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile)
+        pipe_barrier(PIPE_V);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(pijGlobal, pijBf16Tile);
+
+        // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race)
+        if (i + 1 < n_blocks) {
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N);
+            if (i % 2 == 0) {
+                TLOAD(sijTile_B, sijGlobal_next);
+            } else {
+                TLOAD(sijTile_A, sijGlobal_next);
+            }
+        }
+    }
+
+    // Compute final row sum from accumulated pij values
+    pipe_barrier(PIPE_V);
+    TROWSUM(sumDN, sumAccTile, tmpTile);
+
+    // Store lij (total sum). mij already stored after Pass 1.
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(lijGlobalDN, sumDN);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t n_blocks = static_cast<uint64_t>(args[5]);
+    uint64_t valid_len_last = static_cast<uint64_t>(args[6]);
+
+    // sij_buf is 4D (1, 1, q_tile, n_blocks*block_size) to match qk's 4D output semantic.
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_buf->shapes[2]);
+
+    if (q_tile_size == 16) {
+        softmax_prepare_n_impl<16, 128>(sij_buf, pij_buf, mij, lij, scale_value, n_blocks, valid_len_last);
+    } else {
+        softmax_prepare_n_impl<64, 64>(sij_buf, pij_buf, mij, lij, scale_value, n_blocks, valid_len_last);
+    }
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..fb5b2015e
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Paged Attention Orchestration - 4D input shapes, N_UNROLL=64, 4 Tasks Per Group
+ *
+ * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks:
+ *   1. QK matmul:  qi @ K^T for n_blocks → sij_buf (1, 1, q_tile, n_blocks * block_size)
+ *   2. Softmax:    two-pass over sij_buf → pij_buf, mi, li
+ *   3. PV matmul:  SplitK accumulated P @ V → oi_new (1, 1, q_tile, head_dim)
+ *   4. Update:     online softmax accumulation with group-level mi, li, oi_new
+ *
+ * Memory Layout (4D throughout):
+ *   Query: (batch, seq_len=1, num_heads, head_dim) bf16
+ *   Key:   (total_blocks, block_size, kv_head_num, head_dim) bf16
+ *   Value: (total_blocks, block_size, kv_head_num, head_dim) bf16
+ *   Out:   (batch, seq_len=1, num_heads, head_dim) fp32
+ */
+
+#include <cstdint>
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+#define N_UNROLL 64
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+
+extern "C" {
+/**
+ * Orchestration config — the executor reads these values to set up
+ * shared memory and runtime before calling aicpu_orchestration_entry.
+ */
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    // Read dimensions from tensor metadata
+    // query: shape=[batch, seq_len, num_heads, head_dim]
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[2];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[3];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim]
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+
+    // block_table: shape=[batch, max_num_blocks_per_req]
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    // scale from scalar arg
+    uint64_t scale_value = orch_args.scalar(0);
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (num_heads + q_tile - 1) / q_tile;
+
+    // External 4D tensors inherit shape/dtype from TaskArg (golden provides 4D).
+    const Tensor &query = orch_args.tensor(0).ref();
+    const Tensor &key_cache = orch_args.tensor(1).ref();
+    const Tensor &value_cache = orch_args.tensor(2).ref();
+    const Tensor &block_table = orch_args.tensor(3).ref();
+    const Tensor &out = orch_args.tensor(5).ref();
+
+    int *host_context_lens = orch_args.tensor(4).ref().data_as<int>();
+
+    // Loop-invariant shape descriptors: 4D data tiles (1, 1, q_tile, head_dim),
+    // 3D scalar vectors (1, 1, q_tile).
+    uint32_t tile4d_shapes[4] = {1, 1, (uint32_t)q_tile, (uint32_t)head_dim};
+    uint32_t scalar_shapes[3] = {1, 1, (uint32_t)q_tile};
+    TensorCreateInfo tile4d_ci(tile4d_shapes, 4, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(scalar_shapes, 3, DataType::FLOAT32);
+
+    // Prefetch first block host_context_lens data into cache
+    __builtin_prefetch(&host_context_lens[0], 0, 3);
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint64_t cur_seq = host_context_lens[b_idx];
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+
+        // Prefetch next block host_context_lens data while processing current batch
+        if (b_idx + 1 < batch) {
+            __builtin_prefetch(&host_context_lens[b_idx + 1], 0, 3);
+        }
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            PTO2_SCOPE() {
+                // 4D views into query/out, matching (1, 1, q_tile, head_dim).
+                uint32_t view_shapes[4] = {1, 1, (uint32_t)q_tile, (uint32_t)head_dim};
+                uint32_t view_offsets[4] = {(uint32_t)b_idx, 0, (uint32_t)(q_idx * q_tile), 0};
+                Tensor qi = query.view(view_shapes, view_offsets);
+                Tensor out_view = out.view(view_shapes, view_offsets, true);
+
+                // Per-group accumulators: oi (4D data), mi_update/li_update (3D scalars).
+                TaskOutputTensors alloc_outs = alloc_tensors(tile4d_ci, scalar_ci, scalar_ci);
+                const Tensor &oi = alloc_outs.get_ref(0);
+                const Tensor &li_update = alloc_outs.get_ref(1);
+                const Tensor &mi_update = alloc_outs.get_ref(2);
+
+                // Reusable Arg objects — reset() before each use avoids
+                // repeated stack-frame construction in the inner loop.
+                L0TaskArgs params_qk, params_sf, params_pv, params_up;
+
+                for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) {
+                    uint64_t n_blocks = std::min((uint64_t)N_UNROLL, bn_this_batch - bn);
+
+                    // Valid length for last block in this group
+                    uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size;
+                    uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start);
+
+                    // === Task 1: Batched QK matmul — produces 4D sij_buf ===
+                    uint32_t sij_buf_shapes[4] = {1, 1, (uint32_t)q_tile, (uint32_t)(n_blocks * block_size)};
+                    TensorCreateInfo sij_buf_ci(sij_buf_shapes, 4, DataType::FLOAT32);
+
+                    params_qk.reset();
+                    params_qk.add_input(qi);
+                    params_qk.add_input(key_cache);
+                    params_qk.add_input(block_table);
+                    params_qk.add_output(sij_buf_ci);
+                    params_qk.add_scalar(n_blocks);
+                    params_qk.add_scalar(b_idx * block_num + bn);
+                    TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+                    const Tensor &sij_buf = qk_outs.get_ref(0);
+
+                    // === Task 2: Two-pass softmax — produces 4D pij_buf, 3D mi, li ===
+                    uint32_t pij_buf_shapes[4] = {1, 1, (uint32_t)q_tile, (uint32_t)(n_blocks * block_size)};
+                    TensorCreateInfo pij_buf_ci(pij_buf_shapes, 4, data_type);
+
+                    params_sf.reset();
+                    params_sf.add_input(sij_buf);
+                    params_sf.add_output(pij_buf_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_scalar(scale_value);
+                    params_sf.add_scalar(n_blocks);
+                    params_sf.add_scalar(valid_len_last);
+                    TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+                    const Tensor &pij_buf = sf_outs.get_ref(0);
+                    const Tensor &mi = sf_outs.get_ref(1);
+                    const Tensor &li = sf_outs.get_ref(2);
+
+                    // === Task 3: SplitK PV matmul — produces 4D oi_new ===
+                    params_pv.reset();
+                    params_pv.add_input(pij_buf);
+                    params_pv.add_input(value_cache);
+                    params_pv.add_input(block_table);
+                    params_pv.add_output(tile4d_ci);
+                    params_pv.add_scalar(n_blocks);
+                    params_pv.add_scalar(b_idx * block_num + bn);
+                    TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+                    const Tensor &oi_new = pv_outs.get_ref(0);
+
+                    // === Task 4: Online update (per-group) ===
+                    uint64_t is_first = (bn == 0) ? 1 : 0;
+                    uint64_t is_last = (bn + n_blocks >= bn_this_batch) ? 1 : 0;
+
+                    params_up.reset();
+                    params_up.add_input(mi);
+                    params_up.add_input(li);
+                    params_up.add_input(oi_new);
+                    params_up.add_inout(mi_update);
+                    params_up.add_inout(li_update);
+                    params_up.add_inout(oi);
+                    params_up.add_inout(out_view);
+                    params_up.add_scalar(is_first);
+                    params_up.add_scalar(is_last);
+                    rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                }
+            }
+        }
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/test_paged_attention_unroll_4dims.py b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/test_paged_attention_unroll_4dims.py
new file mode 100644
index 000000000..1ef0dffec
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/test_paged_attention_unroll_4dims.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention unroll with 4D input shapes (batch, seq_len, num_heads, head_dim).
+
+Query and output tensors use 4D format instead of the standard 3D.
+6 kernels: QK/PV matmul (AIC), softmax_prepare/online_update (AIV).
+Orchestration with N_UNROLL=64, 4 tasks per group, online softmax accumulation.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttentionUnroll4dims(SceneTestCase):
+    """Paged attention unroll with 4D query/out shapes."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+            "manual": True,
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+            "manual": True,
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        batch = params["batch"]
+        num_heads = params["num_heads"]
+        head_dim = params["head_dim"]
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                if name in ("query", "out"):
+                    val = val.reshape(batch, 1, num_heads, head_dim)
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        batch = params["batch"]
+        num_heads = params["num_heads"]
+        head_dim = params["head_dim"]
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        # Reshape 4D out to 3D for shared golden, then restore
+        out_4d = tensors["out"]
+        tensors["out"] = out_4d.reshape(batch, num_heads, head_dim)
+        _pa_compute_golden(tensors, params)
+        tensors["out"] = out_4d
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/prepared_callable/conftest.py b/tests/st/a2a3/fully_distributed_within_core/prepared_callable/conftest.py
new file mode 100644
index 000000000..7395557fb
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/prepared_callable/conftest.py
@@ -0,0 +1,58 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Isolated L2 worker for prepared_callable white-box tests.
+
+The default ``st_worker`` (root conftest) is shared across L2 ST classes
+in a session-scoped pool — correct for ordinary business tests but not
+for prepared_callable, which asserts on the worker's internal handle table
+(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, SO cache hits). Sharing the worker breaks those
+assertions: other tests' prepared handles leave residue in the
+worker identity table.
+
+Override ``st_worker`` here as class-scope, building a fresh L2 worker
+that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close
+per prepared_callable test class.
+
+The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/
+host_build_graph) share identical conftest content — keep them in sync.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+
+@pytest.fixture(scope="class")
+def st_worker(request, st_platform, device_pool):
+    cls = request.node.cls
+    if cls is None or not hasattr(cls, "_st_runtime"):
+        pytest.skip("isolated st_worker requires a SceneTestCase subclass")
+
+    runtime = cls._st_runtime
+
+    ids = device_pool.allocate(1)
+    if not ids:
+        pytest.fail("no devices available for isolated L2 worker")
+    dev_id = ids[0]
+    try:
+        from simpler.worker import Worker  # noqa: PLC0415
+
+        w = Worker(
+            level=2,
+            device_id=dev_id,
+            platform=st_platform,
+            runtime=runtime,
+        )
+        w.init()
+        try:
+            yield w
+        finally:
+            w.close()
+    finally:
+        device_pool.release(ids)
diff --git a/tests/st/a2a3/fully_distributed_within_core/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/fully_distributed_within_core/prepared_callable/test_prepared_callable.py
new file mode 100644
index 000000000..f18c66569
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/prepared_callable/test_prepared_callable.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""End-to-end white-box test for the private L2 prepared-callable ABI.
+
+Reuses the vector_example orchestration + AIV kernels. Exercises:
+  - prepare one private callable slot, then run twice (second run proves the
+    AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload).
+  - Two distinct private callable slots sharing the same callable: verifies both
+    produce correct output independently.
+  - private slot unregister after runs complete: should not raise.
+  - aicpu_dlopen_count assertions covering: same-slot repeat, multi-slot
+    interleaving, double-prepare rejection, and unregister + re-prepare.
+"""
+
+import pytest
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs
+
+_VECTOR_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+_SLOT_PRIMARY = 0
+_SLOT_SECONDARY = 1
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPreparedCallable(SceneTestCase):
+    """Exercise private prepare / run / unregister slot ABI.
+
+    Requires an isolated L2 ``Worker`` (private slot table starts empty); this is
+    provided by the directory-local ``conftest.py`` overriding ``st_worker``
+    with a class-scope fixture.
+    """
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3}
+    _PLATFORMS = ["a2a3sim", "a2a3"]
+
+    CASES = [
+        {
+            "name": "prepare_run_twice",
+            "platforms": _PLATFORMS,
+            "config": _COMMON_CONFIG,
+            "params": {"a": 2.0, "b": 3.0},
+        },
+    ]
+
+    def generate_args(self, params):
+        size = 128 * 128
+        a, b = params["a"], params["b"]
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((size,), a, dtype=torch.float32)),
+            Tensor("b", torch.full((size,), b, dtype=torch.float32)),
+            Tensor("f", torch.zeros(size, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def _chip_worker(self, worker):
+        chip_worker = worker._chip_worker
+        assert chip_worker is not None
+        return chip_worker
+
+    def _run_and_validate_l2(  # noqa: PLR0913
+        self,
+        worker,
+        callable_obj,
+        case,
+        rounds=1,
+        skip_golden=False,
+        enable_l2_swimlane=False,
+        enable_dump_args=False,
+        enable_pmu=0,
+        enable_dep_gen=False,
+        enable_scope_stats=False,
+        output_prefix="",
+    ):
+        params = case.get("params", {})
+        config_dict = case.get("config", {})
+        orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", [])
+
+        config = self._build_config(config_dict)
+        chip_worker = self._chip_worker(worker)
+
+        # 1) prepare two private slots with the SAME callable.
+        chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+        chip_worker._prepare_callable_at_slot(_SLOT_SECONDARY, callable_obj)
+
+        # 2) run primary slot twice (second run proves dedup/cache hit)
+        for _ in range(2):
+            test_args = self.generate_args(params)
+            chip_args, output_names = _build_chip_task_args(test_args, orch_sig)
+            golden_args = test_args.clone()
+            self.compute_golden(golden_args, params)
+
+            chip_worker._run_slot(_SLOT_PRIMARY, chip_args, config=config)
+            _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
+
+        # 3) run secondary slot — different slot, same SO, must also work.
+        test_args = self.generate_args(params)
+        chip_args, output_names = _build_chip_task_args(test_args, orch_sig)
+        golden_args = test_args.clone()
+        self.compute_golden(golden_args, params)
+
+        chip_worker._run_slot(_SLOT_SECONDARY, chip_args, config=config)
+        _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
+
+        # 4) unregister both — should not raise.
+        chip_worker._unregister_slot(_SLOT_PRIMARY)
+        chip_worker._unregister_slot(_SLOT_SECONDARY)
+
+    # ------------------------------------------------------------------
+    # aicpu_dlopen_count assertions.
+    #
+    # The class-scope L2 worker is shared across test methods in this
+    # class (see ./conftest.py), so the counter can be non-zero on entry
+    # from prior methods. Each test below snapshots the counter on entry,
+    # asserts the *delta* introduced by the scenario, then unregisters
+    # everything it staged. unregister does NOT decrement the
+    # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare).
+    # ------------------------------------------------------------------
+
+    def _setup_dlopen_count_test(self, st_worker, st_platform):
+        """Common fixture: build callable + config, return (callable, config, case)."""
+        case = self.CASES[0]
+        callable_obj = self.build_callable(st_platform)
+        config = self._build_config(case["config"])
+        return callable_obj, config, case
+
+    def _run_one(self, worker, slot, config, case):
+        params = case["params"]
+        orch_sig = self.CALLABLE["orchestration"]["signature"]
+        test_args = self.generate_args(params)
+        chip_args, output_names = _build_chip_task_args(test_args, orch_sig)
+        golden_args = test_args.clone()
+        self.compute_golden(golden_args, params)
+        self._chip_worker(worker)._run_slot(slot, chip_args, config=config)
+        _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
+
+    def test_dlopen_count_same_slot_repeated_runs(self, st_platform, st_worker):
+        """Case A: prepare(primary) + run x5 -> dlopen_count delta == 1."""
+        callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform)
+        baseline = st_worker.aicpu_dlopen_count
+        prepared = False
+        chip_worker = self._chip_worker(st_worker)
+        try:
+            chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+            prepared = True
+            for _ in range(5):
+                self._run_one(st_worker, _SLOT_PRIMARY, config, case)
+            assert st_worker.aicpu_dlopen_count - baseline == 1, (
+                f"expected exactly 1 new dlopen for 5 runs of primary slot, "
+                f"got delta {st_worker.aicpu_dlopen_count - baseline}"
+            )
+        finally:
+            if prepared:
+                chip_worker._unregister_slot(_SLOT_PRIMARY)
+
+    def test_dlopen_count_two_slots_alternating(self, st_platform, st_worker):
+        """Case B: prepare(primary)+prepare(secondary) + alternating runs x5 -> delta == 2."""
+        callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform)
+        baseline = st_worker.aicpu_dlopen_count
+        primary_prepared = False
+        secondary_prepared = False
+        chip_worker = self._chip_worker(st_worker)
+        try:
+            chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+            primary_prepared = True
+            chip_worker._prepare_callable_at_slot(_SLOT_SECONDARY, callable_obj)
+            secondary_prepared = True
+            for _ in range(5):
+                self._run_one(st_worker, _SLOT_PRIMARY, config, case)
+                self._run_one(st_worker, _SLOT_SECONDARY, config, case)
+            assert st_worker.aicpu_dlopen_count - baseline == 2, (
+                f"expected exactly 2 new dlopens for two slots interleaved, "
+                f"got delta {st_worker.aicpu_dlopen_count - baseline}"
+            )
+        finally:
+            if secondary_prepared:
+                chip_worker._unregister_slot(_SLOT_SECONDARY)
+            if primary_prepared:
+                chip_worker._unregister_slot(_SLOT_PRIMARY)
+
+    def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker):
+        """Case C: prepare(primary) + prepare(primary) -> second call raises RuntimeError."""
+        callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform)
+        prepared = False
+        chip_worker = self._chip_worker(st_worker)
+        try:
+            chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+            prepared = True
+            with pytest.raises(RuntimeError):
+                chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+        finally:
+            if prepared:
+                chip_worker._unregister_slot(_SLOT_PRIMARY)
+
+    def test_dedup_shared_so_independent_unregister(self, st_platform, st_worker):
+        """Case E: two slots on the same ChipCallable share one device orch SO buffer.
+
+        Build-ID-keyed dedup in DeviceRunner refcounts the buffer; unregistering
+        one slot must not invalidate the other. Run-after-unregister proves the
+        shared buffer is still alive (a missing refcount would either crash or
+        produce incorrect results when the second slot dispatches into a freed
+        device region).
+        """
+        callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform)
+        primary_prepared = False
+        secondary_prepared = False
+        chip_worker = self._chip_worker(st_worker)
+        try:
+            chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+            primary_prepared = True
+            chip_worker._prepare_callable_at_slot(_SLOT_SECONDARY, callable_obj)
+            secondary_prepared = True
+            # Sanity: both slots work before any unregister.
+            self._run_one(st_worker, _SLOT_PRIMARY, config, case)
+            self._run_one(st_worker, _SLOT_SECONDARY, config, case)
+            # Drop primary; secondary's run must still succeed because the
+            # dedup refcount is still > 0.
+            chip_worker._unregister_slot(_SLOT_PRIMARY)
+            primary_prepared = False
+            self._run_one(st_worker, _SLOT_SECONDARY, config, case)
+        finally:
+            if secondary_prepared:
+                chip_worker._unregister_slot(_SLOT_SECONDARY)
+            if primary_prepared:
+                chip_worker._unregister_slot(_SLOT_PRIMARY)
+
+    def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker):
+        """Case D: prepare+run+unregister+prepare+run -> delta == 2.
+
+        unregister erases the slot from aicpu_seen_callable_ids_, so the second
+        prepare/run pair sets register_new_callable_id_ again and the AICPU
+        does a fresh dlopen. The counter is monotonic (does NOT decrement on
+        unregister), so the delta after the second cycle is 2.
+        """
+        callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform)
+        baseline = st_worker.aicpu_dlopen_count
+        prepared = False
+        chip_worker = self._chip_worker(st_worker)
+        try:
+            chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+            prepared = True
+            self._run_one(st_worker, _SLOT_PRIMARY, config, case)
+            assert st_worker.aicpu_dlopen_count - baseline == 1
+            chip_worker._unregister_slot(_SLOT_PRIMARY)
+            prepared = False
+            after_unreg = st_worker.aicpu_dlopen_count
+            assert after_unreg - baseline == 1, (
+                f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}"
+            )
+            chip_worker._prepare_callable_at_slot(_SLOT_PRIMARY, callable_obj)
+            prepared = True
+            self._run_one(st_worker, _SLOT_PRIMARY, config, case)
+            assert st_worker.aicpu_dlopen_count - baseline == 2, (
+                f"after re-prepare expected counter +2 (two distinct AICPU dlopens), "
+                f"got delta {st_worker.aicpu_dlopen_count - baseline}"
+            )
+        finally:
+            if prepared:
+                chip_worker._unregister_slot(_SLOT_PRIMARY)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aic/kernel_spmd_read.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aic/kernel_spmd_read.cpp
new file mode 100644
index 000000000..36e5b73c5
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aic/kernel_spmd_read.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Context Read Kernel (AIC version)
+ *
+ * Reads SPMD local context (block_idx, block_num) and writes values to
+ * cache line 0 of the shared output tensor.  AIC does not use
+ * get_sub_block_id (sub_block_id is only meaningful for AIV).
+ *
+ * Args:
+ *   args[0] = output Tensor* (OUTPUT, 48 float32 elements)
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+// Cache line = 64B = 16 float32.  Each slot owns one cache line.
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+
+// dcci + constants: CCEC provides these as builtins; provide fallbacks for sim.
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // AIC writes at fixed cache line 0 (no sub_block_id needed)
+    out[0] = static_cast<float>(get_block_idx(args));
+    out[1] = static_cast<float>(get_block_num(args));
+
+    // Flush this cache line to HBM so host can read the output.
+    dcci(&out[0], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aiv/kernel_spmd_read.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aiv/kernel_spmd_read.cpp
new file mode 100644
index 000000000..7c85dd37f
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aiv/kernel_spmd_read.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Context Read Kernel (AIV version)
+ *
+ * Reads SPMD context via Get* accessors and writes values to the shared
+ * output tensor.  AIV uses get_sub_block_id to determine its lane (0=left,
+ * 1=right) and writes at cache line (1 + sub_block_id) to avoid
+ * overlapping with the AIC slot at cache line 0.
+ *
+ * Args:
+ *   args[0] = output Tensor* (OUTPUT, 48 float32 elements)
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+// Cache line = 64B = 16 float32.  Each slot owns one cache line.
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+
+// dcci + constants: CCEC provides these as builtins; provide fallbacks for sim.
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int32_t sub_block_id = get_sub_block_id(args);
+    // AIV writes at cache line (1 + sub_block_id), skipping AIC's cache line 0
+    int32_t offset = (1 + sub_block_id) * FLOATS_PER_CACHE_LINE;
+
+    out[offset + 0] = static_cast<float>(get_block_idx(args));
+    out[offset + 1] = static_cast<float>(get_block_num(args));
+    out[offset + 2] = static_cast<float>(sub_block_id);
+
+    // Flush this cache line to HBM so host can read the output.
+    dcci(&out[offset], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp
new file mode 100644
index 000000000..6b92c3b1f
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Basic Orchestration
+ *
+ * Submits a single MIX task (AIC + AIV0 + AIV1) with a shared output
+ * tensor. Each subtask writes its SPMD context at a sub_block_id-based
+ * offset, so the host can verify all three slots independently.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_READ_AIC 0
+#define FUNC_SPMD_READ_AIV0 1
+#define FUNC_SPMD_READ_AIV1 2
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_SPMD_READ_AIC;
+    mk.aiv0_kernel_id = FUNC_SPMD_READ_AIV0;
+    mk.aiv1_kernel_id = FUNC_SPMD_READ_AIV1;
+
+    L0TaskArgs args;
+    args.add_inout(ext_output);
+
+    rt_submit_task(mk, args);
+
+    LOG_INFO_V9("[spmd_basic_orch] Submitted 1 MIX task (AIC+AIV0+AIV1)");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_basic/test_spmd_basic.py b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/test_spmd_basic.py
new file mode 100644
index 000000000..bdb99ed01
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_basic/test_spmd_basic.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD basic context accessors: single MIX task verifying block_idx, block_num, sub_block_id.
+
+Submits one MIX task (AIC + AIV0 + AIV1) with block_dim=1.
+Each subtask writes its SPMD context at a sub_block_id-based offset.
+
+Output layout (float32[48], 3 cache lines):
+  [0..15]  = AIC  slot: [block_idx, block_num, pad x14]
+  [16..31] = AIV0 slot: [block_idx, block_num, sub_block_id=0, pad x13]
+  [32..47] = AIV1 slot: [block_idx, block_num, sub_block_id=1, pad x13]
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdBasic(SceneTestCase):
+    """SPMD context accessors with a single MIX task."""
+
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_basic_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_READ_AIC",
+                "source": "kernels/aic/kernel_spmd_read.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {"func_id": 1, "name": "SPMD_READ_AIV0", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+            {"func_id": 2, "name": "SPMD_READ_AIV1", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        },
+        {
+            # Exercises the CallConfig block_dim=0 "auto" path: scene_test
+            # omits block_dim, so DeviceRunner resolves it to the stream's
+            # max (PLATFORM_MAX_BLOCKDIM on sim, aclrtGetStreamResLimit on
+            # onboard). The SPMD task itself is block_num=1, so the golden
+            # is identical to Case1 regardless of how many workers exist.
+            "name": "Case2_AutoBlockDim",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        output = torch.zeros(3 * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        out[0] = 0.0
+        out[1] = 1.0
+        base = 1 * FLOATS_PER_CACHE_LINE
+        out[base + 0] = 0.0
+        out[base + 1] = 1.0
+        out[base + 2] = 0.0
+        base = 2 * FLOATS_PER_CACHE_LINE
+        out[base + 0] = 0.0
+        out[base + 1] = 1.0
+        out[base + 2] = 1.0
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aic/kernel_write.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aic/kernel_write.cpp
new file mode 100644
index 000000000..e9fb32715
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aic/kernel_write.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * AIC kernel: writes float(block_idx) at cache line (base_cl + block_idx*3 + 0).
+ *
+ * Args:
+ *   args[0] = output Tensor* (INOUT)
+ *   args[1] = scalar: base_cl
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+static constexpr int32_t SLOTS_PER_BLOCK = 3;
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int32_t base_cl = static_cast<int32_t>(args[1]);
+    int32_t block_idx = get_block_idx(args);
+    int32_t offset = (base_cl + block_idx * SLOTS_PER_BLOCK + 0) * FLOATS_PER_CACHE_LINE;
+
+    out[offset] = static_cast<float>(block_idx);
+
+    dcci(&out[offset], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aiv/kernel_write.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aiv/kernel_write.cpp
new file mode 100644
index 000000000..1a7fe065b
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aiv/kernel_write.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * AIV kernel: writes float(block_idx) at cache line
+ *   (base_cl + block_idx*3 + 1 + sub_block_id).
+ *
+ * Args:
+ *   args[0] = output Tensor* (INOUT)
+ *   args[1] = scalar: base_cl
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+static constexpr int32_t SLOTS_PER_BLOCK = 3;
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int32_t base_cl = static_cast<int32_t>(args[1]);
+    int32_t block_idx = get_block_idx(args);
+    int32_t sub_block_id = get_sub_block_id(args);
+    int32_t offset = (base_cl + block_idx * SLOTS_PER_BLOCK + 1 + sub_block_id) * FLOATS_PER_CACHE_LINE;
+
+    out[offset] = static_cast<float>(block_idx);
+
+    dcci(&out[offset], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/orchestration/spmd_batch_dispatch_oob_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/orchestration/spmd_batch_dispatch_oob_orch.cpp
new file mode 100644
index 000000000..47cd60ce7
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/orchestration/spmd_batch_dispatch_oob_orch.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Regression test for batch dispatch OOB (issue #565).
+ *
+ * Submits two MIX tasks with block_num=48 back-to-back so they are both
+ * in the ready queue when the scheduler runs pop_ready_tasks_batch.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_AIC 0
+#define FUNC_AIV0 1
+#define FUNC_AIV1 2
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_spmd_mix(const Tensor &out, int16_t block_num, int64_t base_cl) {
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_AIC;
+    mk.aiv0_kernel_id = FUNC_AIV0;
+    mk.aiv1_kernel_id = FUNC_AIV1;
+
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    rt_submit_task(mk, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    // Two back-to-back tasks with block_num=48 (2x cluster count).
+    // Both land in the ready queue simultaneously, triggering got=2 in
+    // pop_ready_tasks_batch — the scenario that causes OOB without the fix.
+    submit_spmd_mix(ext_output, 48, 0);
+    submit_spmd_mix(ext_output, 48, 144);
+
+    LOG_INFO_V9("[spmd_batch_dispatch_oob] Submitted 2 MIX tasks: block_num=48,48");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/test_spmd_batch_dispatch_oob.py b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/test_spmd_batch_dispatch_oob.py
new file mode 100644
index 000000000..e89a1734b
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/test_spmd_batch_dispatch_oob.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Regression test for batch dispatch OOB (issue #565).
+
+Submits two back-to-back MIX tasks each with block_num=48 (>> 24 clusters).
+When both tasks enter the ready queue simultaneously, pop_ready_tasks_batch
+returns got=2.  Without the fix, the first task's do-while drains all idle
+clusters, and the second task's do-while calls pop_first() on an empty mask,
+returning -1 as cluster_offset — an out-of-bounds array index.
+
+Each block writes float(block_idx) at 3 cache lines (AIC, AIV0, AIV1).
+Output tensor: 2 * 48 * 3 = 288 cache lines = 4608 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+SLOTS_PER_BLOCK = 3
+TASKS = [(48, 0), (48, 144)]
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdBatchDispatchOob(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_batch_dispatch_oob_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/kernel_write.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {"func_id": 1, "source": "kernels/aiv/kernel_write.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "kernels/aiv/kernel_write.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp
new file mode 100644
index 000000000..cad4aae85
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Multi-Block Write Kernel (AIV)
+ *
+ * Each block writes float(block_idx) at a cacheline-aligned offset
+ * determined by scalar parameter base_cl:
+ *
+ *   out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+ *
+ * Args:
+ *   args[0] = output Tensor* (INOUT)
+ *   args[1] = scalar: base_cl (starting cache line index for this task)
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int32_t base_cl = static_cast<int32_t>(args[1]);
+    int32_t block_idx = get_block_idx(args);
+    int32_t offset = (base_cl + block_idx) * FLOATS_PER_CACHE_LINE;
+
+    out[offset] = static_cast<float>(block_idx);
+
+    dcci(&out[offset], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp
new file mode 100644
index 000000000..188317cb9
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Multi-Block AIV Orchestration
+ *
+ * Submits three AIV tasks with increasing block_num to exercise:
+ *   T0: block_num=4   — fits within a single sched thread
+ *   T1: block_num=16  — saturates one sched thread (8 clusters × 2 AIV)
+ *   T2: block_num=24  — forces cross-thread re-push via ready_queue
+ *
+ * Each task writes to a disjoint region of the output tensor using the
+ * base_cl scalar to offset the block writes.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_WRITE_AIV 0
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_spmd_aiv(int32_t kernel_id, const Tensor &out, int16_t block_num, int64_t base_cl) {
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    rt_submit_aiv_task(kernel_id, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    // T0: 4 blocks — basic multi-block
+    submit_spmd_aiv(FUNC_SPMD_WRITE_AIV, ext_output, 4, 0);
+
+    // T1: 16 blocks — saturate one sched thread's AIV cores (8 clusters × 2 AIV)
+    submit_spmd_aiv(FUNC_SPMD_WRITE_AIV, ext_output, 16, 4);
+
+    // T2: 24 blocks — cross-thread dispatch via ready_queue re-push
+    submit_spmd_aiv(FUNC_SPMD_WRITE_AIV, ext_output, 24, 20);
+
+    // T3: 48 blocks — occupy all AIV cores across all 3 sched threads
+    submit_spmd_aiv(FUNC_SPMD_WRITE_AIV, ext_output, 48, 44);
+
+    // T4: 96 blocks — two full rounds of all AIV cores
+    submit_spmd_aiv(FUNC_SPMD_WRITE_AIV, ext_output, 96, 92);
+
+    LOG_INFO_V9("[spmd_multiblock_aiv] Submitted 5 AIV tasks: block_num=4,16,24,48,96");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
new file mode 100644
index 000000000..3613d66b5
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD multi-block AIV: five AIV tasks with block_num = 4, 16, 24, 48, 96.
+
+Each block writes float(block_idx) at cache line (base_cl + block_idx).
+Output tensor: 188 cache lines = 3008 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+TASKS = [(4, 0), (16, 4), (24, 20), (48, 44), (96, 92)]
+TOTAL_CL = sum(bn for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdMultiblockAiv(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_multiblock_aiv_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_WRITE_AIV",
+                "source": "kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+                # Declare the single output tensor so the tensor dump (which
+                # sums per-subtask signature tensors and matches them to the
+                # payload) captures it under func_id 0 — without it the count is
+                # 0 != payload 1 and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp
new file mode 100644
index 000000000..6fda6aa0e
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Multi-Block MIX Kernel (AIC version)
+ *
+ * AIC writes float(block_idx) at cache line (base_cl + block_idx * 3 + 0).
+ *
+ * Args:
+ *   args[0] = output Tensor* (INOUT)
+ *   args[1] = scalar: base_cl (starting cache line index for this task)
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+static constexpr int32_t SLOTS_PER_BLOCK = 3;
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int32_t base_cl = static_cast<int32_t>(args[1]);
+    int32_t block_idx = get_block_idx(args);
+    int32_t offset = (base_cl + block_idx * SLOTS_PER_BLOCK + 0) * FLOATS_PER_CACHE_LINE;
+
+    out[offset] = static_cast<float>(block_idx);
+
+    dcci(&out[offset], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp
new file mode 100644
index 000000000..9eaddfdec
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Multi-Block MIX Kernel (AIV version)
+ *
+ * AIV writes float(block_idx) at cache line
+ *   (base_cl + block_idx * 3 + 1 + sub_block_id)
+ * where sub_block_id is 0 for AIV0 and 1 for AIV1.
+ *
+ * Args:
+ *   args[0] = output Tensor* (INOUT)
+ *   args[1] = scalar: base_cl (starting cache line index for this task)
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#include "intrinsic.h"
+
+static constexpr int32_t FLOATS_PER_CACHE_LINE = 16;
+static constexpr int32_t SLOTS_PER_BLOCK = 3;
+
+#ifdef PTO_CPUSTUB_HPP
+#define dcci(...) \
+    do {          \
+    } while (0)
+#endif
+#ifndef SINGLE_CACHE_LINE
+#define SINGLE_CACHE_LINE 0
+#endif
+#ifndef CACHELINE_OUT
+#define CACHELINE_OUT 0
+#endif
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    int32_t base_cl = static_cast<int32_t>(args[1]);
+    int32_t block_idx = get_block_idx(args);
+    int32_t sub_block_id = get_sub_block_id(args);
+    int32_t offset = (base_cl + block_idx * SLOTS_PER_BLOCK + 1 + sub_block_id) * FLOATS_PER_CACHE_LINE;
+
+    out[offset] = static_cast<float>(block_idx);
+
+    dcci(&out[offset], SINGLE_CACHE_LINE, CACHELINE_OUT);
+}
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp
new file mode 100644
index 000000000..bf875a157
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Multi-Block MIX Orchestration
+ *
+ * Submits three MIX tasks (AIC + AIV0 + AIV1) with increasing block_num:
+ *   T0: block_num=2   — basic multi-block MIX
+ *   T1: block_num=8   — saturates one sched thread (8 clusters)
+ *   T2: block_num=12  — forces cross-thread re-push via ready_queue
+ *
+ * Each task writes to a disjoint region of the output tensor using the
+ * base_cl scalar to offset the block writes.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_MIX_AIC 0
+#define FUNC_SPMD_MIX_AIV0 1
+#define FUNC_SPMD_MIX_AIV1 2
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_spmd_mix(
+    int32_t aic_id, int32_t aiv0_id, int32_t aiv1_id, const Tensor &out, int16_t block_num, int64_t base_cl
+) {
+    MixedKernels mk;
+    mk.aic_kernel_id = aic_id;
+    mk.aiv0_kernel_id = aiv0_id;
+    mk.aiv1_kernel_id = aiv1_id;
+
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    rt_submit_task(mk, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    // T0: 2 blocks (6 CL) — basic multi-block MIX
+    submit_spmd_mix(FUNC_SPMD_MIX_AIC, FUNC_SPMD_MIX_AIV0, FUNC_SPMD_MIX_AIV1, ext_output, 2, 0);
+
+    // T1: 8 blocks (24 CL) — saturate one sched thread's clusters
+    submit_spmd_mix(FUNC_SPMD_MIX_AIC, FUNC_SPMD_MIX_AIV0, FUNC_SPMD_MIX_AIV1, ext_output, 8, 6);
+
+    // T2: 12 blocks (36 CL) — cross-thread dispatch via ready_queue re-push
+    submit_spmd_mix(FUNC_SPMD_MIX_AIC, FUNC_SPMD_MIX_AIV0, FUNC_SPMD_MIX_AIV1, ext_output, 12, 30);
+
+    // T3: 24 blocks (72 CL) — occupy all clusters across all 3 sched threads
+    submit_spmd_mix(FUNC_SPMD_MIX_AIC, FUNC_SPMD_MIX_AIV0, FUNC_SPMD_MIX_AIV1, ext_output, 24, 66);
+
+    // T4: 48 blocks (144 CL) — two full rounds of all clusters
+    submit_spmd_mix(FUNC_SPMD_MIX_AIC, FUNC_SPMD_MIX_AIV0, FUNC_SPMD_MIX_AIV1, ext_output, 48, 138);
+
+    LOG_INFO_V9("[spmd_multiblock_mix] Submitted 5 MIX tasks: block_num=2,8,12,24,48");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/test_spmd_multiblock_mix.py
new file mode 100644
index 000000000..7a4abebfa
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/test_spmd_multiblock_mix.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD multi-block MIX: five MIX tasks with block_num = 2, 8, 12, 24, 48.
+
+Each block occupies 3 cache lines (AIC, AIV0, AIV1).
+Output tensor: 282 cache lines = 4512 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+SLOTS_PER_BLOCK = 3
+TASKS = [(2, 0), (8, 6), (12, 30), (24, 66), (48, 138)]
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdMultiblockMix(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_multiblock_mix_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {"func_id": 1, "name": "SPMD_MIX_AIV0", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "name": "SPMD_MIX_AIV1", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/mix/paged_attention_parallel.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/mix/paged_attention_parallel.cpp
new file mode 100644
index 000000000..32e9e3302
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/mix/paged_attention_parallel.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Paged Attention MIX Kernel — AIC + AIV in single source via TPUSH/TPOP
+ *
+ * Hardware block_num is fixed at 24. Each hardware block strides over
+ * total_logical_blocks = batch * q_loop logical work items:
+ *   for (block_idx = hw_block_idx; block_idx < total_logical_blocks; block_idx += 24)
+ * Each logical block_idx encodes one (batch_idx, q_tile_idx) position.
+ *
+ * q_tile adapts to num_heads at runtime: q_tile = min(num_heads, MAX_Q_TILE).
+ * When num_heads <= MAX_Q_TILE, q_loop = 1 and each block processes all heads.
+ * Two q_tile shapes are statically dispatched: 16 (default) and 64.
+ *
+ * Compiled twice: once with __DAV_CUBE__ (AIC), once with __DAV_VEC__ (AIV).
+ * AIC and AIV cooperate via 3 GM-backed FIFO pipes (one set per hardware block,
+ * reused across stride-loop iterations):
+ *   - sij_pipe (C2V): QK scores    (Q_TILE, block_size) fp32, TILE_UP_DOWN
+ *   - pij_pipe (V2C): softmax probs (Q_TILE, block_size) bf16, TILE_UP_DOWN
+ *   - oi_pipe  (C2V): PV output    (Q_TILE, head_dim)   fp32, TILE_UP_DOWN
+ *
+ * Per-block pipeline:
+ *   AIC: QK matmul → TPUSH(sij) → TPOP(pij) → PV matmul → TPUSH(oi_new)
+ *   AIV: TPOP(sij) → online softmax → TPUSH(pij) → TPOP(oi_new) → online update
+ *
+ * MixedKernels args:
+ *   args[0]  = query         Tensor* (batch*num_heads, head_dim) bf16
+ *   args[1]  = key_cache     Tensor* (kv_total_rows, head_dim) bf16
+ *   args[2]  = value_cache   Tensor* (kv_total_rows, head_dim) bf16
+ *   args[3]  = block_table   Tensor* (batch, max_blocks_per_req) int32
+ *   args[4]  = context_lens  Tensor* (batch,) int32
+ *   args[5]  = out           Tensor* (batch*num_heads, head_dim) float32 [output]
+ *   args[6]  = sij_fifo      Tensor* GM ring buffer for sij pipe
+ *   args[7]  = pij_fifo      Tensor* GM ring buffer for pij pipe
+ *   args[8]  = oi_fifo       Tensor* GM ring buffer for oi_new pipe
+ *   args[9]  = scale_value   scalar (float bits in uint64)
+ *   args[10] = num_heads     scalar
+ *   args[11] = head_dim      scalar
+ *   args[12] = block_size    scalar
+ *   args[13] = max_num_blocks_per_req scalar
+ *   args[14] = q_loop        scalar
+ *   args[15] = total_logical_blocks scalar (= batch * q_loop)
+ *   args[16] = q_tile        scalar (16 or 64)
+ */
+
+#include <cstdint>
+// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+
+#include "tensor.h"
+
+using pto::BLayout;
+using pto::Direction;
+using pto::GlobalTensor;
+using pto::Layout;
+using pto::PadValue;
+using pto::RoundMode;
+using pto::Shape;
+using pto::SLayout;
+using pto::Stride;
+using pto::Tile;
+using pto::TileAcc;
+using pto::TileLeft;
+using pto::TileRight;
+using pto::TileSplitAxis;
+using pto::TileType;
+using pto::TPipe;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+#include "intrinsic.h"
+
+static constexpr int MAX_Q_TILE = 64;
+static constexpr int HEAD_DIM = 128;
+static constexpr int MAX_BLOCK_SIZE = 128;
+
+// TPUSH/TPOP pipe flag IDs (each consumes 2 consecutive IDs: data + backpressure)
+static constexpr uint16_t SIJ_FLAG_ID = 0;
+static constexpr uint16_t PIJ_FLAG_ID = 2;
+static constexpr uint16_t OI_FLAG_ID = 4;
+static constexpr uint8_t FIFO_DEPTH = 2;
+
+// Per-q_tile compile-time configuration: pipe types, slot sizes, UB/L1 layouts.
+// QT must be 16 or 64. SUB_QT = QT / 2 (each of AIV0/AIV1 handles half the rows).
+template <int QT>
+struct PAConfig {
+    static constexpr int Q_TILE = QT;
+    static constexpr int SUB_QT = QT / 2;
+
+    // GM FIFO slot sizes (full tile per slot, sized for max block_size to allow
+    // the same FIFO to host both block_size=64 and block_size=128 cases).
+    static constexpr uint32_t SIJ_SLOT_SIZE = QT * MAX_BLOCK_SIZE * sizeof(float);
+    static constexpr uint32_t PIJ_SLOT_SIZE = QT * MAX_BLOCK_SIZE * sizeof(bfloat16_t);
+    static constexpr uint32_t OI_SLOT_SIZE = QT * HEAD_DIM * sizeof(float);
+
+    using SijPipeT = TPipe<SIJ_FLAG_ID, Direction::DIR_C2V, SIJ_SLOT_SIZE, FIFO_DEPTH>;
+    using PijPipeT = TPipe<PIJ_FLAG_ID, Direction::DIR_V2C, PIJ_SLOT_SIZE, FIFO_DEPTH>;
+    using OiPipeT = TPipe<OI_FLAG_ID, Direction::DIR_C2V, OI_SLOT_SIZE, FIFO_DEPTH>;
+
+    // AIV UB consumer buffer layout (sized for SUB_QT rows per AIV lane)
+    static constexpr uint32_t SIJ_UB_BASE = 0x0;
+    static constexpr uint32_t SIJ_UB_SIZE = 2 * SUB_QT * MAX_BLOCK_SIZE * sizeof(float);
+    static constexpr uint32_t OI_UB_BASE = SIJ_UB_BASE + SIJ_UB_SIZE;
+    static constexpr uint32_t OI_UB_SIZE = 2 * SUB_QT * HEAD_DIM * sizeof(float);
+    static constexpr uint32_t WORK_UB_BASE = OI_UB_BASE + OI_UB_SIZE;
+
+    // AIC L1 consumer buffer for V2C pij pipe (full QT * MAX_BLOCK_SIZE rows)
+    static constexpr uint32_t PIJ_L1_BASE = 0x40000;
+    static constexpr uint32_t PIJ_L1_SIZE = 2 * QT * MAX_BLOCK_SIZE * sizeof(bfloat16_t);
+};
+
+// ============================================================================
+// AIC (Cube) processing — QK-first offset-loop software pipeline
+//
+// QK-first order: each steady-state iteration does QK[i] then PV[i-1].
+// This maximizes overlap by hiding AIV's softmax behind AIC's QK matmul:
+// while AIC computes QK[i], AIV concurrently processes SF[i-1].
+// By the time AIC finishes QK[i] and needs pij[i-1], SF[i-1] is done.
+// FIFO_DEPTH=2 supports the 2-deep sij buffering (sij[i-1] + sij[i]).
+//
+// Timeline (steady state):
+//   AIC:  QK[i] → TPUSH(sij[i]) → TPOP(pij[i-1]) → PV[i-1] → TPUSH(oi[i-1])
+//   AIV:  TPOP(sij[i-1]) → SF[i-1] → TPUSH(pij[i-1]) → TPOP(oi[i-2]) → UP[i-2]
+//   ──────────────────────────────────────────────────────────────────────────
+//   QK[i] overlaps with SF[i-1]   (Cube compute ∥ Vector softmax)
+//   PV[i-1] overlaps with UP[i-2] (Cube compute ∥ Vector online update)
+// ============================================================================
+
+// Helper: QK matmul for block i — load key, move to L0, matmul, TPUSH sij
+template <
+    int M, int K, int N, typename SijPipeT, typename GlobalB_QK, typename TileMatA_QK, typename TileMatB_QK,
+    typename LeftTile_QK, typename RightTile_QK, typename AccTile_QK>
+static __aicore__ void aic_qk_step(
+    __gm__ bfloat16_t *key_base, uint64_t kv_block_id, uint64_t i, TileMatA_QK &aMatTile_QK, TileMatB_QK &bMatTile_QK_A,
+    TileMatB_QK &bMatTile_QK_B, LeftTile_QK &aTile_QK, RightTile_QK &bTile_QK, AccTile_QK &cTile_QK, SijPipeT &sij_pipe,
+    bool current_loaded = false, bool has_next = false, uint64_t next_kv_block_id = 0
+) {
+    if (!current_loaded) {
+        GlobalB_QK kjGlobal(key_base + kv_block_id * N * K);
+        if (i % 2 == 0) {
+            TLOAD(bMatTile_QK_A, kjGlobal);
+        } else {
+            TLOAD(bMatTile_QK_B, kjGlobal);
+        }
+    }
+
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+    TMOV(aTile_QK, aMatTile_QK);
+    if (i % 2 == 0) {
+        TMOV(bTile_QK, bMatTile_QK_A);
+    } else {
+        TMOV(bTile_QK, bMatTile_QK_B);
+    }
+
+    if (has_next) {
+        GlobalB_QK kjGlobalNext(key_base + next_kv_block_id * N * K);
+        if ((i + 1) % 2 == 0) {
+            TLOAD(bMatTile_QK_A, kjGlobalNext);
+        } else {
+            TLOAD(bMatTile_QK_B, kjGlobalNext);
+        }
+    }
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+    TMATMUL(cTile_QK, aTile_QK, bTile_QK);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+    // TPUSH sij (C2V): AccTile L0C -> GM. Ensure prior MTE3 is done,
+    // then push, then wait for MTE3 DMA to complete before signaling consumer.
+    TPUSH<SijPipeT, AccTile_QK, TileSplitAxis::TILE_UP_DOWN>(sij_pipe, cTile_QK);
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    sij_pipe.prod.record();
+}
+
+// Helper: PV matmul for block i — TPOP pij, load value, move to L0, matmul, TPUSH oi
+template <
+    int M, int K, int N, typename PijPipeT, typename OiPipeT, typename GlobalB_PV, typename PijMatTile,
+    typename TileMatB_PV, typename LeftTile_PV, typename RightTile_PV, typename AccTile_PV>
+static __aicore__ void aic_pv_step(
+    __gm__ bfloat16_t *val_base, uint64_t kv_block_id, uint64_t i, PijMatTile &pijMatTile, TileMatB_PV &bMatTile_PV_A,
+    TileMatB_PV &bMatTile_PV_B, LeftTile_PV &aTile_PV, RightTile_PV &bTile_PV, AccTile_PV &cTile_PV, PijPipeT &pij_pipe,
+    OiPipeT &oi_pipe, bool current_loaded = false, bool has_next = false, uint64_t next_kv_block_id = 0
+) {
+    if (!current_loaded) {
+        GlobalB_PV vjGlobal(val_base + kv_block_id * N * K);
+        if (i % 2 == 0) {
+            TLOAD(bMatTile_PV_A, vjGlobal);
+        } else {
+            TLOAD(bMatTile_PV_B, vjGlobal);
+        }
+    }
+
+    TPOP<PijPipeT, PijMatTile, TileSplitAxis::TILE_NO_SPLIT>(pij_pipe, pijMatTile);
+
+    // PV step uses EVENT_ID1 (QK step uses EVENT_ID0) to avoid flag aliasing
+    // when pipe_barrier(PIPE_ALL) is removed between steps.
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+
+    TMOV(aTile_PV, pijMatTile);
+    if (i % 2 == 0) {
+        TMOV(bTile_PV, bMatTile_PV_A);
+    } else {
+        TMOV(bTile_PV, bMatTile_PV_B);
+    }
+
+    if (has_next) {
+        GlobalB_PV vjGlobalNext(val_base + next_kv_block_id * N * K);
+        if ((i + 1) % 2 == 0) {
+            TLOAD(bMatTile_PV_A, vjGlobalNext);
+        } else {
+            TLOAD(bMatTile_PV_B, vjGlobalNext);
+        }
+    }
+
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+
+    TMATMUL(cTile_PV, aTile_PV, bTile_PV);
+
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID1);
+
+    // TPUSH oi (C2V): AccTile L0C -> GM. Same manual record pattern as sij.
+    TPUSH<OiPipeT, AccTile_PV, TileSplitAxis::TILE_UP_DOWN>(oi_pipe, cTile_PV);
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    oi_pipe.prod.record();
+}
+
+template <typename Cfg, int K, int N>
+static __aicore__ void aic_process_blocks(
+    __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ bfloat16_t *val_base, __gm__ int32_t *bt,
+    uint64_t bt_offset, uint64_t n_blocks, typename Cfg::SijPipeT &sij_pipe, typename Cfg::PijPipeT &pij_pipe,
+    typename Cfg::OiPipeT &oi_pipe
+) {
+    constexpr int M = Cfg::Q_TILE;
+    using SijPipeT = typename Cfg::SijPipeT;
+    using PijPipeT = typename Cfg::PijPipeT;
+    using OiPipeT = typename Cfg::OiPipeT;
+
+    using GlobalA_QK = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB_QK = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using TileMatA_QK = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB_QK = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+    using LeftTile_QK = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile_QK = TileRight<bfloat16_t, K, N, K, N>;
+    using AccTile_QK = TileAcc<float, M, N, M, N>;
+
+    using GlobalB_PV = GlobalTensor<bfloat16_t, Shape<1, 1, 1, N, K>, Stride<N * K, N * K, N * K, K, 1>>;
+    using TileMatB_PV = Tile<TileType::Mat, bfloat16_t, N, K, BLayout::ColMajor, N, K, SLayout::RowMajor, 512>;
+    using PijMatTile = Tile<TileType::Mat, bfloat16_t, M, N, BLayout::ColMajor, M, N, SLayout::RowMajor, 512>;
+    using LeftTile_PV = TileLeft<bfloat16_t, M, N, M, N>;
+    using RightTile_PV = TileRight<bfloat16_t, N, K, N, K>;
+    using AccTile_PV = TileAcc<float, M, K, M, K>;
+
+    constexpr int kQKBBytes = K * N * static_cast<int>(sizeof(bfloat16_t));
+    constexpr int kPVBBytes = N * K * static_cast<int>(sizeof(bfloat16_t));
+
+    TileMatA_QK aMatTile_QK;
+    TileMatB_QK bMatTile_QK_A, bMatTile_QK_B;
+    TASSIGN(aMatTile_QK, 0x0);
+    TASSIGN(bMatTile_QK_A, 0x20000);
+    TASSIGN(bMatTile_QK_B, 0x20000 + kQKBBytes);
+
+    LeftTile_QK aTile_QK;
+    RightTile_QK bTile_QK;
+    AccTile_QK cTile_QK;
+    TASSIGN(aTile_QK, 0x0);
+    TASSIGN(bTile_QK, 0x0);
+    TASSIGN(cTile_QK, 0x0);
+
+    PijMatTile pijMatTile;
+    TileMatB_PV bMatTile_PV_A, bMatTile_PV_B;
+    TASSIGN(bMatTile_PV_A, Cfg::PIJ_L1_BASE + Cfg::PIJ_L1_SIZE);
+    TASSIGN(bMatTile_PV_B, Cfg::PIJ_L1_BASE + Cfg::PIJ_L1_SIZE + kPVBBytes);
+
+    LeftTile_PV aTile_PV;
+    RightTile_PV bTile_PV;
+    AccTile_PV cTile_PV;
+    TASSIGN(aTile_PV, 0x0);
+    TASSIGN(bTile_PV, 0x0);
+    TASSIGN(cTile_PV, 0x0);
+
+    GlobalA_QK qiGlobal(qi_base);
+    TLOAD(aMatTile_QK, qiGlobal);
+
+    if (n_blocks == 1) {
+        // Degenerate case: no pipeline overlap possible
+        uint64_t block_id = static_cast<uint64_t>(bt[bt_offset]);
+        aic_qk_step<M, K, N, SijPipeT, GlobalB_QK>(
+            key_base, block_id, 0, aMatTile_QK, bMatTile_QK_A, bMatTile_QK_B, aTile_QK, bTile_QK, cTile_QK, sij_pipe
+        );
+        aic_pv_step<M, K, N, PijPipeT, OiPipeT, GlobalB_PV>(
+            val_base, block_id, 0, pijMatTile, bMatTile_PV_A, bMatTile_PV_B, aTile_PV, bTile_PV, cTile_PV, pij_pipe,
+            oi_pipe
+        );
+    } else {
+        // Prologue: QK[0] — produces sij[0] for AIV to start SF[0]
+        uint64_t prev_block_id = static_cast<uint64_t>(bt[bt_offset]);
+        uint64_t next_block_id = static_cast<uint64_t>(bt[bt_offset + 1]);
+        aic_qk_step<M, K, N, SijPipeT, GlobalB_QK>(
+            key_base, prev_block_id, 0, aMatTile_QK, bMatTile_QK_A, bMatTile_QK_B, aTile_QK, bTile_QK, cTile_QK,
+            sij_pipe, false, true, next_block_id
+        );
+        // Steady state: QK[i] then PV[i-1] (QK-first order).
+        for (uint64_t i = 1; i < n_blocks; i++) {
+            uint64_t block_id = static_cast<uint64_t>(bt[bt_offset + i]);
+            uint64_t next_block_id = (i + 1 < n_blocks) ? static_cast<uint64_t>(bt[bt_offset + i + 1]) : 0;
+            aic_qk_step<M, K, N, SijPipeT, GlobalB_QK>(
+                key_base, block_id, i, aMatTile_QK, bMatTile_QK_A, bMatTile_QK_B, aTile_QK, bTile_QK, cTile_QK,
+                sij_pipe, true, i + 1 < n_blocks, next_block_id
+            );
+            aic_pv_step<M, K, N, PijPipeT, OiPipeT, GlobalB_PV>(
+                val_base, prev_block_id, i - 1, pijMatTile, bMatTile_PV_A, bMatTile_PV_B, aTile_PV, bTile_PV, cTile_PV,
+                pij_pipe, oi_pipe, i > 1, i < n_blocks, block_id
+            );
+            prev_block_id = block_id;
+        }
+
+        // Epilogue: PV[n-1] — consume last pij
+        aic_pv_step<M, K, N, PijPipeT, OiPipeT, GlobalB_PV>(
+            val_base, prev_block_id, n_blocks - 1, pijMatTile, bMatTile_PV_A, bMatTile_PV_B, aTile_PV, bTile_PV,
+            cTile_PV, pij_pipe, oi_pipe, n_blocks > 1
+        );
+    }
+}
+
+// ============================================================================
+// AIV (Vector) processing — SF-first offset-loop software pipeline
+//
+// SF-first order: each steady-state iteration does SF[i] then UP[i-1].
+// This ensures pij[i] is produced as early as possible so AIC's TPOP(pij)
+// never stalls behind a pending UP computation. Combined with AIC's
+// QK-first order, SF[i] overlaps with AIC's PV[i-1] Cube matmul.
+// ============================================================================
+
+// Helper: softmax step for block i — TPOP sij, compute softmax, TPUSH pij
+//
+// globalMaxRow is used as a running accumulator: on entry it holds the max
+// from the previous iteration (or is undefined when i==0). SF updates it
+// in-place to max(globalMaxRow, localMaxRow_i * scale). The caller must
+// save globalMaxRow before calling SF if the old value is still needed.
+template <
+    typename Cfg, int TM, int TN, typename SijVecTile, typename TileSijPad, typename TileVecMxN,
+    typename PijVecBf16Tile, typename TileScalarDN, typename TileScalarRow>
+static __aicore__ void aiv_sf_step(
+    uint64_t i, bool is_last_partial, uint64_t valid_len_last, float scale_value, SijVecTile &sijTile,
+    TileSijPad &sijPadTile, TileVecMxN &pijTile, TileVecMxN &tmpTile, PijVecBf16Tile &pijBf16Tile,
+    TileScalarDN &localMaxDN, TileScalarDN &globalMaxDN, TileScalarDN &llDN, TileScalarRow &localMaxRow,
+    TileScalarRow &globalMaxRow, typename Cfg::SijPipeT &sij_pipe, typename Cfg::PijPipeT &pij_pipe
+) {
+    using TileSijDyn = Tile<TileType::Vec, float, TM, TN, BLayout::RowMajor, TM, -1>;
+    using SijPipeT = typename Cfg::SijPipeT;
+    using PijPipeT = typename Cfg::PijPipeT;
+
+    TPOP<SijPipeT, SijVecTile, TileSplitAxis::TILE_UP_DOWN>(sij_pipe, sijTile);
+
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+    if (is_last_partial) {
+        int sij_addr = Cfg::SIJ_UB_BASE + static_cast<int>((i % 2) * TM * TN * static_cast<int>(sizeof(float)));
+        TASSIGN(sijPadTile, sij_addr);
+        TileSijDyn sijDynTile(static_cast<size_t>(valid_len_last));
+        TASSIGN(sijDynTile, sij_addr);
+        TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+        pipe_barrier(PIPE_V);
+    }
+
+    TROWMAX(localMaxDN, sijTile, tmpTile);
+    pipe_barrier(PIPE_V);
+    TRESHAPE(localMaxRow, localMaxDN);
+
+    if (i == 0) {
+        TMULS(globalMaxRow, localMaxRow, scale_value);
+    } else {
+        TMULS(localMaxRow, localMaxRow, scale_value);
+        pipe_barrier(PIPE_V);
+        TMAX(globalMaxRow, globalMaxRow, localMaxRow);
+    }
+    TRESHAPE(globalMaxDN, globalMaxRow);
+
+    TMULS(sijTile, sijTile, scale_value);
+    pipe_barrier(PIPE_V);
+    TROWEXPANDSUB(pijTile, sijTile, globalMaxDN);
+    pipe_barrier(PIPE_V);
+    TEXP(pijTile, pijTile);
+    pipe_barrier(PIPE_V);
+
+    TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+    pipe_barrier(PIPE_V);
+    TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+    pipe_barrier(PIPE_V);
+
+    TROWSUM(llDN, pijTile, tmpTile);
+
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TPUSH<PijPipeT, PijVecBf16Tile, TileSplitAxis::TILE_UP_DOWN>(pij_pipe, pijBf16Tile);
+}
+
+// Helper: online update step for block i — TPOP oi, merge with accumulators
+//
+// curMaxRow  = M[i]   = running max over blocks 0..i   (mij in FlashAttention notation)
+// prevMaxRow = M[i-1] = running max over blocks 0..i-1 (dm / old max)
+// llDN_i     = row-sum of pij for block i
+//
+// alpha = exp(prevMaxRow - curMaxRow), used to rescale accumulated go and gl.
+template <
+    typename Cfg, int TM, int TN, typename OiVecTile, typename TileDataMxHD, typename TileScalarDN,
+    typename TileScalarND, typename TileScalarRow>
+static __aicore__ void aiv_up_step(
+    uint64_t i, OiVecTile &oiNewTile, TileDataMxHD &goTile, TileScalarDN &alphaDN_dn, TileScalarDN &llDN_i,
+    TileScalarND &glND, TileScalarND &alphaND, TileScalarND &llND, TileScalarND &dmND, TileScalarND &mijND,
+    TileScalarRow &curMaxRow, TileScalarRow &prevMaxRow, typename Cfg::OiPipeT &oi_pipe
+) {
+    using OiPipeT = typename Cfg::OiPipeT;
+    TPOP<OiPipeT, OiVecTile, TileSplitAxis::TILE_UP_DOWN>(oi_pipe, oiNewTile);
+
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+
+    if (i == 0) {
+        TMULS(goTile, oiNewTile, 1.0f);
+        TRESHAPE(llND, llDN_i);
+        pipe_barrier(PIPE_V);
+        TMULS(glND, llND, 1.0f);
+    } else {
+        TRESHAPE(llND, llDN_i);
+        TRESHAPE(mijND, curMaxRow);
+        TRESHAPE(dmND, prevMaxRow);
+
+        TSUB(alphaND, dmND, mijND);
+        pipe_barrier(PIPE_V);
+        TEXP(alphaND, alphaND);
+        pipe_barrier(PIPE_V);
+
+        TRESHAPE(alphaDN_dn, alphaND);
+        TROWEXPANDMUL(goTile, goTile, alphaDN_dn);
+        pipe_barrier(PIPE_V);
+        TADD(goTile, goTile, oiNewTile);
+
+        TMUL(glND, glND, alphaND);
+        pipe_barrier(PIPE_V);
+        TADD(glND, glND, llND);
+    }
+
+    pipe_barrier(PIPE_V);
+}
+
+template <typename Cfg, int TN>
+static __aicore__ void aiv_process_blocks(
+    float scale_value, uint64_t n_blocks, uint64_t valid_len_last, __gm__ float *dst_ptr,
+    typename Cfg::SijPipeT &sij_pipe, typename Cfg::PijPipeT &pij_pipe, typename Cfg::OiPipeT &oi_pipe
+) {
+    constexpr int TM = Cfg::SUB_QT;
+    constexpr int HD = HEAD_DIM;
+    constexpr int kAlignedRows = ((TM * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = TM / kScalarCols;
+
+    using SijVecTile = Tile<TileType::Vec, float, TM, TN, BLayout::RowMajor, TM, TN>;
+    using PijVecBf16Tile = Tile<TileType::Vec, bfloat16_t, TM, TN, BLayout::RowMajor, TM, TN>;
+    using OiVecTile = Tile<TileType::Vec, float, TM, HD, BLayout::RowMajor, TM, HD>;
+
+    using TileVecMxN = Tile<TileType::Vec, float, TM, TN, BLayout::RowMajor, TM, TN>;
+    using TileSijPad =
+        Tile<TileType::Vec, float, TM, TN, BLayout::RowMajor, TM, TN, SLayout::NoneBox, 512, PadValue::Min>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, TM, 1>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    using TileScalarRow = Tile<TileType::Vec, float, 1, TM, BLayout::RowMajor, 1, TM>;
+    using TileDataMxHD = Tile<TileType::Vec, float, TM, HD, BLayout::RowMajor, TM, HD>;
+    using GlobalDataMxHD = GlobalTensor<float, Shape<1, 1, 1, TM, HD>, Stride<1, 1, 1, HD, 1>>;
+
+    constexpr int kSijBytes = TM * TN * sizeof(float);
+    constexpr int kPijBf16Bytes = TM * TN * sizeof(bfloat16_t);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+    constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float);
+
+    SijVecTile sijTile;
+    TileSijPad sijPadTile;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    PijVecBf16Tile pijBf16Tile;
+    TileScalarDN localMaxDN, globalMaxDN;
+    TileScalarDN alphaDN_dn, llDN, glDN;
+    TileScalarDN savedLlDN;
+    TileScalarND gmND, glND, alphaND, llND, dmND, miNewND, mijND;
+    TileScalarRow localMaxRow, globalMaxRow;
+    TileScalarRow savedMaxRow, prevMaxRow;
+    OiVecTile oiNewTile;
+    TileDataMxHD goTile;
+
+    int ub = Cfg::WORK_UB_BASE;
+    TASSIGN(pijTile, ub);
+    ub += kSijBytes;
+    TASSIGN(pijBf16Tile, ub);
+    ub += kPijBf16Bytes;
+    TASSIGN(tmpTile, ub);
+    ub += kSijBytes;
+
+    int sb = ub;
+    TASSIGN(localMaxDN, sb);
+    TASSIGN(localMaxRow, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(globalMaxDN, sb);
+    TASSIGN(globalMaxRow, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(gmND, sb);
+    TASSIGN(savedMaxRow, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(glND, sb);
+    TASSIGN(glDN, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(alphaND, sb);
+    TASSIGN(alphaDN_dn, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(llND, sb);
+    TASSIGN(llDN, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(dmND, sb);
+    sb += kScalarNDBytes;
+    TASSIGN(miNewND, sb);
+    sb += kScalarNDBytes;
+    TASSIGN(mijND, sb);
+    sb += kScalarNDBytes;
+    TASSIGN(prevMaxRow, sb);
+    sb += kScalarDNBytes;
+    TASSIGN(savedLlDN, sb);
+    sb += kScalarDNBytes;
+
+    TASSIGN(goTile, sb);
+
+    GlobalDataMxHD dstGlobal(dst_ptr);
+
+    bool last_partial = (valid_len_last < static_cast<uint64_t>(TN));
+
+    if (n_blocks == 1) {
+        aiv_sf_step<Cfg, TM, TN>(
+            0, last_partial, valid_len_last, scale_value, sijTile, sijPadTile, pijTile, tmpTile, pijBf16Tile,
+            localMaxDN, globalMaxDN, llDN, localMaxRow, globalMaxRow, sij_pipe, pij_pipe
+        );
+        aiv_up_step<Cfg, TM, TN>(
+            0, oiNewTile, goTile, alphaDN_dn, llDN, glND, alphaND, llND, dmND, mijND, globalMaxRow, globalMaxRow,
+            oi_pipe
+        );
+    } else {
+        // Prologue: SF[0] — not the last block
+        aiv_sf_step<Cfg, TM, TN>(
+            0, false, valid_len_last, scale_value, sijTile, sijPadTile, pijTile, tmpTile, pijBf16Tile, localMaxDN,
+            globalMaxDN, llDN, localMaxRow, globalMaxRow, sij_pipe, pij_pipe
+        );
+
+        // Steady state: SF[i] then UP[i-1] (SF-first order).
+        for (uint64_t i = 1; i < n_blocks; i++) {
+            // Shift max history: prevMaxRow ← savedMaxRow (M[i-2])
+            // Save current: savedMaxRow ← globalMaxRow (M[i-1])
+            TMULS(prevMaxRow, savedMaxRow, 1.0f);
+            TMULS(savedMaxRow, globalMaxRow, 1.0f);
+            TMULS(savedLlDN, llDN, 1.0f);
+            pipe_barrier(PIPE_V);
+
+            bool cur_last_partial = (i == n_blocks - 1) && last_partial;
+            aiv_sf_step<Cfg, TM, TN>(
+                i, cur_last_partial, valid_len_last, scale_value, sijTile, sijPadTile, pijTile, tmpTile, pijBf16Tile,
+                localMaxDN, globalMaxDN, llDN, localMaxRow, globalMaxRow, sij_pipe, pij_pipe
+            );
+
+            aiv_up_step<Cfg, TM, TN>(
+                i - 1, oiNewTile, goTile, alphaDN_dn, savedLlDN, glND, alphaND, llND, dmND, mijND, savedMaxRow,
+                prevMaxRow, oi_pipe
+            );
+        }
+
+        // Epilogue: UP[n-1] — uses live globalMaxRow (M[n-1]) and savedMaxRow (M[n-2])
+        aiv_up_step<Cfg, TM, TN>(
+            n_blocks - 1, oiNewTile, goTile, alphaDN_dn, llDN, glND, alphaND, llND, dmND, mijND, globalMaxRow,
+            savedMaxRow, oi_pipe
+        );
+    }
+
+    // Final normalization: output = goTile / glDN
+    TRESHAPE(glDN, glND);
+    pipe_barrier(PIPE_V);
+    TROWEXPANDDIV(goTile, goTile, glDN);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(dstGlobal, goTile);
+
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+}
+
+// ============================================================================
+// Per-config dispatch: builds pipes from per-hw-block FIFO bases, then runs
+// the AIC or AIV stride loop over total_logical_blocks.
+// ============================================================================
+
+template <typename Cfg>
+static __aicore__ void run_aic(
+    __gm__ int64_t *args, __gm__ int32_t *ctx_ptr, int32_t hw_block_idx, int32_t hw_block_num,
+    int64_t total_logical_blocks, int64_t num_heads, int64_t head_dim, int64_t block_size, int64_t max_blocks_per_req,
+    int64_t q_loop, __gm__ void *sij_fifo_base, __gm__ void *pij_fifo_base, __gm__ void *oi_fifo_base
+) {
+    typename Cfg::SijPipeT sij_pipe(sij_fifo_base, Cfg::SIJ_UB_BASE, 0U);
+    typename Cfg::PijPipeT pij_pipe(pij_fifo_base, 0U, Cfg::PIJ_L1_BASE);
+    typename Cfg::OiPipeT oi_pipe(oi_fifo_base, Cfg::OI_UB_BASE, 0U);
+
+    // Disable auto-record on C2V pipes: AccTile TSTORE goes through FIX → MTE3,
+    // but auto-record fires on PIPE_FIX which may complete before MTE3 DMA writes
+    // to GM. Manual pipe_barrier(PIPE_MTE3) + record() in each step ensures the
+    // cross-core signal fires only after the GM write is visible.
+    sij_pipe.prod.setRecordStatus(false);
+    oi_pipe.prod.setRecordStatus(false);
+
+    // Disable reverse-dependency sync (back-pressure). Forward dependency chain
+    // (AIC: QK-first; AIV: SF-first; FIFO_DEPTH=2) guarantees producer is at
+    // most SLOT_NUM=2 tiles ahead of consumer:
+    //   sij: AIC pushes sij[i+1] only after TPOP(pij[i-1]), which requires
+    //        AIV TPOP(sij[i-1]) — slot reuse safe.
+    //   oi : AIC pushes oi[i+1]  only after TPOP(pij[i+1]), which requires
+    //        AIV's iter i+1 SF, by which time AIV iter i finished UP[i-1]
+    //        i.e. TPOP(oi[i-1]) — slot reuse safe.
+    //   pij: AIV pushes pij[i+1] only after TPOP(sij[i+1]), which fires after
+    //        AIC iter i+1 starts and AIC iter i has finished TPOP(pij[i-1])
+    //        — slot reuse safe.
+    // If the QK-first/SF-first interleaving or FIFO_DEPTH changes, restore
+    // these flags.
+    sij_pipe.prod.setAllocateStatus(false);
+    oi_pipe.prod.setAllocateStatus(false);
+    pij_pipe.cons.setFreeStatus(false);
+
+    __gm__ Tensor *query_t = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *key_cache_t = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *value_cache_t = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[3]);
+
+    __gm__ bfloat16_t *query_base = reinterpret_cast<__gm__ bfloat16_t *>(query_t->buffer.addr) + query_t->start_offset;
+    __gm__ bfloat16_t *key_base =
+        reinterpret_cast<__gm__ bfloat16_t *>(key_cache_t->buffer.addr) + key_cache_t->start_offset;
+    __gm__ bfloat16_t *val_base =
+        reinterpret_cast<__gm__ bfloat16_t *>(value_cache_t->buffer.addr) + value_cache_t->start_offset;
+    __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr) + block_table_t->start_offset;
+
+    for (int32_t block_idx = hw_block_idx; block_idx < total_logical_blocks; block_idx += hw_block_num) {
+        int64_t batch_idx = block_idx / q_loop;
+        int64_t q_tile_idx = block_idx % q_loop;
+
+        int64_t cur_seq = static_cast<int64_t>(ctx_ptr[batch_idx]);
+        int64_t n_blocks = (cur_seq + block_size - 1) / block_size;
+        if (n_blocks <= 0) continue;
+
+        int64_t q_offset = (batch_idx * num_heads + q_tile_idx * Cfg::Q_TILE) * head_dim;
+        __gm__ bfloat16_t *qi_base = query_base + q_offset;
+        uint64_t bt_offset = static_cast<uint64_t>(batch_idx * max_blocks_per_req);
+
+        if (block_size == 128) {
+            aic_process_blocks<Cfg, 128, 128>(
+                qi_base, key_base, val_base, bt, bt_offset, static_cast<uint64_t>(n_blocks), sij_pipe, pij_pipe, oi_pipe
+            );
+        } else {
+            aic_process_blocks<Cfg, 128, 64>(
+                qi_base, key_base, val_base, bt, bt_offset, static_cast<uint64_t>(n_blocks), sij_pipe, pij_pipe, oi_pipe
+            );
+        }
+    }
+}
+
+template <typename Cfg>
+static __aicore__ void run_aiv(
+    __gm__ int64_t *args, __gm__ int32_t *ctx_ptr, int32_t hw_block_idx, int32_t hw_block_num,
+    int64_t total_logical_blocks, int64_t num_heads, int64_t head_dim, int64_t block_size, int64_t q_loop,
+    __gm__ void *sij_fifo_base, __gm__ void *pij_fifo_base, __gm__ void *oi_fifo_base
+) {
+    typename Cfg::SijPipeT sij_pipe(sij_fifo_base, Cfg::SIJ_UB_BASE, 0U);
+    typename Cfg::PijPipeT pij_pipe(pij_fifo_base, 0U, Cfg::PIJ_L1_BASE);
+    typename Cfg::OiPipeT oi_pipe(oi_fifo_base, Cfg::OI_UB_BASE, 0U);
+
+    __gm__ Tensor *out_t = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    float scale_value = from_u64<float>(static_cast<uint64_t>(args[9]));
+
+    int32_t sub_block_id = get_sub_block_id(args);
+    int64_t row_offset = sub_block_id * Cfg::SUB_QT;
+
+    // Entry offsets depend on the actual tile width (block_size for sij/pij, HEAD_DIM for oi).
+    // TILE_UP_DOWN splits Q_TILE rows into two SUB_QT halves; AIV1's data starts at
+    // SUB_QT * tile_width * sizeof(element) within the contiguous TPUSH'd tile.
+    int sij_sub_offset = sub_block_id * Cfg::SUB_QT * static_cast<int>(block_size) * static_cast<int>(sizeof(float));
+    int pij_sub_offset =
+        sub_block_id * Cfg::SUB_QT * static_cast<int>(block_size) * static_cast<int>(sizeof(bfloat16_t));
+    int oi_sub_offset = sub_block_id * Cfg::SUB_QT * HEAD_DIM * static_cast<int>(sizeof(float));
+    sij_pipe.cons.setEntryOffset(sij_sub_offset);
+    pij_pipe.prod.setEntryOffset(pij_sub_offset);
+    oi_pipe.cons.setEntryOffset(oi_sub_offset);
+
+    // Mirror reverse-dependency disable on the AIV side (see run_aic for
+    // the full forward-chain argument).
+    pij_pipe.prod.setAllocateStatus(false);
+    sij_pipe.cons.setFreeStatus(false);
+    oi_pipe.cons.setFreeStatus(false);
+
+    __gm__ float *out_base = reinterpret_cast<__gm__ float *>(out_t->buffer.addr) + out_t->start_offset;
+
+    for (int32_t block_idx = hw_block_idx; block_idx < total_logical_blocks; block_idx += hw_block_num) {
+        int64_t batch_idx = block_idx / q_loop;
+        int64_t q_tile_idx = block_idx % q_loop;
+
+        int64_t cur_seq = static_cast<int64_t>(ctx_ptr[batch_idx]);
+        int64_t n_blocks = (cur_seq + block_size - 1) / block_size;
+
+        int64_t out_offset = (batch_idx * num_heads + q_tile_idx * Cfg::Q_TILE + row_offset) * head_dim;
+        __gm__ float *dst = out_base + out_offset;
+
+        if (n_blocks <= 0) {
+            using ZeroTile =
+                Tile<TileType::Vec, float, Cfg::SUB_QT, HEAD_DIM, BLayout::RowMajor, Cfg::SUB_QT, HEAD_DIM>;
+            using ZeroGlobal = GlobalTensor<float, Shape<1, 1, 1, Cfg::SUB_QT, HEAD_DIM>, Stride<1, 1, 1, HEAD_DIM, 1>>;
+            ZeroTile zeroTile;
+            TASSIGN(zeroTile, Cfg::WORK_UB_BASE);
+            TEXPANDS(zeroTile, 0.0f);
+            pipe_barrier(PIPE_V);
+            ZeroGlobal dstZero(dst);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(dstZero, zeroTile);
+            pipe_barrier(PIPE_MTE3);
+            continue;
+        }
+
+        int64_t last_block_seq = (n_blocks - 1) * block_size;
+        int64_t remaining = cur_seq - last_block_seq;
+        uint64_t valid_len_last = (remaining >= block_size) ? static_cast<uint64_t>(block_size) :
+                                                              (remaining > 0 ? static_cast<uint64_t>(remaining) : 0);
+
+        if (block_size == 128) {
+            aiv_process_blocks<Cfg, 128>(
+                scale_value, static_cast<uint64_t>(n_blocks), valid_len_last, dst, sij_pipe, pij_pipe, oi_pipe
+            );
+        } else {
+            aiv_process_blocks<Cfg, 64>(
+                scale_value, static_cast<uint64_t>(n_blocks), valid_len_last, dst, sij_pipe, pij_pipe, oi_pipe
+            );
+        }
+    }
+}
+
+// ============================================================================
+// Entry point — shared by AIC and AIV via DAV_CUBE / DAV_VEC guards
+// ============================================================================
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *context_lens_t = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *sij_fifo_t = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    __gm__ Tensor *pij_fifo_t = reinterpret_cast<__gm__ Tensor *>(args[7]);
+    __gm__ Tensor *oi_fifo_t = reinterpret_cast<__gm__ Tensor *>(args[8]);
+
+    int64_t num_heads = static_cast<int64_t>(args[10]);
+    int64_t head_dim = static_cast<int64_t>(args[11]);
+    int64_t block_size = static_cast<int64_t>(args[12]);
+    int64_t max_blocks_per_req = static_cast<int64_t>(args[13]);
+    int64_t q_loop = static_cast<int64_t>(args[14]);
+    int64_t total_logical_blocks = static_cast<int64_t>(args[15]);
+    int64_t q_tile = static_cast<int64_t>(args[16]);
+
+    int32_t hw_block_idx = get_block_idx(args);
+    int32_t hw_block_num = get_block_num(args);
+
+    __gm__ int32_t *ctx_ptr =
+        reinterpret_cast<__gm__ int32_t *>(context_lens_t->buffer.addr) + context_lens_t->start_offset;
+
+    // GM FIFO buffer per hardware block (reused across stride-loop iterations).
+    // Slot stride is sized for max(Q_TILE) so the same offset works for both q_tile=16 and 64.
+    constexpr uint32_t SIJ_HW_STRIDE = PAConfig<MAX_Q_TILE>::SIJ_SLOT_SIZE * FIFO_DEPTH;
+    constexpr uint32_t PIJ_HW_STRIDE = PAConfig<MAX_Q_TILE>::PIJ_SLOT_SIZE * FIFO_DEPTH;
+    constexpr uint32_t OI_HW_STRIDE = PAConfig<MAX_Q_TILE>::OI_SLOT_SIZE * FIFO_DEPTH;
+
+    __gm__ void *sij_fifo_base = reinterpret_cast<__gm__ void *>(
+        reinterpret_cast<__gm__ uint8_t *>(sij_fifo_t->buffer.addr) + hw_block_idx * SIJ_HW_STRIDE
+    );
+    __gm__ void *pij_fifo_base = reinterpret_cast<__gm__ void *>(
+        reinterpret_cast<__gm__ uint8_t *>(pij_fifo_t->buffer.addr) + hw_block_idx * PIJ_HW_STRIDE
+    );
+    __gm__ void *oi_fifo_base = reinterpret_cast<__gm__ void *>(
+        reinterpret_cast<__gm__ uint8_t *>(oi_fifo_t->buffer.addr) + hw_block_idx * OI_HW_STRIDE
+    );
+
+    if constexpr (DAV_CUBE) {
+        if (q_tile == 16) {
+            run_aic<PAConfig<16>>(
+                args, ctx_ptr, hw_block_idx, hw_block_num, total_logical_blocks, num_heads, head_dim, block_size,
+                max_blocks_per_req, q_loop, sij_fifo_base, pij_fifo_base, oi_fifo_base
+            );
+        } else {
+            run_aic<PAConfig<MAX_Q_TILE>>(
+                args, ctx_ptr, hw_block_idx, hw_block_num, total_logical_blocks, num_heads, head_dim, block_size,
+                max_blocks_per_req, q_loop, sij_fifo_base, pij_fifo_base, oi_fifo_base
+            );
+        }
+    }
+
+    if constexpr (DAV_VEC) {
+        if (q_tile == 16) {
+            run_aiv<PAConfig<16>>(
+                args, ctx_ptr, hw_block_idx, hw_block_num, total_logical_blocks, num_heads, head_dim, block_size,
+                q_loop, sij_fifo_base, pij_fifo_base, oi_fifo_base
+            );
+        } else {
+            run_aiv<PAConfig<MAX_Q_TILE>>(
+                args, ctx_ptr, hw_block_idx, hw_block_num, total_logical_blocks, num_heads, head_dim, block_size,
+                q_loop, sij_fifo_base, pij_fifo_base, oi_fifo_base
+            );
+        }
+    }
+}
+// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/orchestration/spmd_paged_attention_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/orchestration/spmd_paged_attention_orch.cpp
new file mode 100644
index 000000000..07e90500b
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/orchestration/spmd_paged_attention_orch.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * SPMD Paged Attention Orchestration with TPUSH/TPOP (fixed block_num=24)
+ *
+ * Submits a single MixedKernels task with hardware block_num fixed at 24.
+ * total_logical_blocks = batch * q_loop logical work items are distributed
+ * across the 24 hardware blocks via a stride loop inside the kernel:
+ *   for (block_idx = hw_block_idx; block_idx < total_logical_blocks; block_idx += 24)
+ *
+ * q_tile adapts to num_heads at runtime: q_tile = min(num_heads, MAX_Q_TILE).
+ * When num_heads <= MAX_Q_TILE (=64), q_loop = 1 and each block processes all heads.
+ *
+ * Each iteration of the stride loop processes one (batch_idx, q_tile_idx) logical
+ * position, running the full AIC/AIV cooperative pipeline via TPUSH/TPOP pipes:
+ *   AIC: QK matmul -> TPUSH(sij) -> TPOP(pij) -> PV matmul -> TPUSH(oi_new)
+ *   AIV: TPOP(sij) -> online softmax -> TPUSH(pij) -> TPOP(oi_new) -> online update
+ *
+ * GM FIFO buffers for TPUSH/TPOP are sized for the 24 hardware blocks (not for
+ * total_logical_blocks). Each hardware block owns its own FIFO slots and reuses
+ * them across stride-loop iterations.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cinttypes>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_PA_AIC 0
+#define FUNC_PA_AIV 1
+
+static constexpr uint64_t MAX_Q_TILE = 64;
+static constexpr uint64_t HEAD_DIM = 128;
+static constexpr uint64_t MAX_BLOCK_SIZE = 128;
+static constexpr int16_t SPMD_BLOCK_NUM = 24;
+
+// GM FIFO slot sizes (must match kernel's PAConfig<MAX_Q_TILE> constants).
+// Sized for the maximum (q_tile, block_size) so the same FIFO layout works
+// for both q_tile=16 and q_tile=64 dispatch paths inside the kernel.
+static constexpr uint32_t SIJ_SLOT_SIZE = MAX_Q_TILE * MAX_BLOCK_SIZE * sizeof(float);
+static constexpr uint32_t PIJ_SLOT_SIZE = MAX_Q_TILE * MAX_BLOCK_SIZE * sizeof(uint16_t);
+static constexpr uint32_t OI_SLOT_SIZE = MAX_Q_TILE * HEAD_DIM * sizeof(float);
+static constexpr uint32_t FIFO_DEPTH = 2;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+    uint64_t max_num_blocks_per_req = orch_args.tensor(3).ref().shapes[1];
+    uint64_t scale_value = orch_args.scalar(0);
+
+    // q_tile adapts to num_heads: use 64 when num_heads >= 64, else 16.
+    // The kernel statically dispatches on q_tile == 16 vs 64.
+    uint64_t q_tile = (num_heads >= MAX_Q_TILE) ? MAX_Q_TILE : 16;
+    uint64_t q_loop = (num_heads + q_tile - 1) / q_tile;
+    int64_t total_logical_blocks = static_cast<int64_t>(batch * q_loop);
+
+    LOG_INFO_V0(
+        "SPMD PA TPUSH/TPOP: batch=%" PRIu64 " heads=%" PRIu64 " hd=%" PRIu64 " bs=%" PRIu64 " q_tile=%" PRIu64
+        " q_loop=%" PRIu64 " hw_blocks=%d logical_blocks=%" PRId64,
+        batch, num_heads, head_dim, block_size, q_tile, q_loop, SPMD_BLOCK_NUM, total_logical_blocks
+    );
+
+    // Wrap host tensors
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_kv_blocks = orch_args.tensor(1).ref().shapes[0];
+    uint64_t kv_total_rows = total_kv_blocks * block_size;
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t kv_shapes[2] = {static_cast<uint32_t>(kv_total_rows), static_cast<uint32_t>(head_dim)};
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
+    Tensor key_cache = make_tensor_external(kc_ptr, kv_shapes, 2, data_type);
+    Tensor value_cache = make_tensor_external(vc_ptr, kv_shapes, 2, data_type);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(max_num_blocks_per_req)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+    // GM FIFO buffers for TPUSH/TPOP (one set of slots per hardware block)
+    uint32_t sij_fifo_total = static_cast<uint32_t>(SPMD_BLOCK_NUM) * SIJ_SLOT_SIZE * FIFO_DEPTH;
+    uint32_t pij_fifo_total = static_cast<uint32_t>(SPMD_BLOCK_NUM) * PIJ_SLOT_SIZE * FIFO_DEPTH;
+    uint32_t oi_fifo_total = static_cast<uint32_t>(SPMD_BLOCK_NUM) * OI_SLOT_SIZE * FIFO_DEPTH;
+
+    // Allocate as 1D byte tensors (using INT32 for 4-byte alignment, divide by 4)
+    uint32_t sij_fifo_shapes[1] = {sij_fifo_total / sizeof(int32_t)};
+    uint32_t pij_fifo_shapes[1] = {pij_fifo_total / sizeof(int32_t)};
+    uint32_t oi_fifo_shapes[1] = {oi_fifo_total / sizeof(int32_t)};
+
+    TensorCreateInfo sij_fifo_ci(sij_fifo_shapes, 1, DataType::INT32);
+    TensorCreateInfo pij_fifo_ci(pij_fifo_shapes, 1, DataType::INT32);
+    TensorCreateInfo oi_fifo_ci(oi_fifo_shapes, 1, DataType::INT32);
+
+    PTO2_SCOPE() {
+        L0TaskArgs args;
+        args.add_input(query);
+        args.add_input(key_cache);
+        args.add_input(value_cache);
+        args.add_input(block_table);
+        args.add_input(context_lens);
+        args.add_inout(out);
+        args.add_output(sij_fifo_ci);
+        args.add_output(pij_fifo_ci);
+        args.add_output(oi_fifo_ci);
+        args.add_scalar(scale_value);
+        args.add_scalar(static_cast<int64_t>(num_heads));
+        args.add_scalar(static_cast<int64_t>(head_dim));
+        args.add_scalar(static_cast<int64_t>(block_size));
+        args.add_scalar(static_cast<int64_t>(max_num_blocks_per_req));
+        args.add_scalar(static_cast<int64_t>(q_loop));
+        args.add_scalar(total_logical_blocks);
+        args.add_scalar(static_cast<int64_t>(q_tile));
+        args.launch_spec.set_block_num(SPMD_BLOCK_NUM);
+
+        MixedKernels mk;
+        mk.aic_kernel_id = FUNC_PA_AIC;
+        mk.aiv0_kernel_id = FUNC_PA_AIV;
+        mk.aiv1_kernel_id = FUNC_PA_AIV;
+        rt_submit_task(mk, args);
+    }
+
+    LOG_INFO_V0(
+        "SPMD PA TPUSH/TPOP: submitted 1 MixedKernels task, hw_blocks=%d logical=%" PRId64,
+        static_cast<int>(SPMD_BLOCK_NUM), total_logical_blocks
+    );
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/test_spmd_paged_attention.py b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/test_spmd_paged_attention.py
new file mode 100644
index 000000000..4cf9898ad
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/test_spmd_paged_attention.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention unroll with TPUSH/TPOP: MIX kernel AIC+AIV cooperative pipeline."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttentionUnrollTpushPop(SceneTestCase):
+    # Tolerances relaxed (2e-3 -> 5e-3 in #825, then 5e-3 -> 1e-2 for #848)
+    # to absorb hardware numerical drift in the AIC/AIV cooperative TPUSH/TPOP
+    # pipeline; observed max_diff ~5.5e-3.
+    RTOL = 1e-2
+    ATOL = 1e-2
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "PA_AIC",
+                "source": "kernels/mix/paged_attention_parallel.cpp",
+                "core_type": "aic",
+                # Declare the full 9-tensor layout here (AIV entry left empty)
+                # so the tensor dump — which sums per-subtask signature tensors
+                # and matches them to the payload — captures all args under
+                # func_id 0. Consumed only by the dump; dispatch ignores it.
+                "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "name": "PA_AIV",
+                "source": "kernels/mix/paged_attention_parallel.cpp",
+                "core_type": "aiv",
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            # Intra-core trace target only (--case SmallCase1; manual -> not in
+            # the default onboard CI sweep). batch=24 == the orchestration's
+            # hardcoded SPMD_BLOCK_NUM, so every hw block gets one logical block
+            # (fewer stalls in the AIC<->AIV handshake). Same q_tile=16 path as
+            # Case1; passes golden at context_len=8192.
+            "name": "SmallCase1",
+            "platforms": ["a2a3"],
+            "manual": True,
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 24,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        result = _pa_generate_inputs(params)
+        specs = []
+        for name, value in result:
+            if isinstance(value, torch.Tensor):
+                specs.append(Tensor(name, value))
+            else:
+                specs.append(Scalar(name, value))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/aic/paged_attention_highperf.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/aic/paged_attention_highperf.cpp
new file mode 100644
index 000000000..eeba7a66d
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/aic/paged_attention_highperf.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include <cstdint>
+
+#ifdef __CPU_SIM
+#include <pto/pto-inst.hpp>
+#endif
+
+#include "tensor.h"
+
+#ifdef __CPU_SIM
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+static float half_to_float(uint16_t h) {
+    uint32_t sign = static_cast<uint32_t>(h & 0x8000) << 16;
+    uint32_t exp = (h >> 10) & 0x1f;
+    uint32_t mant = h & 0x03ff;
+    uint32_t bits;
+    if (exp == 0) {
+        if (mant == 0) {
+            bits = sign;
+        } else {
+            exp = 1;
+            while ((mant & 0x0400) == 0) {
+                mant <<= 1;
+                --exp;
+            }
+            mant &= 0x03ff;
+            bits = sign | ((exp + 112) << 23) | (mant << 13);
+        }
+    } else if (exp == 31) {
+        bits = sign | 0x7f800000 | (mant << 13);
+    } else {
+        bits = sign | ((exp + 112) << 23) | (mant << 13);
+    }
+    float out;
+    std::memcpy(&out, &bits, sizeof(out));
+    return out;
+}
+
+static uint16_t float_to_half(float f) {
+    uint32_t bits;
+    std::memcpy(&bits, &f, sizeof(bits));
+    uint32_t sign = (bits >> 16) & 0x8000;
+    int32_t exp = static_cast<int32_t>((bits >> 23) & 0xff) - 127 + 15;
+    uint32_t mant = bits & 0x7fffff;
+    if (exp <= 0) {
+        if (exp < -10) {
+            return static_cast<uint16_t>(sign);
+        }
+        mant = (mant | 0x800000) >> (1 - exp);
+        return static_cast<uint16_t>(sign | ((mant + 0x1000) >> 13));
+    }
+    if (exp >= 31) {
+        return static_cast<uint16_t>(sign | 0x7c00);
+    }
+    return static_cast<uint16_t>(sign | (static_cast<uint32_t>(exp) << 10) | ((mant + 0x1000) >> 13));
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+#ifdef __DAV_VEC__
+    (void)args;
+    return;
+#else
+    auto *query_t = reinterpret_cast<Tensor *>(args[0]);
+    auto *key_t = reinterpret_cast<Tensor *>(args[1]);
+    auto *value_t = reinterpret_cast<Tensor *>(args[2]);
+    auto *block_table_t = reinterpret_cast<Tensor *>(args[3]);
+    auto *out_t = reinterpret_cast<Tensor *>(args[4]);
+
+    auto *query = reinterpret_cast<uint16_t *>(query_t->buffer.addr) + query_t->start_offset;
+    auto *key = reinterpret_cast<uint16_t *>(key_t->buffer.addr) + key_t->start_offset;
+    auto *value = reinterpret_cast<uint16_t *>(value_t->buffer.addr) + value_t->start_offset;
+    auto *block_table = reinterpret_cast<int32_t *>(block_table_t->buffer.addr) + block_table_t->start_offset;
+    auto *out = reinterpret_cast<uint16_t *>(out_t->buffer.addr) + out_t->start_offset;
+
+    const int batch = static_cast<int>(query_t->shapes[0]);
+    const int num_heads = static_cast<int>(query_t->shapes[1]);
+    const int head_dim = static_cast<int>(query_t->shapes[2]);
+    const int block_size = static_cast<int>(key_t->shapes[1]);
+    const int num_kv_heads = static_cast<int>(key_t->shapes[2]);
+    const int blocks_per_batch = static_cast<int>(key_t->shapes[0]) / batch;
+    const int max_blocks_per_query = static_cast<int>(block_table_t->shapes[1]);
+    const int heads_per_kv = num_heads / num_kv_heads;
+    const int seq_len = blocks_per_batch * block_size;
+    const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
+
+    for (int b = 0; b < batch; ++b) {
+        for (int h = 0; h < num_heads; ++h) {
+            const int kv_head = h / heads_per_kv;
+            float max_score = -INFINITY;
+            for (int token = 0; token < seq_len; ++token) {
+                const int block_col = std::min(token / block_size, max_blocks_per_query - 1);
+                const int block_id = block_table[b * max_blocks_per_query + block_col];
+                const int block_token = token % block_size;
+                float score = 0.0f;
+                for (int d = 0; d < head_dim; ++d) {
+                    const int q_idx = (b * num_heads + h) * head_dim + d;
+                    const int k_idx = ((block_id * block_size + block_token) * num_kv_heads + kv_head) * head_dim + d;
+                    score += half_to_float(query[q_idx]) * half_to_float(key[k_idx]);
+                }
+                max_score = std::max(max_score, score * scale);
+            }
+
+            float denom = 0.0f;
+            for (int d = 0; d < head_dim; ++d) {
+                float accum = 0.0f;
+                for (int token = 0; token < seq_len; ++token) {
+                    const int block_col = std::min(token / block_size, max_blocks_per_query - 1);
+                    const int block_id = block_table[b * max_blocks_per_query + block_col];
+                    const int block_token = token % block_size;
+                    float score = 0.0f;
+                    for (int kd = 0; kd < head_dim; ++kd) {
+                        const int q_idx = (b * num_heads + h) * head_dim + kd;
+                        const int k_idx =
+                            ((block_id * block_size + block_token) * num_kv_heads + kv_head) * head_dim + kd;
+                        score += half_to_float(query[q_idx]) * half_to_float(key[k_idx]);
+                    }
+                    const float weight = std::exp(score * scale - max_score);
+                    if (d == 0) {
+                        denom += weight;
+                    }
+                    const int v_idx = ((block_id * block_size + block_token) * num_kv_heads + kv_head) * head_dim + d;
+                    accum += weight * half_to_float(value[v_idx]);
+                }
+                const int out_idx = (b * num_heads + h) * head_dim + d;
+                out[out_idx] = float_to_half(accum / denom);
+            }
+        }
+    }
+#endif
+}
+
+#else
+
+#include "intrinsic.h"
+
+#define PTO_PA_NO_GLOBAL_ENTRY
+#include "../kernel/pa_entry.cce"
+#undef PTO_PA_NO_GLOBAL_ENTRY
+
+static __aicore__ __attribute__((always_inline)) __gm__ uint8_t *tensor_data(__gm__ int64_t *args, int idx) {
+    __gm__ Tensor *tensor = reinterpret_cast<__gm__ Tensor *>(args[idx]);
+    return reinterpret_cast<__gm__ uint8_t *>(tensor->buffer.addr);
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ uint8_t *q_gm = tensor_data(args, 0);
+    __gm__ uint8_t *k_gm = tensor_data(args, 1);
+    __gm__ uint8_t *v_gm = tensor_data(args, 2);
+    __gm__ uint8_t *block_tables_gm = tensor_data(args, 3);
+    __gm__ uint8_t *o_gm = tensor_data(args, 4);
+    __gm__ uint8_t *s_gm = tensor_data(args, 5);
+    __gm__ uint8_t *p_gm = tensor_data(args, 6);
+    __gm__ uint8_t *o_tmp_gm = tensor_data(args, 7);
+    __gm__ uint8_t *go_gm = tensor_data(args, 8);
+    __gm__ uint8_t *o_core_tmp_gm = tensor_data(args, 9);
+    __gm__ uint8_t *l_gm = tensor_data(args, 10);
+    __gm__ uint8_t *gm_k16 = tensor_data(args, 11);
+    __gm__ uint8_t *gm_v16 = tensor_data(args, 12);
+    __gm__ uint8_t *tiling_para_gm = tensor_data(args, 13);
+    __gm__ uint8_t *null_gm = tensor_data(args, 14);
+    const uint32_t pto_block_idx = static_cast<uint32_t>(get_block_idx(args));
+    const uint32_t pto_block_num = static_cast<uint32_t>(get_block_num(args));
+#ifdef __DAV_C220_VEC__
+    const uint32_t pto_sub_block_id = static_cast<uint32_t>(get_sub_block_id(args));
+#else
+    const uint32_t pto_sub_block_id = 0;
+#endif
+
+    paged_attention_mask_body(
+        nullptr, pto_block_idx, pto_block_num, pto_sub_block_id, q_gm, k_gm, v_gm, block_tables_gm, null_gm, null_gm,
+        null_gm, null_gm, null_gm, null_gm, null_gm, null_gm, null_gm, o_gm, s_gm, p_gm, o_tmp_gm, go_gm, o_core_tmp_gm,
+        l_gm, gm_k16, gm_v16, tiling_para_gm
+    );
+}
+
+#endif
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_entry.cce b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_entry.cce
new file mode 100644
index 000000000..edc7dd81d
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_entry.cce
@@ -0,0 +1,172 @@
+#ifdef __CCE_KT_TEST__
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+
+#ifndef PTO_PA_CONTEXT_PARAMS
+#define PTO_PA_CONTEXT_PARAMS
+#endif
+#ifndef PTO_PA_CONTEXT_ARGS
+#define PTO_PA_CONTEXT_ARGS
+#endif
+
+#include "pa_kernel.cce"
+
+static __aicore__ __attribute__((always_inline)) void paged_attention_mask_body(
+    __gm__ uint8_t *__restrict__ sync,
+    uint32_t pto_block_idx,
+    uint32_t pto_block_num,
+    uint32_t pto_sub_block_id,
+    __gm__ uint8_t *__restrict__ q_gm,
+    __gm__ uint8_t *__restrict__ k_gm,
+    __gm__ uint8_t *__restrict__ v_gm,
+    __gm__ uint8_t *__restrict__ block_tables_gm,
+    __gm__ uint8_t *__restrict__ mask_gm,
+    __gm__ uint8_t *__restrict__ deq_scale1_gm,
+    __gm__ uint8_t *__restrict__ offset1_gm,
+    __gm__ uint8_t *__restrict__ deq_scale2_gm,
+    __gm__ uint8_t *__restrict__ offset2_gm,
+    __gm__ uint8_t *__restrict__ razorOffset,
+    __gm__ uint8_t *__restrict__ scale_gm,
+    __gm__ uint8_t *__restrict__ logN_gm,
+    __gm__ uint8_t *__restrict__ eye_gm,
+    __gm__ uint8_t *__restrict__ o_gm,
+    __gm__ uint8_t *__restrict__ s_gm,
+    __gm__ uint8_t *__restrict__ p_gm,
+    __gm__ uint8_t *__restrict__ o_tmp_gm,
+    __gm__ uint8_t *__restrict__ go_gm,
+    __gm__ uint8_t *__restrict__ o_core_tmp_gm,
+    __gm__ uint8_t *__restrict__ l_gm,
+    __gm__ uint8_t *__restrict__ gm_k16,
+    __gm__ uint8_t *__restrict__ gm_v16,
+    __gm__ uint8_t *__restrict__ tiling_para_gm)
+{
+    if (sync != nullptr) {
+        set_ffts_base_addr((unsigned long)sync);
+    }
+    set_atomic_none();
+    set_mask_norm();
+#ifdef __DAV_C220_VEC__
+    set_vector_mask((uint64_t)-1, (uint64_t)-1);
+#elif __DAV_C220_CUBE__
+    set_padding(0);
+    set_nd_para(1ULL);
+#endif
+    const uint32_t tiling_key_val = (uint32_t)(*((__gm__ int32_t *)tiling_para_gm + AtbOps::TILING_KEY_ID));
+    uint32_t prefill_batch_size = (uint32_t)(*((__gm__ int32_t *)tiling_para_gm + TILING_PREFILL_BS));
+    uint32_t decoder_batch_size = (uint32_t)(*((__gm__ int32_t *)tiling_para_gm + TILING_DECODER_BS));
+    if (tiling_key_val == 0) { // fp16 BN
+#ifdef __DAV_C220_CUBE__
+        UnpadAttentionDecoderAic<false, TilingKeyType::TILING_HALF_DATA, half, half, half> pa_aic_fp16(prefill_batch_size, decoder_batch_size);
+        pa_aic_fp16.SetArgs(sync, q_gm, k_gm, v_gm, block_tables_gm, o_gm, s_gm, p_gm, o_tmp_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, pto_block_idx, pto_block_num);
+        pa_aic_fp16.Run();
+#elif __DAV_C220_VEC__
+        UnpadAttentionDecoderAiv<TilingKeyType::TILING_HALF_DATA, half, half> pa_aiv(prefill_batch_size, decoder_batch_size);
+        pa_aiv.SetArgs(sync, k_gm, v_gm, deq_scale1_gm, offset1_gm, deq_scale2_gm, offset2_gm, block_tables_gm,
+            mask_gm, o_gm, s_gm, p_gm, o_tmp_gm, go_gm, o_core_tmp_gm, l_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, logN_gm, pto_block_idx, pto_block_num, pto_sub_block_id);
+        pa_aiv.Run();
+#endif
+    } else if (tiling_key_val == 1) { // bf16 BN
+#ifdef __DAV_C220_CUBE__
+        UnpadAttentionDecoderAic<false, TilingKeyType::TILING_BF16_DATA, __bf16, __bf16, __bf16> pa_aic_bf16(prefill_batch_size, decoder_batch_size);
+        pa_aic_bf16.SetArgs(sync, q_gm, k_gm, v_gm, block_tables_gm, o_gm, s_gm, p_gm, o_tmp_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, pto_block_idx, pto_block_num);
+        pa_aic_bf16.Run();
+#elif __DAV_C220_VEC__
+        UnpadAttentionDecoderAiv<TilingKeyType::TILING_BF16_DATA, __bf16, __bf16> pa_aiv(prefill_batch_size, decoder_batch_size);
+        pa_aiv.SetArgs(sync, k_gm, v_gm, deq_scale1_gm, offset1_gm, deq_scale2_gm, offset2_gm, block_tables_gm,
+            mask_gm, o_gm, s_gm, p_gm, o_tmp_gm, go_gm, o_core_tmp_gm, l_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, logN_gm, pto_block_idx, pto_block_num, pto_sub_block_id);
+        pa_aiv.Run();
+#endif
+    } else if (tiling_key_val == 16) { // fp16 BNS split-kv
+#ifdef __DAV_C220_CUBE__
+        UnpadAttentionDecoderAic<true, TilingKeyType::TILING_HALF_DATA, half, half, half> pa_aic_fp16(prefill_batch_size, decoder_batch_size);
+        pa_aic_fp16.SetArgs(sync, q_gm, k_gm, v_gm, block_tables_gm, o_gm, s_gm, p_gm, o_tmp_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, pto_block_idx, pto_block_num);
+        pa_aic_fp16.Run();
+#elif __DAV_C220_VEC__
+        UnpadAttentionDecoderAiv<TilingKeyType::TILING_HALF_DATA, half, half, true> pa_aiv(prefill_batch_size, decoder_batch_size);
+        pa_aiv.SetArgs(sync, k_gm, v_gm, deq_scale1_gm, offset1_gm, deq_scale2_gm, offset2_gm, block_tables_gm,
+            mask_gm, o_gm, s_gm, p_gm, o_tmp_gm, go_gm, o_core_tmp_gm, l_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, logN_gm, pto_block_idx, pto_block_num, pto_sub_block_id);
+        pa_aiv.Run();
+#endif
+    } else if (tiling_key_val == 17) { // bf16 BNS split-kv
+#ifdef __DAV_C220_CUBE__
+        UnpadAttentionDecoderAic<true, TilingKeyType::TILING_BF16_DATA, __bf16, __bf16, __bf16> pa_aic_bf16(prefill_batch_size, decoder_batch_size);
+        pa_aic_bf16.SetArgs(sync, q_gm, k_gm, v_gm, block_tables_gm, o_gm, s_gm, p_gm, o_tmp_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, pto_block_idx, pto_block_num);
+        pa_aic_bf16.Run();
+#elif __DAV_C220_VEC__
+        UnpadAttentionDecoderAiv<TilingKeyType::TILING_BF16_DATA, __bf16, __bf16, true> pa_aiv(prefill_batch_size, decoder_batch_size);
+        pa_aiv.SetArgs(sync, k_gm, v_gm, deq_scale1_gm, offset1_gm, deq_scale2_gm, offset2_gm, block_tables_gm,
+            mask_gm, o_gm, s_gm, p_gm, o_tmp_gm, go_gm, o_core_tmp_gm, l_gm, gm_k16, gm_v16, tiling_para_gm, razorOffset, logN_gm, pto_block_idx, pto_block_num, pto_sub_block_id);
+        pa_aiv.Run();
+#endif
+    }
+    pipe_barrier(PIPE_ALL);
+}
+
+#ifndef PTO_PA_NO_GLOBAL_ENTRY
+extern "C" __global__ __aicore__ void paged_attention_mask(
+    __gm__ uint8_t *__restrict__ sync,
+    __gm__ uint8_t *__restrict__ q_gm,
+    __gm__ uint8_t *__restrict__ k_gm,
+    __gm__ uint8_t *__restrict__ v_gm,
+    __gm__ uint8_t *__restrict__ block_tables_gm,
+    __gm__ uint8_t *__restrict__ mask_gm,
+    __gm__ uint8_t *__restrict__ deq_scale1_gm,
+    __gm__ uint8_t *__restrict__ offset1_gm,
+    __gm__ uint8_t *__restrict__ deq_scale2_gm,
+    __gm__ uint8_t *__restrict__ offset2_gm,
+    __gm__ uint8_t *__restrict__ razorOffset,
+    __gm__ uint8_t *__restrict__ scale_gm,
+    __gm__ uint8_t *__restrict__ logN_gm,
+    __gm__ uint8_t *__restrict__ eye_gm,
+    __gm__ uint8_t *__restrict__ o_gm,
+    __gm__ uint8_t *__restrict__ s_gm,
+    __gm__ uint8_t *__restrict__ p_gm,
+    __gm__ uint8_t *__restrict__ o_tmp_gm,
+    __gm__ uint8_t *__restrict__ go_gm,
+    __gm__ uint8_t *__restrict__ o_core_tmp_gm,
+    __gm__ uint8_t *__restrict__ l_gm,
+    __gm__ uint8_t *__restrict__ gm_k16,
+    __gm__ uint8_t *__restrict__ gm_v16,
+    __gm__ uint8_t *__restrict__ tiling_para_gm)
+{
+    const uint32_t pto_block_idx = static_cast<uint32_t>(get_block_idx());
+    const uint32_t pto_block_num = static_cast<uint32_t>(get_block_num());
+#ifdef __DAV_C220_VEC__
+    const uint32_t pto_sub_block_id = static_cast<uint32_t>(get_subblockid());
+#else
+    const uint32_t pto_sub_block_id = 0;
+#endif
+
+    paged_attention_mask_body(
+        sync,
+        pto_block_idx,
+        pto_block_num,
+        pto_sub_block_id,
+        q_gm,
+        k_gm,
+        v_gm,
+        block_tables_gm,
+        mask_gm,
+        deq_scale1_gm,
+        offset1_gm,
+        deq_scale2_gm,
+        offset2_gm,
+        razorOffset,
+        scale_gm,
+        logN_gm,
+        eye_gm,
+        o_gm,
+        s_gm,
+        p_gm,
+        o_tmp_gm,
+        go_gm,
+        o_core_tmp_gm,
+        l_gm,
+        gm_k16,
+        gm_v16,
+        tiling_para_gm
+    );
+}
+#endif
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_kernel.cce b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_kernel.cce
new file mode 100644
index 000000000..f53e12262
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_kernel.cce
@@ -0,0 +1,2946 @@
+#include <limits>
+#include <type_traits>
+#include "../tiling/pa_tiling_struct.h"
+
+#ifndef __force_inline__
+#define __force_inline__ inline __attribute__((always_inline))
+#endif
+
+
+template <uint32_t ALIGN, typename T = uint32_t>
+inline __aicore__ T RoundUp(const T val)
+{
+    static_assert(ALIGN != 0, "align must not be zero");
+    static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
+    T align = ALIGN;
+    if (val + align - 1 < val) {
+        return val;
+    }
+    return (val + align - 1) / align * align;
+}
+
+template <typename T>
+inline __aicore__ T RoundUp(const T val, const T align)
+{
+    static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
+    if (align == 0 || val + align - 1 < val) {
+        return val;
+    }
+    return (val + align - 1) / align * align;
+}
+
+template <uint32_t DIVISOR, typename T = uint32_t>
+inline __aicore__ T CeilDiv(const T dividend)
+{
+    static_assert(DIVISOR != 0, "divisor must not be zero");
+    static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
+    T divisor = DIVISOR;
+    if (dividend + divisor - 1 < dividend) {
+        return dividend;
+    }
+    return (dividend + divisor - 1) / divisor;
+}
+
+template <typename T>
+constexpr T T_MAX = std::numeric_limits<T>::max();
+
+template <typename T>
+inline __aicore__ T CeilDiv(const T dividend, const T divisor)
+{
+    static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
+    if (divisor == 0 || dividend + divisor - 1 < dividend) {
+        return T_MAX<T>;
+    }
+    return (dividend + divisor - 1) / divisor;
+}
+
+constexpr Order_t ORDER_ONLY_VALUE = ONLY_VALUE;
+
+template <typename DTypeIn, typename DTypeOut>
+__aicore__ inline void conv_v(__ubuf__ DTypeOut *dst, __ubuf__ DTypeIn *src, uint8_t repeat, uint16_t dstBlockStride,
+    uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride)
+{
+    if constexpr (std::is_same<DTypeIn, float>::value && std::is_same<DTypeOut, __bf16>::value) {
+        vconv_f322bf16r((__ubuf__ __bf16 *)dst, (__ubuf__ float *)src, repeat, dstBlockStride, srcBlockStride,
+            dstRepeatStride, srcRepeatStride);
+    } else if constexpr (std::is_same<DTypeIn, float>::value && std::is_same<DTypeOut, half>::value) {
+        vconv_f322f16((__ubuf__ half *)dst, (__ubuf__ float *)src, repeat, dstBlockStride, srcBlockStride,
+            dstRepeatStride, srcRepeatStride);
+    } else if constexpr (std::is_same<DTypeIn, half>::value && std::is_same<DTypeOut, float>::value) {
+        vconv_f162f32((__ubuf__ float *)dst, (__ubuf__ half *)src, repeat, dstBlockStride, srcBlockStride,
+            dstRepeatStride, srcRepeatStride);
+    } else if constexpr (std::is_same<DTypeIn, __bf16>::value && std::is_same<DTypeOut, float>::value) {
+        vconv_bf162f32((__ubuf__ float *)dst, (__ubuf__ __bf16 *)src, repeat, dstBlockStride, srcBlockStride,
+            dstRepeatStride, srcRepeatStride);
+    } else {
+        static_assert(!std::is_same<DTypeIn, DTypeIn>::value, "Unsupported conv_v dtype combination.");
+    }
+}
+
+template <pipe_t pipe, uint8_t mode>
+__aicore__ inline void FftsCrossCoreSync(uint16_t flagId)
+{
+    uint64_t config = 1ULL | (static_cast<uint64_t>(mode) << 4) | (static_cast<uint64_t>(flagId) << 8);
+    ffts_cross_core_sync(pipe, config);
+}
+
+inline __aicore__ void DdrBarrierBeforeFfts()
+{
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+    pipe_barrier(PIPE_ALL);
+#endif
+}
+
+constexpr uint32_t PA_L1L0_BLOCK_BYTES = 32;
+constexpr uint32_t PA_GM_ND2NZ_STRIDE_LIMIT = 65536;
+
+template <typename DataType>
+__aicore__ inline void pa_gm_to_l1_nd_nd(__cbuf__ DataType *l1, __gm__ DataType *gm, uint32_t nTileActual,
+    uint32_t nTileCeil, uint32_t nVal, uint32_t dTileActual, uint32_t dTileCeil, uint32_t dVal)
+{
+    (void)nVal;
+    (void)dTileCeil;
+    static constexpr uint32_t BLOCK_SIZE = PA_L1L0_BLOCK_BYTES / sizeof(DataType);
+    copy_gm_to_cbuf(l1, gm, 0, 1, CeilDiv<BLOCK_SIZE>(nTileActual * dTileActual), 0, 0, PAD_NONE);
+}
+
+template <typename DataType>
+__aicore__ inline void pa_gm_to_l1_nd_nz(__cbuf__ DataType *l1, __gm__ DataType *gm, uint32_t nTileActual,
+    uint32_t nTileCeil, uint32_t nVal, uint32_t dTileActual, uint32_t dTileCeil, uint32_t dVal)
+{
+    (void)nVal;
+    (void)dTileCeil;
+    static constexpr uint32_t BLOCK_SIZE = PA_L1L0_BLOCK_BYTES / sizeof(DataType);
+    if (dVal < PA_GM_ND2NZ_STRIDE_LIMIT) {
+        if constexpr (sizeof(DataType) == 4) {
+            copy_gm_to_cbuf_multi_nd2nz_b32s(l1, gm, 0, 1, nTileActual, dTileActual, 0, dVal, nTileCeil, 1, 0);
+        } else {
+            copy_gm_to_cbuf_multi_nd2nz_b16(l1, gm, 0, 1, nTileActual, dTileActual, 0, dVal, nTileCeil, 1, 0);
+        }
+    } else {
+        for (uint32_t i = 0; i < nTileActual; i++) {
+            if constexpr (sizeof(DataType) == 4) {
+                copy_gm_to_cbuf_multi_nd2nz_b32s(l1 + i * BLOCK_SIZE, gm + i * dVal, 0, 1, 1, dTileActual, 0, 0,
+                    nTileCeil, 0, 0);
+            } else {
+                copy_gm_to_cbuf_multi_nd2nz_b16(l1 + i * BLOCK_SIZE, gm + i * dVal, 0, 1, 1, dTileActual, 0, 0,
+                    nTileCeil, 0, 0);
+            }
+        }
+    }
+}
+
+template <typename DataType, bool IsTranspose>
+__aicore__ inline void pa_l1_to_l0_a_vector(__ca__ DataType *l0, __cbuf__ DataType *l1, uint32_t mTileCeil,
+    uint32_t kPartCeil, uint32_t mSrcStride, uint32_t kSrcStride, uint32_t mDstStride, uint32_t kDstStride)
+{
+    (void)mTileCeil;
+    (void)mSrcStride;
+    (void)mDstStride;
+    if constexpr (IsTranspose) {
+        load_cbuf_to_ca(l0, l1, 0, kPartCeil, kSrcStride, kDstStride, 0, 1, (addr_cal_mode_t)0);
+    } else {
+        load_cbuf_to_ca(l0, l1, 0, kPartCeil, kSrcStride, kDstStride, 0, 0, (addr_cal_mode_t)0);
+    }
+}
+
+template <typename DataType, bool IsTranspose>
+__aicore__ inline void pa_l1_to_l0_b_vector(__cb__ DataType *l0, __cbuf__ DataType *l1, uint32_t nTileCeil,
+    uint32_t kPartCeil, uint32_t nSrcStride, uint32_t kSrcStride, uint32_t nDstStride, uint32_t kDstStride)
+{
+    (void)nTileCeil;
+    (void)nSrcStride;
+    (void)nDstStride;
+    if constexpr (IsTranspose) {
+        load_cbuf_to_cb(l0, l1, 0, kPartCeil, kSrcStride, kDstStride, 0, 1, (addr_cal_mode_t)0);
+    } else {
+        load_cbuf_to_cb(l0, l1, 0, kPartCeil, kSrcStride, kDstStride, 0, 0, (addr_cal_mode_t)0);
+    }
+}
+
+__aicore__ inline void pa_l0c_to_gm_nd_fp32(__gm__ float *gm, __cc__ float *cc, uint32_t mTileActual,
+    uint32_t nTileActual, uint32_t srcStride, uint32_t dstStride, uint8_t unitFlag = 0)
+{
+    set_nd_para((uint64_t)1);
+    pipe_barrier(PIPE_FIX);
+    copy_matrix_cc_to_gm(gm, cc, 0, nTileActual, mTileActual, dstStride, srcStride, unitFlag, QuantMode_t::NoQuant, 0,
+        false, true);
+}
+
+struct LoadData2dTransposeParams {
+    uint16_t startIndex{0};
+    uint16_t repeatTimes{0};
+    uint16_t srcStride{0};
+    uint16_t dstGap{0};
+    uint16_t dstFracGap{0};
+};
+
+// define common const value
+
+// FFTS Flag
+constexpr int32_t QK_READY = 0;
+constexpr int32_t SOFTMAX_READY = 1;
+constexpr int32_t UPDATE_READY = 2;
+constexpr int32_t QK_READY_DECODER = 3;
+constexpr int32_t SOFTMAX_READY_DECODER = 4;
+constexpr int32_t UPDATE_READY_DECODER = 5;
+constexpr int32_t QK_READY_STAGE2 = 6;
+constexpr int32_t SOFTMAX_READY_STAGE2 = 7;
+constexpr int32_t UPDATE_READY_STAGE2 = 8;
+constexpr uint32_t VEC_DEQ_K0_READY = 9;
+constexpr uint32_t VEC_DEQ_K1_READY = 10;
+constexpr uint32_t VEC_DEQ_V0_READY = 11;
+constexpr uint32_t VEC_DEQ_V1_READY = 12;
+constexpr int32_t REDUCE_READY_DECODER = 13;
+
+
+constexpr int32_t BLOCK_SIZE = 16;
+constexpr int32_t BLOCK_SIZE_32 = 32;
+constexpr int64_t TMP_SIZE = 65536;              // 256 * 256
+constexpr int32_t BIT_SHIFT = 8;
+
+const int32_t TILING_BATCH = 0;
+const int32_t TILING_NUMHEADS = 1;
+const int32_t TILING_HEADDIM = 2;
+const int32_t TILING_NUMBLOKS = 3;
+const int32_t TILING_BLOCKSIZE = 4;
+const int32_t TILING_MAXBLOCKS = 5;
+const int32_t TILING_TOR = 6;
+const int32_t TILING_KVHEADS = 7;
+const int32_t TILING_FORMER_BATCH = 8;
+const int32_t TILING_FORMER_HEAD = 9;
+const int32_t TILING_TAIL_BATCH = 10;
+const int32_t TILING_TAIL_HEAD = 11;
+const int32_t TILING_HEADNUM_MOVE = 12;
+const int32_t TILING_MASK_MAX_LEN = 13;
+const int32_t TILING_BATCH_STRIDE = 14;
+const int32_t TILING_HEAD_STRIDE = 15;
+const int32_t TILING_KEY = 16;
+const int32_t TILING_HEADSIZE = 17;
+const int32_t TILING_PARASIZE = 18;
+const int32_t TILING_GROUPNUM = 19;
+const int32_t TILING_FORMER_GROUP_MOVE = 20;
+const int32_t TILING_TAIL_GROUP_MOVE = 21;
+const int32_t TILING_MAX_KVSEQLEN = 22;
+const int32_t TILING_KVSPLIT = 23;
+const int32_t TILING_KVCORENUM = 24;
+const int32_t TILING_BLOCKSIZE_CALC = 25;
+const int32_t TILING_TOTAL_BLOCK_NUM = 26;
+const int32_t TILING_PREFILL_BS = 27;
+const int32_t TILING_DECODER_BS = 28;
+const int32_t TILING_HEADDIM_V = 29;
+const int32_t TILING_MODCOEF = 30;
+const int32_t TILING_DIVCOEF = 31;
+const int32_t TILING_QHEADORIGINAL = 32;
+const int32_t TILING_COMPRESSHEAD = 33;
+const int32_t TILING_QUANTYPE = 34;
+const int32_t TILING_DATA_SHAPE_TYPE = 35;
+const int32_t TILING_SCALETYPE = 36;
+const int32_t TILING_MASK_TYPE_ND = 37;
+const int32_t TILING_HEADDIM_K_SPLIT = 38;
+const int32_t TILING_HEADDIM_V_SPLIT = 39;
+const int32_t TILING_HEADDIM_V_SPLIT_VECTOR_FORMER = 40;
+const int32_t TILING_HEADDIM_V_SPLIT_VECTOR_TAIL = 41;
+const int32_t BLOCKSIZE_CALC_256 = 256;
+constexpr uint32_t CONST_16 = 16;
+constexpr uint32_t KV_SEQ_STEP = 16;
+constexpr uint32_t MAX_NUMEL_INST_B8 = 255 * 256;
+constexpr uint32_t MAX_NUMEL_INST_B16 = 255 * 128;
+constexpr uint32_t MAX_NUMEL_INST_B32 = 255 * 64;
+
+using TilingKeyType = AtbOps::TilingKeyType;
+
+using DataShapeType = AtbOps::DataShapeType;
+
+using CompressType = AtbOps::CompressType;
+
+using PagedAttnVariant = AtbOps::PagedAttnVariant;
+
+template<TilingKeyType tilingKeyType>
+struct AttentionType
+{
+};
+
+
+template<>
+struct AttentionType<TilingKeyType::TILING_HALF_DATA>
+{
+    using mm1OutputType = float;
+    using mm1CopyType = float;
+    using mmBiasType = float;
+    using mmScaleType = float;
+    using mm2OutputType = float;
+    using mm2CopyType = float;
+};
+
+template<>
+struct AttentionType<TilingKeyType::TILING_BF16_DATA>
+{
+    using mm1OutputType = float;
+    using mm1CopyType = float;
+    using mmBiasType = float;
+    using mmScaleType = float;
+    using mm2OutputType = float;
+    using mm2CopyType = float;
+};
+
+
+#ifdef __DAV_C220_CUBE__
+constexpr int32_t L0AB_HALF_BUF_SIZE = 16384;    // 128 * 128 = 16K
+constexpr int32_t L0AB_UINT8_BUF_SIZE = 16384 * 2;
+constexpr int32_t L0C_FLOAT_BUF_SIZE = 16384;
+constexpr int32_t L0C_UINT8_BUF_SIZE = 131072;
+constexpr int32_t CUBE_MATRIX_SIZE = 256;        // 16 * 16
+constexpr int64_t L0AB_UINT8_BLOCK_SIZE = 32768; // 128 * 128 * 2B
+constexpr int32_t L1_HALF_BUF_SIZE = 65536;  // 256 * 256
+constexpr int32_t L1_P_UINT8_BUF_SIZE = 32768;
+
+constexpr int32_t TMP_SIZE_DECODER = 32768;
+
+constexpr int32_t L1_HALF_BUF_SIZE_DECODER = 16384;
+constexpr int32_t L1_UINT8_BUF_SIZE_DECODER = 16384 * 2;
+constexpr int32_t L1_KV_HALF_BUF_SIZE = 65536;// 2* 128 * 256
+constexpr int32_t L1_KV_UINT8_BUF_SIZE = 65536 * 2;
+constexpr uint64_t L1_E_UINT8_SIZE = 1024;  // 32 * 32 * 1B
+constexpr uint64_t L1_SCALE_UINT8_SIZE = 4096;  // uint64 256 * 8 * 2head
+constexpr uint64_t L1_SCALE_UINT64_SIZE = L1_SCALE_UINT8_SIZE / 8;
+constexpr uint64_t L1_OFFSET_UINT8_SIZE = 2048;  // int32 256 * 4 8 2head
+constexpr uint64_t L1_OFFSET_INT32_SIZE = L1_OFFSET_UINT8_SIZE / 4;
+
+//DeQuant
+constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN = 32768; // 32 KB
+constexpr uint32_t L0C_PINGPONG_BUFFER_LEN_INT32 = 16384; // 65536 / 4
+constexpr uint32_t CUBE_MATRIX_SIZE_512 = 16 * 32;       // 16 * 23
+constexpr int32_t BLOCK_SIZE_16 = 16;
+constexpr uint64_t CONST_4 = 4;
+constexpr uint64_t CONST_32 = 32;
+constexpr uint64_t CONST_64 = 64;
+constexpr uint64_t CONST_128 = 128;
+constexpr uint32_t EMBED_SPLIT = 256;
+constexpr uint32_t ROUND_EMBED_SPLIT = 256;
+
+#elif __DAV_C220_VEC__
+constexpr uint32_t HALF_VECTOR_SIZE = 128;
+constexpr uint32_t UB_ALIGN_BYTE = 32;
+constexpr int32_t FLOAT_VECTOR_SIZE = 64;
+constexpr int64_t UB_UINT8_BLOCK_SIZE_MLA = 16384;      // 96 * 128 * 2B // prefill/decoder diff
+constexpr int64_t UB_UINT8_BLOCK_SIZE_NORM = 24576;
+constexpr int64_t UB_UINT8_LINE_SIZE = 512;         // 64 * 4 B; 2x headroom to avoid UB overlap.
+constexpr int64_t UB_HALF_LINE_SIZE = 256;          // UB_FLOAT_LINE_SIZE * 2
+constexpr int64_t UB_FLOAT_LINE_SIZE = 128;         // 64 floats; 2x headroom to avoid UB overlap.
+
+constexpr int64_t PRE_UB_UINT8_BLOCK_SIZE = 16384;  // 64 * 128 * 2B
+constexpr int32_t VECTOR_SIZE = 128;                // prefill
+constexpr int32_t FLOAT_BLOCK_SIZE = 8;
+constexpr int32_t UB_HALF_BUF_SIZE = 8192;          // 64 * 128
+constexpr int32_t TMP_SIZE_DECODER = 32768;
+constexpr int32_t STAGE2_UB_UINT8_BLOCK_SIZE = 8192;
+constexpr int32_t CUBE_MATRIX_SIZE = 256;
+constexpr uint32_t MAX_UB_SIZE = 196608; // 192 * 1024
+constexpr uint32_t EMBED_SPLIT_SM = 128;
+constexpr uint32_t ROUND_EMBED_SPLIT_SM = 128;
+
+__aicore__ __attribute__((always_inline)) void inline __set_mask(int32_t len)
+{
+    uint64_t mask = 0;
+    uint64_t one = 1;
+    uint64_t temp = len % FLOAT_VECTOR_SIZE;
+    for (int64_t i = 0; i < temp; i++) {
+        mask |= one << i;
+    }
+
+    if (len == VECTOR_SIZE) {
+        set_vector_mask((uint64_t)-1, (uint64_t)-1);
+    } else if (len >= FLOAT_VECTOR_SIZE) {
+        set_vector_mask(mask, (uint64_t)-1);
+    } else {
+        set_vector_mask(0x0, mask);
+    }
+}
+
+template<PagedAttnVariant pagedAttnVariant>
+struct UbufAlloc
+{
+};
+
+
+template<>
+struct UbufAlloc<PagedAttnVariant::DEFAULT>
+{
+    const uint32_t ls32_ubuf_offset = 0;
+    const uint32_t lp_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t lp32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t mask_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t lo_ubuf_offset = 3 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t mask32_ubuf_offset = 3 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t ls16_ubuf_offset = 3 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t lm32_ubuf_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM;
+    const uint32_t hm32_ubuf_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 1 * UB_UINT8_LINE_SIZE;
+    const uint32_t pm32_ubuf_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 2 * UB_UINT8_LINE_SIZE;
+    const uint32_t pm32_ubuf_stage2_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 3 * UB_UINT8_LINE_SIZE;
+    const uint32_t descale1_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 4 * UB_UINT8_LINE_SIZE;
+    const uint32_t descale2_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 5 * UB_UINT8_LINE_SIZE;
+    const uint32_t dm32_ubuf_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 6 * UB_UINT8_LINE_SIZE;
+    const uint32_t dm32_ubuf_stage2_offset = 5 * UB_UINT8_BLOCK_SIZE_NORM + 7 * UB_UINT8_LINE_SIZE;
+    const uint32_t ll_ubuf_offset = MAX_UB_SIZE - (UB_UINT8_LINE_SIZE + UB_UINT8_LINE_SIZE * 4); // 2 * UB_UINT8_LINE_SIZE
+    const uint32_t ll_ubuf_stage2_offset = MAX_UB_SIZE- UB_UINT8_LINE_SIZE * 2;      // 2 * UB_UINT8_LINE_SIZE
+    const uint32_t gm32_ubuf_offset = dm32_ubuf_stage2_offset + 3 * UB_UINT8_LINE_SIZE; // 2 * UB_UINT8_LINE_SIZE
+    const uint32_t gl_ubuf_offset = gm32_ubuf_offset + 2 * UB_UINT8_LINE_SIZE;          // 3 * UB_UINT8_LINE_SIZE
+    const uint32_t gl32_ubuf_offset = gm32_ubuf_offset + 2 * UB_UINT8_LINE_SIZE;        // 3 * UB_UINT8_LINE_SIZE
+    const uint32_t go_ubuf_offset = gl_ubuf_offset + 3 * UB_UINT8_LINE_SIZE;            // 16K
+    const uint32_t go32_ubuf_offset = gl_ubuf_offset + 3 * UB_UINT8_LINE_SIZE;          // 16K
+    const uint32_t tv32_ubuf_offset = go32_ubuf_offset + 2 * UB_UINT8_BLOCK_SIZE_NORM;
+};
+#endif
+
+#ifdef __DAV_C220_CUBE__
+template <bool SplitKV = false, TilingKeyType tilingKeyType = TilingKeyType::TILING_HALF_DATA, typename IN_DTYPE = half,  typename OUT_DTYPE = half, typename IN_KVDTYPE = half, PagedAttnVariant pagedAttnVariant = PagedAttnVariant::DEFAULT, DataShapeType dataShapeType = DataShapeType::BSND, CompressType compressType = CompressType::COMPRESS_TYPE_UNDEFINED, bool SplitBlock = false>
+class UnpadAttentionDecoderAic {
+    // define dtype
+    using mm1OutputType = typename AttentionType<tilingKeyType>::mm1OutputType;
+    using mm1CopyType = typename AttentionType<tilingKeyType>::mm1CopyType;
+    using mmBiasType = typename AttentionType<tilingKeyType>::mmBiasType;
+    using mmScaleType = typename AttentionType<tilingKeyType>::mmScaleType;
+    using mm2OutputType = typename AttentionType<tilingKeyType>::mm2OutputType;
+    using mm2CopyType = typename AttentionType<tilingKeyType>::mm2CopyType;
+    static constexpr uint32_t T_CUBE_MATRIX_SIZE = CUBE_MATRIX_SIZE_512 / sizeof(IN_DTYPE);
+    static constexpr uint32_t T_BLOCK_SIZE =  BLOCK_SIZE_32 / sizeof(IN_DTYPE);
+    static constexpr uint32_t T_BLOCK_OFFSET = 2 / sizeof(IN_DTYPE);
+
+public:
+    __aicore__ __attribute__((always_inline)) inline UnpadAttentionDecoderAic(uint32_t prefill_batch_size, uint32_t decoder_batch_size) {
+        prefill_batch_size_ = prefill_batch_size;
+        decoder_batch_size_ = decoder_batch_size;
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void SetArgs(
+        __gm__ uint8_t *__restrict__ sync,
+        __gm__ uint8_t *__restrict__ q_in_gm,
+        __gm__ uint8_t *__restrict__ k_in_gm,
+        __gm__ uint8_t *__restrict__ v_in_gm,
+        __gm__ uint8_t *__restrict__ block_tables_in_gm,
+        __gm__ uint8_t *__restrict__ o_out_gm,
+        __gm__ uint8_t *__restrict__ s_out_gm,
+        __gm__ uint8_t *__restrict__ p_out_gm,
+        __gm__ uint8_t *__restrict__ o_temp_gm,
+        __gm__ uint8_t* __restrict__ gm_k16,
+        __gm__ uint8_t* __restrict__ gm_v16,
+        __gm__ uint8_t *__restrict__ tiling_para_gm,
+        __gm__ uint8_t *__restrict__ razorOffset,
+        uint32_t pto_block_idx,
+        uint32_t pto_block_num)
+    {
+        if (sync != nullptr) {
+            set_ffts_base_addr((uint64_t)sync);
+        }
+        set_padding(0);
+        set_atomic_none();
+        set_nd_para(1ULL);
+        set_mask_norm();
+
+        q_gm = reinterpret_cast<__gm__ IN_DTYPE *>(q_in_gm);
+        k_gm = reinterpret_cast<__gm__ IN_KVDTYPE *>(k_in_gm);
+        v_gm = reinterpret_cast<__gm__ IN_KVDTYPE *>(v_in_gm);
+        block_tables_gm = reinterpret_cast<__gm__ int32_t *>(block_tables_in_gm);
+        s_gm = reinterpret_cast<__gm__ mm1CopyType *>(s_out_gm);
+
+        p_gm = reinterpret_cast<__gm__ IN_DTYPE *>(p_out_gm);
+        o_tmp_gm = reinterpret_cast<__gm__ mm2CopyType *>(o_temp_gm);
+        tiling_gm = reinterpret_cast<__gm__ uint8_t *>(tiling_para_gm);
+
+        num_tokens = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm));
+        q_heads = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_NUMHEADS));
+        embedding_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADDIM));
+        embedding_size_v = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADDIM_V));
+        block_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_BLOCKSIZE));
+        max_num_blocks_per_query = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_MAXBLOCKS));
+        kv_heads = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_KVHEADS));
+        former_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_FORMER_BATCH));
+        former_head_split = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_FORMER_HEAD));
+        tail_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_TAIL_BATCH));
+        tail_head_split = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_TAIL_HEAD));
+        head_split_num = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADNUM_MOVE));
+        tiling_head_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADSIZE));
+        tiling_para_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_PARASIZE));
+        group_num = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_GROUPNUM));
+        block_size_calc = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_BLOCKSIZE_CALC));
+        q_head_original = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_QHEADORIGINAL));
+        compressHead = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_COMPRESSHEAD));
+        block_idx = pto_block_idx;
+        block_num = pto_block_num;
+        block_size_inner_count = block_size / block_size_calc;
+
+            former_group_num_move = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_FORMER_GROUP_MOVE));
+            tail_group_num_move = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_TAIL_GROUP_MOVE));
+        kv_split_per_core = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_KVSPLIT));
+        kv_split_core_num = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_KVCORENUM));
+
+        former_head_split_num = (former_head_split > group_num) && (former_group_num_move == group_num) ? head_split_num : 1;
+        tail_head_split_num = (tail_head_split > group_num) && (tail_group_num_move == group_num) ? head_split_num : 1;
+
+        stride_kv = static_cast<uint64_t>(kv_heads) * embedding_size;
+
+
+        __k = embedding_size;
+        round_k = RoundUp<T_BLOCK_SIZE>(__k);
+    }
+
+
+    __aicore__ __attribute__((always_inline)) inline void Run()
+    {
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID4);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID5);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID4);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID5);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7);
+        set_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID2);
+        set_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID3);
+        set_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID4);
+        set_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID5);
+        set_flag(PIPE_MTE2, PIPE_FIX, EVENT_ID0);
+        core_per_batch = (q_heads + former_head_split - 1) / former_head_split;
+        process_num = static_cast<uint64_t>(former_batch) * core_per_batch * kv_split_core_num;
+
+        for (uint32_t process = block_idx; process < process_num; process += uint32_t(block_num)) {  // for task
+            uint32_t cur_batch = process / (core_per_batch * kv_split_core_num) + prefill_batch_size_;
+            uint32_t offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+            cur_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 13 + offset_tiling));
+            offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+            uint32_t batch_idx = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 8 + offset_tiling));
+            uint32_t kv_seqlen = (uint32_t)(*((__gm__ uint32_t *)tiling_gm  + 1 + offset_tiling));
+            if (kv_seqlen == 0) {
+                continue;
+            }
+            uint32_t kv_seqlen_align = (kv_seqlen + block_size - 1) / block_size * block_size;
+            uint32_t cur_head = (process / kv_split_core_num) % core_per_batch;
+            uint32_t cur_nIndx = process % kv_split_core_num;
+            uint32_t start_head = cur_head * former_head_split;
+            uint32_t start_kv = cur_nIndx * kv_split_per_core;
+            uint32_t cur_kv_seqlen = kv_split_per_core;
+            uint32_t kv_loop = (kv_seqlen_align + kv_split_per_core - 1) /  kv_split_per_core;
+            if (cur_nIndx >= kv_loop) {
+                continue;
+            }
+            if (cur_nIndx == (kv_loop - 1)) {
+                cur_kv_seqlen = kv_seqlen - cur_nIndx * kv_split_per_core;
+            }
+            uint32_t cur_head_num = former_head_split;
+            if (cur_head == (core_per_batch - 1)) {
+                cur_head_num = q_heads - cur_head * former_head_split;
+                former_group_num_move = former_group_num_move <= cur_head_num ? former_group_num_move : cur_head_num;
+            }
+            uint32_t head_split_loop = (cur_head_num + (former_head_split_num * former_group_num_move) - 1) /
+                                       (former_head_split_num * former_group_num_move);
+                InnerRunCube(batch_idx, start_head, cur_head_num, head_split_loop, start_kv, cur_kv_seqlen, offset_tiling, former_group_num_move, former_head_split_num);
+        }
+        if (tail_batch > 0) {
+            core_per_batch = (q_heads + tail_head_split - 1) / tail_head_split;
+            process_num = static_cast<uint64_t>(tail_batch) * core_per_batch;
+            for (uint32_t process = block_idx; process < process_num; process += uint32_t(block_num)) {  // for task
+                uint32_t cur_batch = process / core_per_batch + former_batch + prefill_batch_size_;
+                uint32_t offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+                cur_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 13 + offset_tiling));
+                offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+                uint32_t batch_idx = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 8 + offset_tiling));
+                uint32_t kv_seqlen = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 1 + offset_tiling));
+                if (kv_seqlen == 0) {
+                    continue;
+                }
+                uint32_t cur_kv_seqlen = kv_seqlen;
+                uint32_t start_kv = 0;
+                uint32_t cur_head = process % core_per_batch;
+                uint32_t cur_head_num = tail_head_split;
+                if (cur_head == (core_per_batch - 1)) {
+                    cur_head_num = q_heads - cur_head * tail_head_split;
+                    tail_group_num_move = tail_group_num_move <= cur_head_num ? tail_group_num_move : cur_head_num;
+                }
+                uint32_t head_split_loop = (cur_head_num + (tail_head_split_num * tail_group_num_move) - 1) /
+                                           (tail_head_split_num * tail_group_num_move);
+                uint32_t start_head = (process % core_per_batch) * tail_head_split;
+                    InnerRunCube(batch_idx, start_head, cur_head_num, head_split_loop, start_kv, cur_kv_seqlen, offset_tiling, tail_group_num_move, tail_head_split_num);
+            }
+        }
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID4);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID5);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID4);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID5);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7);
+        wait_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID3);
+        wait_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID4);
+        wait_flag(PIPE_FIX, PIPE_MTE1, EVENT_ID5);
+        wait_flag(PIPE_MTE2, PIPE_FIX, EVENT_ID0);
+        pipe_barrier(PIPE_ALL);
+    }
+private:
+
+
+
+    __attribute__((always_inline)) inline __aicore__ void LoadQToL1(
+        uint32_t q_offset,
+        uint32_t cur_head_num)
+    {
+        if (is_multi_head_mmad) {
+            // gm_to_l1q
+            pa_gm_to_l1_nd_nz<IN_DTYPE>(
+                l1q_buf_addr_tensor,
+                q_gm + (q_offset),
+                cur_head_num,        // nValue
+                RoundUp<16>(cur_head_num),// dstNzC0Stride
+                0,                     // dstNzMatrixStride, unused
+                __k,                   // dValue
+                0,                     // dstNzMatrixStride, unused
+                __k                   // srcDValue
+            );
+        } else {
+            if (embedding_size % T_BLOCK_SIZE == 0) {
+                pa_gm_to_l1_nd_nd<IN_DTYPE>(
+                    l1q_buf_addr_tensor,
+                    q_gm + (q_offset),
+                    1,
+                    0,
+                    0,
+                    round_k * cur_head_num,               // lenBurst
+                    0,
+                    0
+                );
+            } else {
+                for (uint32_t copy_idx = 0; copy_idx < cur_head_num; copy_idx++) {
+                    pa_gm_to_l1_nd_nd<IN_DTYPE>(
+                        l1q_buf_addr_tensor + (copy_idx * round_k),
+                        q_gm + (q_offset + copy_idx * embedding_size),
+                        1,
+                        0,
+                        0,
+                        round_k,               // lenBurst
+                        0,
+                        0
+                    );
+                }
+            }
+        }
+    }
+
+
+
+
+
+    __attribute__((always_inline)) inline __aicore__ void LoadKVToL1(
+        __gm__ IN_KVDTYPE *__restrict__ kv_gm_tensor,
+        __cbuf__ IN_KVDTYPE *__restrict__ l1kv_buf_addr_tensor,
+        bool move_l1b_flag,
+        uint32_t head_num_move,
+        uint32_t cur_batch,
+        uint32_t cur_kv_seqlen,
+        uint32_t start_kv,
+        uint32_t qk_round_n,
+        uint32_t real_n_loop,
+        uint32_t sub_n_loop,
+        uint32_t n_idx
+    )
+    {
+        for (uint32_t inner_n_idx = 0; inner_n_idx < sub_n_loop; inner_n_idx++) {
+            uint32_t actual_idx = n_idx * sub_n_loop + inner_n_idx;
+            uint32_t sub_qk_n = block_size;
+            if (actual_idx >= real_n_loop) {
+                break;
+            }
+            uint32_t block_table_id = (uint32_t)(*(block_tables_gm +
+                            cur_batch * max_num_blocks_per_query + start_kv / block_size + actual_idx));
+            int64_t kv_offset = (int64_t)block_table_id * block_size * stride_kv;
+
+
+            if (actual_idx == (real_n_loop - 1)) {
+                sub_qk_n = (cur_kv_seqlen - actual_idx * block_size);
+            }
+            if (group_num == 1) {
+                if (inner_n_idx == 0) {
+                    wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                }
+                    pa_gm_to_l1_nd_nz<IN_KVDTYPE>(
+                        l1kv_buf_addr_tensor + (l1b_offset + block_size * T_BLOCK_SIZE * inner_n_idx), kv_gm_tensor + (kv_offset),
+                        sub_qk_n,            // nValue
+                        qk_round_n,          // dstNzC0Stride
+                        0,                   // dstNzMatrixStride, unused
+                        __k * head_num_move, // dValue
+                        0,                   // dstNzMatrixStride, unused
+                        stride_kv            // srcDValue
+                    );
+                if (actual_idx == real_n_loop - 1 || inner_n_idx == sub_n_loop - 1) {
+                    set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    move_l1b_offset = l1b_offset;
+                }
+            } else {
+                if (move_l1b_flag && inner_n_idx == 0) {
+                    l1b_pingpong_flag = 1 - l1b_pingpong_flag;
+                    l1b_offset = l1b_pingpong_flag * L1_KV_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+                    wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                }
+                if (move_l1b_flag) {
+                        pa_gm_to_l1_nd_nz<IN_KVDTYPE>(
+                            l1kv_buf_addr_tensor + (l1b_offset + block_size * T_BLOCK_SIZE * inner_n_idx), kv_gm_tensor + (kv_offset),
+                            sub_qk_n,            // nValue
+                            qk_round_n,          // dstNzC0Stride
+                            0,                   // dstNzMatrixStride, unused
+                            __k * head_num_move, // dValue
+                            0,                   // dstNzMatrixStride, unused
+                            stride_kv            // srcDValue
+                        );
+
+                    if (actual_idx == real_n_loop - 1 || inner_n_idx == sub_n_loop - 1) {
+                        set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+                }
+                move_l1b_offset = l1b_offset;
+            }
+        }
+    }
+
+    // antiquant
+    __attribute__((always_inline)) inline __aicore__ void
+    LoadKVToL1(__gm__ IN_DTYPE *__restrict__ kv_gm_tensor,        // [seq_len, num_head, embd_size]
+               __cbuf__ IN_DTYPE *__restrict__ l1kv_buf_addr_tensor, // [seq_len, hidden_size]
+               bool move_l1b_flag, uint32_t head_num_move, uint32_t qk_n, uint32_t qk_round_n, uint32_t num_head)
+    {
+        if (group_num == 1) {
+            // [qk_n, cur_head_num, head_size] -> [qk_n, head_num_move, head_size]
+            wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(l1b_pingpong_flag + 2));
+            pa_gm_to_l1_nd_nz<IN_DTYPE>(l1kv_buf_addr_tensor + (l1b_offset),
+                                                                                      kv_gm_tensor,
+                                                                                      qk_n,       // nValue
+                                                                                      qk_round_n, // dstNzC0Stride
+                                                                                      0,          // dstNzMatrixStride
+                                                                                      __k * head_num_move, // dValue
+                                                                                      0,        // dstNzMatrixStride
+                                                                                      stride_kv // srcDValue
+            );
+            set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+            move_l1b_offset = l1b_offset;
+        } else {
+            if (move_l1b_flag) {
+                l1b_pingpong_flag = 1 - l1b_pingpong_flag;
+                l1b_offset = l1b_pingpong_flag * L1_KV_HALF_BUF_SIZE;
+                wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                pa_gm_to_l1_nd_nz<IN_DTYPE>(l1kv_buf_addr_tensor + (l1b_offset),
+                                                                                          kv_gm_tensor,
+                                                                                          qk_n,       // nValue
+                                                                                          qk_round_n, // dstNzC0Stride
+                                                                                          0, // dstNzMatrixStride
+                                                                                          __k * head_num_move, // dValue
+                                                                                          0,        // dstNzMatrixStride
+                                                                                          stride_kv // srcDValue
+
+                );
+                set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                move_l1b_offset = l1b_offset;
+            }
+        }
+    }
+
+
+    __attribute__((always_inline)) inline __aicore__ void ProcessQK(
+        __gm__ mm1CopyType *__restrict__ s_gm_tensor,
+        uint32_t qk_n, uint32_t qk_round_n,
+        uint32_t head_num_move, uint32_t group_num_move,
+        uint32_t head_split_num_move, uint32_t cur_head_num_round,
+        uint32_t split_idx, bool move_l1b_flag, bool is_l0b_pingpong_off)
+    {
+        uint32_t loop_mad = (group_num == 1) ? head_num_move : 1;
+        for (uint32_t headdim_idx = 0; headdim_idx < loop_mad; headdim_idx++) {
+            bool move_l0b_flag = move_l1b_flag;
+            wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0_pingpong_flag));
+            uint64_t l1q_offset = 0;
+            uint32_t q_load_coeff = 1;
+            if (!is_multi_head_mmad) {
+                l1q_offset = split_idx * head_split_num_move * round_k + headdim_idx * round_k;
+            } else {
+                l1q_offset = split_idx * group_num_move * T_BLOCK_SIZE;
+                q_load_coeff = cur_head_num_round;
+            }
+            if (q_load_coeff == 1) {
+                pa_l1_to_l0_a_vector<IN_DTYPE, false>(
+                    l0a_buf_tensor + (l0_offset),
+                    l1q_buf_addr_tensor + (l1q_offset),
+                    0,
+                    (round_k  + T_CUBE_MATRIX_SIZE - 1) / T_CUBE_MATRIX_SIZE,  // repeat
+                    0,
+                    1,                                                    // srcStride
+                    0,
+                    0                                                    // dstStride
+                );
+            } else {
+                for (uint64_t loa_load_idx = 0; loa_load_idx < q_load_coeff / BLOCK_SIZE; ++loa_load_idx) {
+                    pa_l1_to_l0_a_vector<IN_DTYPE, false>(
+                        l0a_buf_tensor + (l0_offset + loa_load_idx * round_k * BLOCK_SIZE),
+                        l1q_buf_addr_tensor + (l1q_offset + loa_load_idx * T_CUBE_MATRIX_SIZE),
+                        0,
+                        round_k / T_BLOCK_SIZE,            // repeat
+                        0,
+                        q_load_coeff / BLOCK_SIZE,                            // srcStride
+                        0,
+                        0                                                     // dstStride
+                    );
+                }
+            }
+            uint32_t mad_l0b_offset = 0;
+            if (group_num == 1 || tilingKeyType == TilingKeyType::TILING_INT8_CUBE_QUANT) {
+                    if (headdim_idx == 0) {
+                        wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+                if (is_l0b_pingpong_off) {
+                    mad_l0b_offset = 0;
+                    wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                } else {
+                    mad_l0b_offset = l0_offset;
+                }
+                pa_l1_to_l0_b_vector<IN_DTYPE, false>(
+                    l0b_buf_tensor + (mad_l0b_offset),
+                    l1kv_buf_addr_tensor + (move_l1b_offset + headdim_idx * round_k * qk_round_n),
+                    0,
+                    (round_k * qk_round_n) / T_CUBE_MATRIX_SIZE,                   // repeat
+                    0,
+                    1,                                        // srcStride
+                    0,
+                    0                                        // dstStride
+                );
+            } else {
+                if (is_l0b_pingpong_off) {
+                    l0b_offset = 0;
+                    wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                } else if (move_l0b_flag) {
+                    l0b_pingpong_flag = 1 - l0b_pingpong_flag;
+                    l0b_offset = l0b_pingpong_flag * L0AB_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+                    wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                }
+                    if (headdim_idx == 0 && move_l1b_flag) {
+                        wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+                if (move_l0b_flag) {
+                    uint64_t l1kv_offset = move_l1b_offset + headdim_idx * round_k * qk_round_n;
+                    pa_l1_to_l0_b_vector<IN_DTYPE, false>(
+                        l0b_buf_tensor + (l0b_offset),
+                        l1kv_buf_addr_tensor + (l1kv_offset),
+                        0,
+                        round_k * qk_round_n / T_CUBE_MATRIX_SIZE,  // repeat
+                        0,
+                        1,                                        // srcStride
+                        0,
+                        0                                        // dstStride
+                    );
+                }
+                mad_l0b_offset = l0b_offset;
+            }
+
+            if (headdim_idx == loop_mad - 1) {
+                    if ((group_num != 1 && move_l1b_flag) || group_num == 1) {
+                        set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+            }
+
+            set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_FIX, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            mad(
+                mm1_l0c_buf_tensor + (l0c_offset),
+                l0a_buf_tensor + (l0_offset),
+                l0b_buf_tensor + (mad_l0b_offset),
+                m,
+                __k,
+                qk_n,
+                0,
+                false,
+                false,
+                1);
+            if (is_l0b_pingpong_off) {
+                set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+            } else {
+                    if (group_num != 1 && move_l0b_flag) {
+                        set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                    }
+            }
+            set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0_pingpong_flag));
+            set_flag(PIPE_M, PIPE_FIX, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_M, PIPE_FIX, static_cast<::event_t>(l0_pingpong_flag));
+            // copy S to gm
+            uint64_t s_gm_offset = headdim_idx * group_num_move * qk_round_n;
+            pa_l0c_to_gm_nd_fp32(
+                s_gm_tensor + (s_gm_offset),
+                mm1_l0c_buf_tensor + (l0c_offset),
+                m,           // MSize
+                qk_round_n,  // NSize
+                RoundUp<16>(m), // srcStride
+                qk_round_n  // dstStride_dst_D
+            );
+            set_flag(PIPE_FIX, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            l0_pingpong_flag = 1 - l0_pingpong_flag;
+            l0_offset = l0_pingpong_flag * L0AB_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+            l0c_offset = l0_pingpong_flag * L0C_FLOAT_BUF_SIZE;
+        }
+    }
+
+
+
+    __attribute__((always_inline)) inline __aicore__ void ProcessPV(
+        __gm__ mm2CopyType *__restrict__ o_tmp_gm_tensor,
+        __gm__ IN_DTYPE *__restrict__ p_gm_tensor,
+        __cbuf__ IN_DTYPE *__restrict__ l1p_buf_addr_tensor,
+        uint32_t qk_n, uint32_t qk_round_n, uint32_t head_num_move, uint32_t group_num_move,
+        uint32_t head_split_num_move, uint32_t cur_head_num, uint32_t cur_head_num_round,
+        uint32_t split_idx, bool move_l1b_flag, uint32_t softmax_ready_flag, bool is_l0b_pingpong_off)
+    {
+        uint32_t loop_mad = (group_num == 1) ? head_num_move : 1;
+        for (uint32_t headdim_idx = 0; headdim_idx < loop_mad; headdim_idx++) {
+            bool move_l0b_flag = move_l1b_flag;
+            wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0_pingpong_flag));
+            uint32_t mad_l0b_offset = 0;
+            LoadData2dTransposeParams loadDataParams;
+            loadDataParams.dstGap = 0;
+            loadDataParams.startIndex = 0;
+            loadDataParams.dstFracGap = 0;
+            if (group_num == 1 || tilingKeyType == TilingKeyType::TILING_INT8_CUBE_QUANT) {
+                    if (headdim_idx == 0) {
+                        wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+                if (is_l0b_pingpong_off) {
+                    mad_l0b_offset = 0;
+                    wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                } else {
+                    mad_l0b_offset = l0_offset;
+                }
+                if(qk_round_n <= round_k || tilingKeyType == TilingKeyType::TILING_QUANT_FP16OUT || tilingKeyType == TilingKeyType::TILING_QUANT_BF16OUT) {// Nz -> nZ
+                    loadDataParams.repeatTimes = round_k / T_BLOCK_SIZE;
+                    loadDataParams.srcStride = qk_round_n / T_BLOCK_SIZE;
+                    uint16_t dstGap = sizeof(IN_DTYPE) == 1 ? 1 : 0;
+                    loadDataParams.dstGap = dstGap;
+                    for (uint32_t l0b_load_idx = 0; l0b_load_idx < qk_round_n / (T_BLOCK_SIZE); ++l0b_load_idx) {
+                        load_cbuf_to_cb_transpose(
+                            l0b_buf_tensor + (mad_l0b_offset + l0b_load_idx * RoundUp<16>(__k) * T_BLOCK_SIZE),
+                            l1kv_buf_addr_tensor + (move_l1b_offset + headdim_idx * round_k * qk_round_n / group_num +
+                                l0b_load_idx * T_BLOCK_SIZE * T_BLOCK_SIZE),
+                            loadDataParams.startIndex, loadDataParams.repeatTimes, loadDataParams.srcStride,
+                            loadDataParams.dstGap, (addr_cal_mode_t)0, loadDataParams.dstFracGap);
+                    }
+                } else {
+                    loadDataParams.repeatTimes = qk_round_n / T_BLOCK_SIZE;
+                    loadDataParams.dstGap = round_k / BLOCK_SIZE - 1;
+                    loadDataParams.srcStride = 1;
+                    for (uint32_t l0b_load_idx = 0; l0b_load_idx < round_k / T_BLOCK_SIZE; ++l0b_load_idx) {
+                        load_cbuf_to_cb_transpose(
+                            l0b_buf_tensor + (mad_l0b_offset + l0b_load_idx * T_BLOCK_SIZE * T_BLOCK_SIZE),
+                            l1kv_buf_addr_tensor + (move_l1b_offset + headdim_idx * round_k * qk_round_n / group_num +
+                                l0b_load_idx * qk_round_n * T_BLOCK_SIZE),
+                            loadDataParams.startIndex, loadDataParams.repeatTimes, loadDataParams.srcStride,
+                            loadDataParams.dstGap, (addr_cal_mode_t)0, loadDataParams.dstFracGap);
+                    }
+                }
+            } else {
+                if (is_l0b_pingpong_off) {
+                        l0b_offset = 0;
+                        wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                 } else if (move_l0b_flag) {
+                        l0b_pingpong_flag = 1 - l0b_pingpong_flag;
+                        l0b_offset = l0b_pingpong_flag * L0AB_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+                        wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                }
+                    if (headdim_idx == 0 && move_l1b_flag) {
+                        wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+                if (move_l0b_flag) {
+                    uint64_t l1kv_offset = move_l1b_offset + headdim_idx * round_k * qk_round_n;
+                    if (qk_round_n <= round_k || tilingKeyType == TilingKeyType::TILING_QUANT_FP16OUT || tilingKeyType == TilingKeyType::TILING_QUANT_BF16OUT) {// Nz -> nZ
+                        loadDataParams.repeatTimes = round_k / T_BLOCK_SIZE;
+                        loadDataParams.srcStride = qk_round_n / T_BLOCK_SIZE;
+                        uint16_t dstGap = sizeof(IN_DTYPE) == 1 ? 1 : 0;
+                        loadDataParams.dstGap = dstGap;
+                        for (uint32_t l0b_load_idx = 0; l0b_load_idx < qk_round_n / T_BLOCK_SIZE; ++l0b_load_idx) {
+                            load_cbuf_to_cb_transpose(
+                                l0b_buf_tensor + (l0b_offset + l0b_load_idx * RoundUp<16>(__k) * T_BLOCK_SIZE),
+                                l1kv_buf_addr_tensor + (l1kv_offset + l0b_load_idx * T_BLOCK_SIZE * T_BLOCK_SIZE),
+                                loadDataParams.startIndex, loadDataParams.repeatTimes, loadDataParams.srcStride,
+                                loadDataParams.dstGap, (addr_cal_mode_t)0, loadDataParams.dstFracGap);
+                        }
+                    } else {
+                        for (uint32_t l0b_load_idx = 0; l0b_load_idx < round_k / T_BLOCK_SIZE; ++l0b_load_idx) {
+                        loadDataParams.repeatTimes = qk_round_n / T_BLOCK_SIZE;
+                        loadDataParams.srcStride = 1;
+                        loadDataParams.dstGap = round_k / BLOCK_SIZE - 1;
+                        load_cbuf_to_cb_transpose(
+                            l0b_buf_tensor + (l0b_offset + l0b_load_idx * T_BLOCK_SIZE * T_BLOCK_SIZE),
+                            l1kv_buf_addr_tensor + (l1kv_offset + l0b_load_idx * qk_round_n * T_BLOCK_SIZE),
+                            loadDataParams.startIndex, loadDataParams.repeatTimes, loadDataParams.srcStride,
+                            loadDataParams.dstGap, (addr_cal_mode_t)0, loadDataParams.dstFracGap);
+                        }
+                    }
+                }
+                mad_l0b_offset = l0b_offset;
+            }
+
+            if (split_idx == 0 && headdim_idx == 0) {
+                wait_flag_dev(softmax_ready_flag);
+                if (!is_multi_head_mmad) {
+                    pa_gm_to_l1_nd_nd<IN_DTYPE>(
+                        l1p_buf_addr_tensor,
+                        p_gm_tensor,
+                        1,
+                        0,
+                        0,
+                        RoundUp<BLOCK_SIZE>(qk_n) * cur_head_num * T_BLOCK_OFFSET,               // lenBurst
+                        0,
+                        0
+                    );
+                } else {
+                    pa_gm_to_l1_nd_nz<IN_DTYPE>(
+                        l1p_buf_addr_tensor,
+                        p_gm_tensor,
+                        cur_head_num,         // nValue
+                        (cur_head_num + 15) / 16 * 16,// dstNzC0Stride
+                        0,                     // dstNzMatrixStride, unused
+                        qk_round_n,           // dValue
+                        0,                     // dstNzMatrixStride, unused
+                        RoundUp<BLOCK_SIZE>(qk_n) * T_BLOCK_OFFSET           // srcDValue
+                    );
+                }
+            }
+
+            set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l0_pingpong_flag));
+            uint64_t l1p_offset = 0;
+            uint32_t p_load_coeff = 1;
+            if (!is_multi_head_mmad) {
+                 l1p_offset =  split_idx * head_split_num_move * RoundUp<BLOCK_SIZE>(qk_n)  * T_BLOCK_OFFSET +
+                     headdim_idx * RoundUp<BLOCK_SIZE>(qk_n)  * T_BLOCK_OFFSET;
+            } else {
+                l1p_offset = split_idx * group_num_move * T_BLOCK_SIZE;
+                p_load_coeff = cur_head_num_round;
+            }
+            if (p_load_coeff == 1) {
+                pa_l1_to_l0_a_vector<IN_DTYPE, false>(
+                    l0a_buf_tensor + (l0_offset),
+                    l1p_buf_addr_tensor + (l1p_offset),
+                    0,
+                    (qk_round_n + T_CUBE_MATRIX_SIZE - 1) / T_CUBE_MATRIX_SIZE,  // repeat
+                    0,
+                    1,                                                       // srcStride
+                    0,
+                    0                                                        // dstStride
+                );
+            } else {
+                for (uint64_t loa_load_idx = 0; loa_load_idx < p_load_coeff / BLOCK_SIZE; ++loa_load_idx) {
+                    pa_l1_to_l0_a_vector<IN_DTYPE, false>(
+                        l0a_buf_tensor + (l0_offset + loa_load_idx * qk_round_n * BLOCK_SIZE),
+                        l1p_buf_addr_tensor + (l1p_offset + loa_load_idx * T_CUBE_MATRIX_SIZE),
+                        0,
+                        qk_round_n / T_BLOCK_SIZE,                                 // repeat
+                        0,
+                        p_load_coeff / BLOCK_SIZE,                               // srcStride
+                        0,
+                        0                                                        // dstStride
+                    );
+                }
+            }
+
+            if (headdim_idx == loop_mad - 1) {
+                    if (group_num != 1 && move_l1b_flag || group_num == 1) {
+                        set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(l1b_pingpong_flag + 2));
+                    }
+            }
+
+            set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_FIX, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            mad(
+                mm2_l0c_buf_tensor + (l0c_offset),
+                l0a_buf_tensor + (l0_offset),
+                l0b_buf_tensor + (mad_l0b_offset),
+                m,
+                qk_n,
+                __k,
+                0,
+                false,
+                false,
+                1);
+
+            if (is_l0b_pingpong_off) {
+                set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+            } else {
+                    if (group_num != 1 && move_l0b_flag) {
+                        set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0b_pingpong_flag + 2));
+                    }
+            }
+            set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(l0_pingpong_flag));
+            set_flag(PIPE_M, PIPE_FIX, static_cast<::event_t>(l0_pingpong_flag));
+            wait_flag(PIPE_M, PIPE_FIX, static_cast<::event_t>(l0_pingpong_flag));
+            // copy O to gm
+            uint64_t o_temp_gm_offset = headdim_idx * group_num_move * RoundUp<16>(__k);
+            pa_l0c_to_gm_nd_fp32(
+                o_tmp_gm_tensor + (o_temp_gm_offset),
+                mm2_l0c_buf_tensor + (l0c_offset),
+                m,        // MSize
+                RoundUp<16>(__k),  // NSize
+                RoundUp<16>(m),       // srcStride
+                RoundUp<16>(__k)  // dstStride_dst_D
+            );
+
+            set_flag(PIPE_FIX, PIPE_M, static_cast<::event_t>(l0_pingpong_flag));
+            ChangeL0PingPongFlag();
+        }
+    }
+
+
+
+    __attribute__((always_inline)) inline __aicore__ void ChangePingPongFlag() {
+        l1_pingpong_flag = 1 - l1_pingpong_flag;
+        l1_offset = l1_pingpong_flag * L1_UINT8_BUF_SIZE_DECODER / sizeof(IN_DTYPE);
+        l1_scale_offset = l1_pingpong_flag * L1_SCALE_UINT64_SIZE;
+        l1_bias_offset = l1_pingpong_flag * L1_OFFSET_INT32_SIZE;
+        if (group_num == 1) {
+            l1b_pingpong_flag = 1 - l1b_pingpong_flag;
+            l1b_offset = l1b_pingpong_flag * L1_KV_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+        }
+    }
+
+    __attribute__((always_inline)) inline __aicore__ void ChangeL0PingPongFlag() {
+        if (is_l0c_pingpong_off) {
+            l0_pingpong_flag = 0;
+            l0_offset = 0;
+            l0c_offset = 0;
+        } else {
+            l0_pingpong_flag = 1 - l0_pingpong_flag;
+            l0_offset = l0_pingpong_flag * L0AB_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+            l0c_offset = l0_pingpong_flag * L0C_FLOAT_BUF_SIZE;
+        }
+
+    }
+
+
+
+    __aicore__ __attribute__((always_inline)) inline void InnerRunCube(uint32_t cur_batch, uint32_t start_head, uint32_t cur_head_num, uint32_t head_split_loop,
+                                    uint32_t start_kv, uint32_t cur_kv_seqlen, uint32_t offset_tiling, uint32_t group_num_move, uint32_t head_split_num_move)
+    {
+        uint32_t addr_q_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 4 + offset_tiling));
+        uint32_t addr_q_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 5 + offset_tiling));
+        uint64_t addr_q_scalar = (uint64_t)(((uint64_t)addr_q_high32) << 32 | addr_q_loww32);
+        uint64_t q_offset = addr_q_scalar + start_head * embedding_size;
+
+        uint32_t pp_n_scalar = block_size_calc;
+
+        uint32_t n_loop = (cur_kv_seqlen + pp_n_scalar - 1) / pp_n_scalar;
+        sub_n_loop = pp_n_scalar / block_size;
+        real_n_loop = (cur_kv_seqlen + block_size - 1) / block_size;
+
+        uint32_t qk_n = pp_n_scalar;
+        uint32_t qk_round_n = RoundUp<BLOCK_SIZE>(qk_n);
+        uint32_t qk_n_2 = pp_n_scalar;
+        uint32_t qk_round_n_2 = RoundUp<BLOCK_SIZE>(qk_n_2);
+
+        uint32_t cur_head_num_round = (cur_head_num + 15) / 16 * 16;
+        m = (group_num == 1) ? 1 : group_num_move;
+        is_multi_head_mmad = (group_num_move > 1) && (tilingKeyType != TilingKeyType::TILING_INT8_CUBE_QUANT);
+        bool is_l0b_pingpong_off = (RoundUp<T_BLOCK_SIZE>(block_size_calc) * round_k > (L0AB_UINT8_BUF_SIZE  /  sizeof(IN_DTYPE))) ? 1 : 0;
+        for (uint32_t n_idx = 0; n_idx < n_loop; n_idx += 2) {  // for k_seqlen
+            if (n_idx == (n_loop - 1)) {
+                qk_n = (cur_kv_seqlen - n_idx * pp_n_scalar);
+                qk_round_n = RoundUp<BLOCK_SIZE>(qk_n);
+            }
+            if ((n_idx + 1) == (n_loop - 1)) {
+                qk_n_2 = (cur_kv_seqlen - (n_idx + 1) * pp_n_scalar);
+                qk_round_n_2 = RoundUp<BLOCK_SIZE>(qk_n_2);
+            }
+            /* ************ CUBE1 stage1  ************* */
+            for (uint32_t split_idx = 0; split_idx < head_split_loop; split_idx++) {  // for head
+                bool move_l1b_flag = (split_idx == 0) || ((start_head + split_idx * group_num_move) % group_num) == 0;
+                // Only need load Q once
+                uint32_t head_num_move = ((group_num_move == 1) && (split_idx == (head_split_loop - 1))) ?
+                        cur_head_num - head_split_num_move * split_idx * group_num_move : head_split_num_move;
+                if (n_idx == 0 && split_idx == 0) {
+                    LoadQToL1(q_offset, cur_head_num);
+                }
+
+                   set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                // *** Prepare K to L1
+                uint64_t hiddenSize_offset =
+                    (start_head + split_idx * head_split_num_move * group_num_move) / group_num * embedding_size;
+
+
+                uint64_t deq_scale1_hiddenSize_offset = hiddenSize_offset;
+                uint64_t offset1_hiddenSize = k_bias_flag ? hiddenSize_offset : 0;
+                    LoadKVToL1(
+                        k_gm + (hiddenSize_offset),
+                        l1kv_buf_addr_tensor,
+                        move_l1b_flag,
+                        head_num_move, cur_batch, cur_kv_seqlen, start_kv, qk_round_n,
+                        real_n_loop, sub_n_loop, n_idx
+                    );
+                wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+
+                ProcessQK(
+                    s_gm + ((uint64_t)block_idx * TMP_SIZE_DECODER +
+                         split_idx * head_split_num_move * group_num_move * qk_round_n),
+                    qk_n, qk_round_n, head_num_move, group_num_move,
+                    head_split_num_move, cur_head_num_round,split_idx, move_l1b_flag, is_l0b_pingpong_off);
+                ChangePingPongFlag();
+            }
+            pipe_barrier(PIPE_FIX);
+            DdrBarrierBeforeFfts();
+            FftsCrossCoreSync<PIPE_FIX, 2>(QK_READY_DECODER);
+
+            /* ************ CUBE1 stage2  ************* */
+            if (n_idx + 1 < n_loop) {
+                for (uint32_t split_idx = 0; split_idx < head_split_loop; split_idx++) {  // for head
+                    bool move_l1b_flag = (split_idx == 0) || ((start_head + split_idx * group_num_move) % group_num) == 0;
+                    // Only need load Q once
+                    uint32_t head_num_move = ((group_num_move == 1) && (split_idx == (head_split_loop - 1))) ?
+                            cur_head_num - head_split_num_move * split_idx * group_num_move : head_split_num_move;
+
+                        set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                    // *** Prepare K to L1
+                    uint64_t hiddenSize_offset = (start_head + split_idx * head_split_num_move * group_num_move) / group_num * embedding_size;
+                    uint64_t deq_scale1_hiddenSize_offset = hiddenSize_offset;
+                    uint64_t offset1_hiddenSize = k_bias_flag ? hiddenSize_offset : 0;
+                        LoadKVToL1(
+                            k_gm + (hiddenSize_offset),
+                            l1kv_buf_addr_tensor,
+                            move_l1b_flag,
+                            head_num_move, cur_batch, cur_kv_seqlen, start_kv, qk_round_n_2,
+                            real_n_loop, sub_n_loop, (n_idx + 1)
+                        );
+                    wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                    // s_gm ping-pong halves are separate (stage-2 uses the second half).
+                    ProcessQK(
+                        s_gm + ((uint64_t)block_idx * TMP_SIZE_DECODER +
+                            split_idx * head_split_num_move * group_num_move * qk_round_n_2 +
+                            TMP_SIZE_DECODER / 2),
+                        qk_n_2, qk_round_n_2, head_num_move, group_num_move,
+                        head_split_num_move, cur_head_num_round, split_idx, move_l1b_flag, is_l0b_pingpong_off);
+                    ChangePingPongFlag();
+                }
+                pipe_barrier(PIPE_FIX);
+                DdrBarrierBeforeFfts();
+                FftsCrossCoreSync<PIPE_FIX, 2>(QK_READY_STAGE2);
+            }
+
+            /* ************ CUBE2 stage1  ************* */
+            for (uint32_t split_idx = 0; split_idx < head_split_loop; split_idx++) {
+                int32_t head_num_move = ((group_num_move == 1) && (split_idx == (head_split_loop - 1))) ?
+                        cur_head_num - head_split_num_move * split_idx * group_num_move : head_split_num_move;
+                bool move_l1b_flag = (split_idx == 0) || ((start_head + split_idx * group_num_move) % group_num) == 0;
+                // *** Prepare V to L1
+                uint64_t hiddenSize_offset =
+                    (start_head + split_idx * head_split_num_move * group_num_move) / group_num * embedding_size;
+
+                uint64_t deq_scale2_hiddenSize_offset = hiddenSize_offset;
+                uint64_t offset2_hiddenSize = v_bias_flag ? hiddenSize_offset : 0;
+                    LoadKVToL1(
+                        v_gm + (hiddenSize_offset),
+                        l1kv_buf_addr_tensor,
+                        move_l1b_flag,
+                        head_num_move, cur_batch, cur_kv_seqlen, start_kv, RoundUp<T_BLOCK_SIZE>(qk_round_n),
+                        real_n_loop, sub_n_loop, n_idx
+                    );
+                set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                ProcessPV(
+                    o_tmp_gm + ((uint64_t)block_idx * TMP_SIZE +
+                        split_idx * head_split_num_move * group_num_move * RoundUp<16>(__k)),
+                    p_gm + ((uint64_t)block_idx * TMP_SIZE * T_BLOCK_OFFSET),
+                    l1p_buf_addr_tensor,
+                    qk_n, RoundUp<T_BLOCK_SIZE>(qk_round_n), head_num_move, group_num_move,
+                    head_split_num_move, cur_head_num, cur_head_num_round,
+                    split_idx, move_l1b_flag, SOFTMAX_READY_DECODER, is_l0b_pingpong_off);
+                ChangePingPongFlag();
+            }
+            pipe_barrier(PIPE_FIX);
+            DdrBarrierBeforeFfts();
+            FftsCrossCoreSync<PIPE_FIX, 2>(UPDATE_READY_DECODER);
+
+            /* ************ CUBE2 stage2  ************* */
+            if (n_idx + 1 < n_loop) {
+                for (uint32_t split_idx = 0; split_idx < head_split_loop; split_idx++) {
+                    int32_t head_num_move = ((group_num_move == 1) && (split_idx == (head_split_loop - 1))) ?
+                            cur_head_num - head_split_num_move * split_idx * group_num_move : head_split_num_move;
+                    bool move_l1b_flag = (split_idx == 0) || ((start_head + split_idx * group_num_move) % group_num) == 0;
+                    // *** Prepare V to L1
+                    uint64_t hiddenSize_offset =
+                        (start_head + split_idx * head_split_num_move * group_num_move) / group_num * embedding_size;
+
+                    uint64_t deq_scale2_hiddenSize_offset = hiddenSize_offset;
+                    uint64_t offset2_hiddenSize = v_bias_flag ? hiddenSize_offset : 0;
+                        LoadKVToL1(
+                            v_gm + (hiddenSize_offset),
+                            l1kv_buf_addr_tensor,
+                            move_l1b_flag,
+                            head_num_move, cur_batch, cur_kv_seqlen, start_kv,  RoundUp<T_BLOCK_SIZE>(qk_round_n_2),
+                            real_n_loop, sub_n_loop, (n_idx + 1)
+                        );
+                    set_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                    wait_flag(PIPE_MTE2, PIPE_MTE1, static_cast<::event_t>(l1_pingpong_flag));
+                    ProcessPV(
+                        o_tmp_gm + ((uint64_t)block_idx * TMP_SIZE +
+                            split_idx * head_split_num_move * group_num_move * RoundUp<16>(__k) +
+                            TMP_SIZE / 2),
+                        p_gm + ((uint64_t)block_idx * TMP_SIZE * T_BLOCK_OFFSET  + TMP_SIZE * T_BLOCK_OFFSET / 2),
+                        l1p_buf_addr_tensor + (qk_round_n * cur_head_num_round),
+                        qk_n_2, RoundUp<T_BLOCK_SIZE>(qk_round_n_2), head_num_move, group_num_move,
+                        head_split_num_move, cur_head_num, cur_head_num_round,
+                        split_idx, move_l1b_flag, SOFTMAX_READY_STAGE2, is_l0b_pingpong_off);
+                    ChangePingPongFlag();
+                }
+                pipe_barrier(PIPE_FIX);
+                DdrBarrierBeforeFfts();
+                FftsCrossCoreSync<PIPE_FIX, 2>(UPDATE_READY_STAGE2);
+            }
+        }
+    }
+
+
+private:
+    __gm__ IN_DTYPE *__restrict__ q_gm{nullptr};
+    __gm__ IN_KVDTYPE *__restrict__ k_gm{nullptr};
+    __gm__ IN_KVDTYPE *__restrict__ v_gm{nullptr};
+
+
+    __gm__ mm1CopyType *__restrict__ s_gm{nullptr};
+    __gm__ IN_DTYPE *__restrict__ p_gm{nullptr};
+    __gm__ mm2CopyType *__restrict__ o_tmp_gm{nullptr};
+    __gm__ int32_t *__restrict__ block_tables_gm{nullptr};
+    __gm__ uint8_t *__restrict__ tiling_gm{nullptr};
+
+    __gm__ IN_DTYPE *gm_k16_ping_{nullptr};
+    __gm__ IN_DTYPE *gm_k16_pong_{nullptr};
+    __gm__ IN_DTYPE *gm_v16_ping_{nullptr};
+    __gm__ IN_DTYPE *gm_v16_pong_{nullptr};
+
+    const uint32_t l1q_buf_addr_offset = 0;
+
+    const uint32_t l1p_buf_addr_offset = (5 * L0AB_UINT8_BLOCK_SIZE);
+    const uint32_t l1kv_buf_addr_offset = (7 * L0AB_UINT8_BLOCK_SIZE);
+
+    __cbuf__ IN_DTYPE *l1q_buf_addr_tensor =
+        reinterpret_cast<__cbuf__ IN_DTYPE *>((uintptr_t)l1q_buf_addr_offset);
+    __cbuf__ IN_DTYPE *l1kv_buf_addr_tensor =
+        reinterpret_cast<__cbuf__ IN_DTYPE *>((uintptr_t)l1kv_buf_addr_offset);
+    __cbuf__ IN_DTYPE *l1p_buf_addr_tensor =
+        reinterpret_cast<__cbuf__ IN_DTYPE *>((uintptr_t)l1p_buf_addr_offset);
+    __ca__ IN_DTYPE *l0a_buf_tensor = reinterpret_cast<__ca__ IN_DTYPE *>((uintptr_t)0);
+    __cb__ IN_DTYPE *l0b_buf_tensor = reinterpret_cast<__cb__ IN_DTYPE *>((uintptr_t)0);
+    __cc__ mm1OutputType *mm1_l0c_buf_tensor = reinterpret_cast<__cc__ mm1OutputType *>((uintptr_t)0);
+    __cc__ mm2OutputType *mm2_l0c_buf_tensor = reinterpret_cast<__cc__ mm2OutputType *>((uintptr_t)0);
+    __cc__ int32_t *l0c_buf_int32_tensor = reinterpret_cast<__cc__ int32_t *>((uintptr_t)0);
+
+
+    uint32_t k_bias_flag{0};
+    uint32_t v_bias_flag{0};
+    uint32_t num_tokens{0};
+    uint32_t q_heads{0};
+    uint32_t kv_heads{0};
+    uint32_t embedding_size{0};
+    uint32_t embedding_size_v{0};
+    uint32_t block_size{0};
+    uint32_t max_num_blocks_per_query{0};
+    uint32_t group_num{0};
+    uint32_t former_group_num_move{1};
+    uint32_t tail_group_num_move{1};
+    uint32_t former_head_split_num{1};
+    uint32_t tail_head_split_num{1};
+    uint32_t stride_kv{0};
+    uint32_t stride_vo{0};
+    uint32_t m{0};
+    uint32_t __k{0};
+    uint32_t __v{0};
+    uint32_t round_k{0};
+    uint32_t round_v{0};
+    uint32_t core_per_batch{0};
+    uint32_t process_num{0};
+    uint64_t former_batch{0};
+    uint32_t former_head_split{0};
+    uint64_t tail_batch{0};
+    uint32_t tail_head_split{0};
+    uint32_t head_split_num{0};
+    uint32_t tiling_head_size{0};
+    uint32_t tiling_para_size{0};
+    uint32_t kv_split_per_core{0};
+    uint32_t kv_split_core_num{1};
+    uint32_t block_size_calc{0};
+
+    uint32_t embed_split_size_qk{0};
+    uint32_t embed_split_loop_qk{1};
+    uint32_t embed_split_size_v{0};
+    uint32_t embed_split_loop_v{1};
+    bool is_multi_head_mmad{0};
+    uint32_t move_l1b_offset = 0;
+    uint32_t q_head_original{0};
+    uint32_t compressHead{0};
+    uint32_t block_idx{0};
+    uint32_t block_num{1};
+
+    uint32_t l1_pingpong_flag = 0;
+    uint32_t l1b_pingpong_flag = 0;
+    uint32_t l0_pingpong_flag = 0;
+    uint32_t l0b_pingpong_flag = 0;
+    uint32_t l1p_pingpong_flag = 0;
+    uint32_t block_size_inner_count{0};
+    uint32_t sub_n_loop{0};
+    uint32_t real_n_loop{0};
+
+    uint32_t l1_offset = l1_pingpong_flag * L1_UINT8_BUF_SIZE_DECODER / sizeof(IN_DTYPE);
+    uint32_t l1b_offset = l1b_pingpong_flag * L1_KV_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+    uint32_t l1_scale_offset = l1_pingpong_flag * L1_SCALE_UINT64_SIZE;
+    uint32_t l1_bias_offset = l1_pingpong_flag * L1_OFFSET_INT32_SIZE;
+    uint32_t l0_offset = l0_pingpong_flag * L0AB_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+    uint32_t l0c_offset = l0_pingpong_flag * L0C_FLOAT_BUF_SIZE;
+    uint32_t l0b_offset = l0b_pingpong_flag * L0AB_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+    uint32_t l1p_start_offset = l1p_pingpong_flag * L1_P_UINT8_BUF_SIZE / sizeof(IN_DTYPE);
+    bool is_l0c_pingpong_off = 0;
+    uint32_t prefill_batch_size_;
+    uint32_t decoder_batch_size_;
+
+};
+#elif __DAV_C220_VEC__
+enum class ScaleType {
+        SCALE_TOR = 0,
+        SCALE_LOGN = 1,
+        SCALE_LOGN_FP32 = 2
+};
+
+template <TilingKeyType tilingKeyType = TilingKeyType::TILING_HALF_DATA, typename IN_DTYPE = half, typename OUT_DTYPE = half, bool SplitKV = false, PagedAttnVariant pagedAttnVariant = PagedAttnVariant::DEFAULT, CompressType compressType = CompressType::COMPRESS_TYPE_UNDEFINED>
+class UnpadAttentionDecoderAiv{
+public:
+    using mm1OutputType = typename AttentionType<tilingKeyType>::mm1OutputType;
+    using mm1CopyType = typename AttentionType<tilingKeyType>::mm1CopyType;
+    using mmBiasType = typename AttentionType<tilingKeyType>::mmBiasType;
+    using mmScaleType = typename AttentionType<tilingKeyType>::mmScaleType;
+    using mm2OutputType = typename AttentionType<tilingKeyType>::mm2OutputType;
+    using mm2CopyType = typename AttentionType<tilingKeyType>::mm2CopyType;
+    static constexpr uint32_t T_BLOCK_SIZE =  BLOCK_SIZE_32 / sizeof(IN_DTYPE);
+    static constexpr uint32_t T_BLOCK_OFFSET = 2 / sizeof(IN_DTYPE);
+
+    __aicore__ __attribute__((always_inline)) inline UnpadAttentionDecoderAiv(uint32_t prefill_batch_size, uint32_t decoder_batch_size) {
+        prefill_batch_size_ = prefill_batch_size;
+        decoder_batch_size_ = decoder_batch_size;
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void SetArgs(
+        __gm__ uint8_t *__restrict__ sync,
+        __gm__ uint8_t* __restrict__ gm_k8,
+        __gm__ uint8_t* __restrict__ gm_v8,
+        __gm__ uint8_t* __restrict__ gm_scale1,
+        __gm__ uint8_t* __restrict__ gm_offset1,
+        __gm__ uint8_t* __restrict__ gm_scale2,
+        __gm__ uint8_t* __restrict__ gm_offset2,
+        __gm__ uint8_t* __restrict__ gm_block_table,
+        __gm__ uint8_t *__restrict__ mask_in_gm,
+        __gm__ uint8_t *__restrict__ o_out_gm,
+        __gm__ uint8_t *__restrict__ s_out_gm,
+        __gm__ uint8_t *__restrict__ p_out_gm,
+        __gm__ uint8_t *__restrict__ o_temp_gm,
+        __gm__ uint8_t *__restrict__ globalo_gm,
+        __gm__ uint8_t *__restrict__ o_core_out_tmp_gm,
+        __gm__ uint8_t *__restrict__ l_in_gm,
+        __gm__ uint8_t* __restrict__ gm_k16,
+        __gm__ uint8_t* __restrict__ gm_v16,
+        __gm__ uint8_t *__restrict__ tiling_para_gm,
+        __gm__ uint8_t *__restrict__ razorOffset,
+        __gm__ uint8_t *__restrict__ logN_in_gm,
+        uint32_t pto_block_idx,
+        uint32_t pto_block_num,
+        uint32_t pto_sub_block_id)
+    {
+        if (sync != nullptr) {
+            set_ffts_base_addr((uint64_t)sync);
+        }
+        block_idx = pto_block_idx;
+        block_num = pto_block_num;
+        sub_block_idx = static_cast<uint64_t>(pto_sub_block_id);
+        set_atomic_none();
+        set_mask_norm();
+        set_vector_mask((uint64_t)-1, (uint64_t)-1);
+
+        mask_gm = reinterpret_cast<__gm__ OUT_DTYPE *>(mask_in_gm);
+        o_gm = reinterpret_cast<__gm__ OUT_DTYPE *>(o_out_gm);
+        s_gm = reinterpret_cast<__gm__ mm1CopyType *>(s_out_gm);
+        p_gm = reinterpret_cast<__gm__ IN_DTYPE *>(p_out_gm);
+        o_tmp_gm = reinterpret_cast<__gm__ mm2CopyType *>(o_temp_gm);
+        go_gm = reinterpret_cast<__gm__ float *>(globalo_gm);
+        o_core_tmp_gm = reinterpret_cast<__gm__ float *>(o_core_out_tmp_gm);
+        tiling_gm = reinterpret_cast<__gm__ uint8_t *>(tiling_para_gm);
+        l_gm = reinterpret_cast<__gm__ float *>(l_in_gm);
+        gm_block_tables_ = reinterpret_cast<__gm__ int32_t*>(gm_block_table);
+        logN_gm = reinterpret_cast<__gm__ float *>(logN_in_gm);
+        num_tokens = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm));
+        q_heads = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_NUMHEADS));
+        embedding_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADDIM));
+        embedding_size_v = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADDIM_V));
+        block_size = (int32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_BLOCKSIZE));
+        max_num_blocks_per_query = (uint32_t)(*((__gm__ uint32_t*)tiling_para_gm + TILING_MAXBLOCKS));
+        tor = (float)(*((__gm__ float *)tiling_para_gm + TILING_TOR));
+        num_kv_heads = (uint32_t)(*((__gm__ uint32_t*)tiling_para_gm + TILING_KVHEADS));
+        former_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_FORMER_BATCH));
+        former_head_split = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_FORMER_HEAD));
+        tail_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_TAIL_BATCH));
+        tail_head_split = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_TAIL_HEAD));
+        max_context_len = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_MASK_MAX_LEN));
+        batch_stride = (uint64_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_BATCH_STRIDE));
+        head_stride = (uint64_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEAD_STRIDE));
+        tiling_head_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_HEADSIZE));
+        tiling_para_size = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_PARASIZE));
+        group_num = (uint32_t)(*((__gm__ uint32_t*)tiling_para_gm + TILING_GROUPNUM));
+
+        kv_split_per_core = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_KVSPLIT));
+        kv_split_core_num = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_KVCORENUM));
+        block_size_calc = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_BLOCKSIZE_CALC));
+        q_head_original = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_QHEADORIGINAL));
+        compressHead = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_COMPRESSHEAD));
+        scaleType = (ScaleType)(*((__gm__ uint32_t *)tiling_para_gm + TILING_SCALETYPE));
+
+            former_group_num_move = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_FORMER_GROUP_MOVE));
+            tail_group_num_move = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_TAIL_GROUP_MOVE));
+
+        go_flag_scalar = 1;
+        gl_flag_scalar = 1;
+
+        modCoef = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_MODCOEF));
+        divCoef = (uint32_t)(*((__gm__ uint32_t *)tiling_para_gm + TILING_DIVCOEF));
+
+        __k = embedding_size;
+        round_k = RoundUp<T_BLOCK_SIZE>(__k);
+
+        core_per_batch = (q_heads + split_size - 1) / split_size;
+        process_num = num_tokens * core_per_batch;
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void Run()
+    {
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        core_per_batch = (q_heads + former_head_split - 1) / former_head_split;
+        process_num = static_cast<uint64_t>(former_batch) * core_per_batch * kv_split_core_num;
+        for (uint32_t process = block_idx; process < process_num; process += uint32_t(block_num)) {
+            uint32_t cur_batch = process / (core_per_batch * kv_split_core_num) + prefill_batch_size_;
+            uint32_t offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+            cur_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 13 + offset_tiling));
+            offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+            uint32_t batch_idx = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 8 + offset_tiling));
+            uint32_t kv_seqlen = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 1 + offset_tiling));
+            tor = (float)(*((__gm__ float *)tiling_gm + TILING_TOR));
+            if(scaleType == ScaleType::SCALE_LOGN_FP32) {
+                float tor_logN = (float)(*((__gm__ float *)logN_gm + cur_batch));
+                tor = tor * tor_logN;
+            }
+            uint32_t kv_seqlen_align = (kv_seqlen + block_size - 1) / block_size * block_size;
+            if (kv_seqlen == 0) {
+                continue;
+            }
+            uint32_t cur_head = (process / kv_split_core_num) % core_per_batch;
+            uint32_t cur_nIndx = process % kv_split_core_num;
+            uint32_t start_head = cur_head * former_head_split;
+            uint32_t cur_kv_seqlen = kv_split_per_core;
+            uint32_t kv_loop = (kv_seqlen_align + kv_split_per_core - 1) /  kv_split_per_core;
+            if (cur_nIndx >= kv_loop) {
+                continue;
+            }
+            if (cur_nIndx == (kv_loop - 1)) {
+                cur_kv_seqlen = kv_seqlen - cur_nIndx * kv_split_per_core;
+            }
+            uint32_t cur_head_num = former_head_split;
+            if (cur_head == (core_per_batch - 1)) {
+                cur_head_num = q_heads - cur_head * former_head_split;
+            }
+            InnerRunVector(batch_idx, start_head, cur_nIndx, cur_kv_seqlen, cur_head_num, offset_tiling, kv_seqlen, embed_split_size_v_former, embed_split_loop_v_former);
+        }
+        if (tail_batch > 0) {
+            core_per_batch = (q_heads + tail_head_split - 1) / tail_head_split;
+            process_num = static_cast<uint64_t>(tail_batch) * core_per_batch;
+            for (uint32_t process = block_idx; process < process_num; process += uint32_t(block_num)) {
+                uint32_t cur_batch = process / core_per_batch + former_batch + prefill_batch_size_;
+                uint32_t offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+                cur_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 13 + offset_tiling));
+                offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+                uint32_t batch_idx = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 8 + offset_tiling));
+                uint32_t kv_seqlen = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 1 + offset_tiling));
+                if (kv_seqlen == 0) {
+                    continue;
+                }
+                tor = (float)(*((__gm__ float *)tiling_gm + TILING_TOR));
+                if(scaleType == ScaleType::SCALE_LOGN_FP32) {
+                    float tor_logN = (float)(*((__gm__ float *)logN_gm + cur_batch));
+                    tor = tor * tor_logN;
+                }
+                uint32_t cur_kv_seqlen = kv_seqlen;
+                uint32_t cur_nIndx = 0;
+                uint32_t cur_head = process % core_per_batch;
+                uint32_t cur_head_num = tail_head_split;
+                if (cur_head == (core_per_batch - 1)) {
+                    cur_head_num = q_heads - cur_head * tail_head_split;
+                }
+                uint32_t start_head = (process % core_per_batch) * tail_head_split;
+                InnerRunVector(batch_idx, start_head, cur_nIndx, cur_kv_seqlen, cur_head_num, offset_tiling, kv_seqlen, embed_split_size_v_tail, embed_split_loop_v_tail);
+            }
+        }
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4);
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2);
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        pipe_barrier(PIPE_ALL);
+        if (SplitKV) {
+            DdrBarrierBeforeFfts();
+            FftsCrossCoreSync<PIPE_MTE3, (uint8_t)0>(REDUCE_READY_DECODER);
+            wait_flag_dev(REDUCE_READY_DECODER);
+            CombineScale(decoder_batch_size_, q_heads, kv_split_core_num, embedding_size);
+        }
+    }
+
+
+private:
+
+    __aicore__ __attribute__((always_inline)) inline void CopyScale(uint32_t sub_m, uint32_t l_offset, uint32_t o_offset)
+    {
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2);
+        copy_ubuf_to_gm_align_b32(
+            l_gm + ((int64_t)l_offset),
+            tv32_ubuf_tensor,
+            0,               // sid
+            sub_m,           // nBurst
+            4,               // lenBurst
+            0,               // leftPaddingNum
+            0,               // rightPaddingNum
+            0,                 // srcGap
+            (kv_split_core_num - 1) * 4 // dstGap
+        );
+        if (gl_flag_scalar == 0) {
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2);
+            gl_flag_scalar = 1;
+        }
+        uint32_t src_gap = ((__k % 16 <= 8) && (__k % 16 > 0))? 1 : 0;
+        copy_ubuf_to_gm_align_b32(
+            o_core_tmp_gm + ((int64_t)o_offset),
+            go32_ubuf_tensor,
+            0,        // sid
+            sub_m,    // nBurst
+            __k * 4,  // lenBurst
+            0,        // leftPaddingNum
+            0,        // rightPaddingNum
+            src_gap,   // srcGap
+            (kv_split_core_num - 1) * __k * 4  // dstGap
+        );
+    }
+    __aicore__ __attribute__((always_inline)) inline void CombineScale(uint32_t num_tokens, uint32_t q_heads, uint32_t kv_split_core_num, uint32_t embedding_size)
+    {
+        set_atomic_none();
+        set_mask_norm();
+        set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        const uint32_t ll_ubuf_stage2_offset = 0;  // Tile: per-split log-sum L in fp32
+        const uint32_t lm_ubuf_stage2_offset = 1 * STAGE2_UB_UINT8_BLOCK_SIZE;  // Tile: row-wise l max in fp32
+        const uint32_t tl_ubuf_stage2_offset = 1 * STAGE2_UB_UINT8_BLOCK_SIZE + 1 * UB_UINT8_LINE_SIZE; // Tile: tmp shifted L before exp, fp32
+        const uint32_t rs_ubuf_stage2_offset = 2 * STAGE2_UB_UINT8_BLOCK_SIZE + 1 * UB_UINT8_LINE_SIZE; // Tile: row sum after exp, fp32
+        const uint32_t ts_ubuf_stage2_offset = 2 * STAGE2_UB_UINT8_BLOCK_SIZE + 2 * UB_UINT8_LINE_SIZE; // Tile: log(row sum) + l_max scratch, fp32
+        const uint32_t gl_ubuf_stage2_offset = 2 * STAGE2_UB_UINT8_BLOCK_SIZE + 3 * UB_UINT8_LINE_SIZE; // Tile: global combine scale in fp32
+        const uint32_t lo_ubuf_stage2_offset = 4 * STAGE2_UB_UINT8_BLOCK_SIZE + 3 * UB_UINT8_LINE_SIZE;
+        const uint32_t to_ubuf_stage2_offset = 8 * STAGE2_UB_UINT8_BLOCK_SIZE + 3 * UB_UINT8_LINE_SIZE;
+        const uint32_t go_ubuf_stage2_offset = 12 * STAGE2_UB_UINT8_BLOCK_SIZE + 3 * UB_UINT8_LINE_SIZE;
+        const uint32_t go16_ubuf_stage2_offset = 16 * STAGE2_UB_UINT8_BLOCK_SIZE + 3 * UB_UINT8_LINE_SIZE;
+
+        __ubuf__ float *ll_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)ll_ubuf_stage2_offset);
+        __ubuf__ float *lm_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)lm_ubuf_stage2_offset);
+        __ubuf__ float *tl_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)tl_ubuf_stage2_offset);
+        __ubuf__ float *rs_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)rs_ubuf_stage2_offset);
+        __ubuf__ float *ts_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)ts_ubuf_stage2_offset);
+        __ubuf__ float *gl_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)gl_ubuf_stage2_offset);
+        __ubuf__ float *lo_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)lo_ubuf_stage2_offset);
+        __ubuf__ float *to_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)to_ubuf_stage2_offset);
+        __ubuf__ float *go_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ float *>((uintptr_t)go_ubuf_stage2_offset);
+        __ubuf__ OUT_DTYPE *go16_ubuf_stage2_tensor =
+            reinterpret_cast<__ubuf__ OUT_DTYPE *>((uintptr_t)go16_ubuf_stage2_offset);
+
+        uint32_t batch_size = num_tokens;
+        uint32_t split_block = 1;
+        uint32_t __k0 = embedding_size;
+        uint32_t roundk_64 = (__k0 + 63) / 64 * 64;
+        uint32_t roundk_8 = (__k0 + 7) / 8 * 8;
+        uint32_t core_per_batch = (q_heads + split_block - 1) / split_block;
+
+        uint32_t process_num = core_per_batch * batch_size;
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        for (uint32_t process = block_idx; process < process_num; process += uint32_t(block_num)){
+            uint32_t cur_batch = process / core_per_batch + prefill_batch_size_;
+            uint32_t offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+            cur_batch = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 13 + offset_tiling));
+            offset_tiling = tiling_head_size + tiling_para_size * cur_batch;
+            uint32_t kv_seqlen = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 1 + offset_tiling));
+            if (kv_seqlen == 0) {
+                continue;
+            }
+            uint32_t addr_o_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 6 + offset_tiling));
+            uint32_t addr_o_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 7 + offset_tiling));
+            uint64_t addr_o_scalar = (uint64_t)(((uint64_t)addr_o_high32) << 32 | addr_o_loww32);
+            uint32_t addr_o_fd_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 15 + offset_tiling));
+            uint32_t addr_o_fd_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 16 + offset_tiling));
+            uint64_t addr_o_fd_scalar = (uint64_t)(((uint64_t)addr_o_fd_high32) << 32 | addr_o_fd_loww32);
+            uint32_t addr_l_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 11 + offset_tiling));
+            uint32_t addr_l_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 12 + offset_tiling));
+            uint64_t addr_l_scalar = (uint64_t)(((uint64_t)addr_l_high32) << 32 | addr_l_loww32);
+
+            uint32_t kv_seqlen_align = (kv_seqlen + block_size - 1) / block_size * block_size;
+            uint32_t m_split = (kv_seqlen_align + kv_split_per_core - 1) /  kv_split_per_core;
+            uint32_t cur_core = process % core_per_batch;
+            uint32_t cur_head_num = split_block; // Number of query heads processed in this split
+            if (cur_core == (core_per_batch - 1)){
+                cur_head_num = q_heads - cur_core * split_block;
+            }
+            uint32_t start_head = cur_core * split_block;
+            uint64_t addr_l_offset = addr_l_scalar;
+            uint64_t addr_o_offset = addr_o_fd_scalar * kv_split_core_num;
+            uint32_t l_remain = m_split % FLOAT_BLOCK_SIZE;
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            copy_gm_to_ubuf_align_b32(
+                ll_ubuf_stage2_tensor,
+                l_gm + (addr_l_offset + start_head * kv_split_core_num),
+                0,                            // sid
+                1,                            // nBurst
+                m_split * 4,                  // lenBurst
+                0,                           // leftPaddingNum
+                FLOAT_BLOCK_SIZE - l_remain,  // rightPaddingNum
+                0,                           // srcGap
+                0   // dstGap
+            );
+
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            __set_mask(m_split);
+            vcmax((lm_ubuf_stage2_tensor), (ll_ubuf_stage2_tensor),
+                (m_split + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                static_cast<Order_t>(ORDER_ONLY_VALUE)
+            );
+            pipe_barrier(PIPE_V);
+
+            // lse_accum - lse_max
+            set_flag(PIPE_V, PIPE_S, EVENT_ID3);
+            wait_flag(PIPE_V, PIPE_S, EVENT_ID3);
+            float lse_max = -(float)(*lm_ubuf_stage2_tensor);
+            set_flag(PIPE_S, PIPE_V, EVENT_ID2);
+            wait_flag(PIPE_S, PIPE_V, EVENT_ID2);
+            vadds((tl_ubuf_stage2_tensor), (ll_ubuf_stage2_tensor), lse_max,
+                (m_split + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+
+            // expf
+            vexp((tl_ubuf_stage2_tensor), (tl_ubuf_stage2_tensor),
+                (m_split + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+
+            // rowsum lse_sum
+            vcadd((rs_ubuf_stage2_tensor), (tl_ubuf_stage2_tensor),
+                (m_split + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                0
+            );
+            pipe_barrier(PIPE_V);
+            __set_mask(cur_head_num);
+            vln((rs_ubuf_stage2_tensor), (rs_ubuf_stage2_tensor),
+                (cur_head_num + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+
+            // logf(lse_sum) + lse_max
+            vadd((ts_ubuf_stage2_tensor), (rs_ubuf_stage2_tensor), (lm_ubuf_stage2_tensor),
+                (cur_head_num + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+
+            // scale = expf(lse_accum(l) - lse_logsum)
+            __set_mask(m_split);
+            set_flag(PIPE_V, PIPE_S, EVENT_ID3);
+            wait_flag(PIPE_V, PIPE_S, EVENT_ID3);
+            float log_sum = -(float)(*ts_ubuf_stage2_tensor);
+            set_flag(PIPE_S, PIPE_V, EVENT_ID2);
+            wait_flag(PIPE_S, PIPE_V, EVENT_ID2);
+            vadds((gl_ubuf_stage2_tensor), (ll_ubuf_stage2_tensor), log_sum,
+                (m_split + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+
+            __set_mask(m_split);
+            vexp((gl_ubuf_stage2_tensor), (gl_ubuf_stage2_tensor),
+                (m_split + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+            // msplit * 1 * embedding
+            copy_gm_to_ubuf_align_b32(
+                lo_ubuf_stage2_tensor,
+                o_core_tmp_gm + (addr_o_offset + start_head * kv_split_core_num * __k0),
+                0,                                           // sid
+                m_split,                                     // nBurst
+                __k0 * 4,                                    // lenBurst
+                0,                                           // leftPaddingNum
+                0,                                           // rightPaddingNum
+                0,                                           // srcGap
+                0                                            // dstGap
+            );
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+            for (uint32_t n_idx = 0; n_idx < m_split; n_idx++){
+                set_flag(PIPE_V, PIPE_S, EVENT_ID3);
+                wait_flag(PIPE_V, PIPE_S, EVENT_ID3);
+                float scale = (float)(*(gl_ubuf_stage2_tensor + n_idx));
+                set_flag(PIPE_S, PIPE_V, EVENT_ID2);
+                wait_flag(PIPE_S, PIPE_V, EVENT_ID2);
+
+                vmuls((to_ubuf_stage2_tensor), (lo_ubuf_stage2_tensor + (n_idx * roundk_8)), scale,
+                (roundk_64 + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+                pipe_barrier(PIPE_V);
+
+                if (n_idx == 0){
+                    vadds((go_ubuf_stage2_tensor), (to_ubuf_stage2_tensor), 0,
+                roundk_64 / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+                    pipe_barrier(PIPE_V);
+
+                }
+                else{
+                    vadd((go_ubuf_stage2_tensor), (to_ubuf_stage2_tensor), (go_ubuf_stage2_tensor),
+                roundk_64 / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+                    pipe_barrier(PIPE_V);
+
+                }
+            }
+            conv_v<float, OUT_DTYPE>(go16_ubuf_stage2_tensor,
+                go_ubuf_stage2_tensor,
+                roundk_64 / FLOAT_VECTOR_SIZE,   // repeat
+                1,                               // dstBlockStride
+                1,                               // srcBlockStride
+                4,                               // dstRepeatStride
+                8                                // srcRepeatStride
+            );
+            pipe_barrier(PIPE_V);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            copy_ubuf_to_gm_align_b16(
+                o_gm + (addr_o_scalar + start_head * __k0),
+                go16_ubuf_stage2_tensor,
+                0,                       // sid
+                1,                       // nBurst
+                __k0 * 2,                // lenBurst
+                0,                       // leftPaddingNum
+                0,                       // rightPaddingNum
+                0,                       // srcGap
+                0                        // dstGap
+            );
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        }
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void AddMask(
+        __gm__ OUT_DTYPE *__restrict__ mask_gm_tensor,
+        __ubuf__ OUT_DTYPE *__restrict__ mask_ubuf_tensor,
+        __ubuf__ float *__restrict__ mask32_ubuf_tensor,
+        uint32_t sub_m,
+        uint32_t qk_n,
+        uint32_t qk_round_n,
+        uint32_t mask_offset)
+    {
+        uint32_t mask_repeat_stride = head_stride == 0 ? 0 : qk_round_n / FLOAT_BLOCK_SIZE;
+        uint32_t mask_nburst = head_stride == 0 ? 1 : sub_m;
+        copy_gm_to_ubuf_align_b16(
+            mask_ubuf_tensor,
+            mask_gm_tensor,
+            0,                                 // sid
+            mask_nburst,                             // nBurst
+            qk_n * 2,                          // lenBurst
+            0,                                 // leftPaddingNum
+            0,                                 // rightPaddingNum
+            (max_context_len - qk_n) * 2,      // srcGap
+            0                                  // dstGap
+        );
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        conv_v<OUT_DTYPE, float>(mask32_ubuf_tensor,
+            mask_ubuf_tensor,
+            (mask_nburst * qk_round_n + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,  // repeat
+            1,                                                         // dstBlockStride
+            1,                                                         // srcBlockStride
+            8,                                                         // dstRepeatStride
+            4                                                          // srcRepeatStride
+        );
+        pipe_barrier(PIPE_V);
+        // *** ls = ls + mask
+        if (qk_round_n  > FLOAT_BLOCK_SIZE * 255) {
+            for (uint32_t vadd_idx = 0; vadd_idx < sub_m; ++vadd_idx){
+                vadd((ls32_ubuf_tensor + (vadd_idx * qk_round_n)), (ls32_ubuf_tensor + (vadd_idx * qk_round_n)), (mask32_ubuf_tensor + (vadd_idx * mask_repeat_stride * FLOAT_BLOCK_SIZE)),
+                qk_n / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            }
+            if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+                uint32_t offset = qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE;
+                __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+                for (uint32_t vadd_idx = 0; vadd_idx < sub_m; ++vadd_idx) {
+                    vadd((ls32_ubuf_tensor + (vadd_idx * qk_round_n + offset)), (ls32_ubuf_tensor + (vadd_idx * qk_round_n + offset)), (mask32_ubuf_tensor + (vadd_idx * qk_round_n + offset)),
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1
+            );
+                }
+                set_vector_mask((uint64_t)-1, (uint64_t)-1);
+            }
+        } else {
+            for (uint32_t vadd_idx = 0; vadd_idx < qk_n / FLOAT_VECTOR_SIZE; ++vadd_idx) {
+                vadd((ls32_ubuf_tensor + (vadd_idx * FLOAT_VECTOR_SIZE)), (ls32_ubuf_tensor + (vadd_idx * FLOAT_VECTOR_SIZE)), (mask32_ubuf_tensor + (vadd_idx * FLOAT_VECTOR_SIZE)),
+                sub_m,
+                1,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                mask_repeat_stride
+            );
+            }
+            if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+                __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+                vadd((ls32_ubuf_tensor + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (ls32_ubuf_tensor + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (mask32_ubuf_tensor + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)),
+                sub_m,
+                1,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                mask_repeat_stride
+            );
+                set_vector_mask((uint64_t)-1, (uint64_t)-1);
+            }
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+
+   __aicore__ __attribute__((always_inline)) inline void ReduceMaxRepeatM(
+        __ubuf__ float *__restrict__ dst,
+        __ubuf__ float *__restrict__ src,
+        __ubuf__ float *__restrict__ tempTensor,
+        uint32_t sub_m,
+        uint32_t qk_n,
+        uint32_t qk_round_n)
+    {
+        if (qk_n <= FLOAT_VECTOR_SIZE) {
+            __set_mask(qk_n);
+            vcmax((dst), (src),
+                sub_m,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                static_cast<Order_t>(ORDER_ONLY_VALUE)
+            );
+        } else {
+            copy_ubuf_to_ubuf(
+                tempTensor,
+                src,
+                0,                                             // sid
+                sub_m,                                         // nBurst
+                HALF_VECTOR_SIZE / BLOCK_SIZE,                 // lenBurst
+                (qk_round_n - FLOAT_VECTOR_SIZE) / FLOAT_BLOCK_SIZE,  // srcGap
+                0                                              // dstGap
+            );
+            pipe_barrier(PIPE_V);
+            for (uint32_t rowmax_idx = 1; rowmax_idx < qk_n / FLOAT_VECTOR_SIZE; ++rowmax_idx) {
+                vmax((tempTensor), (tempTensor), (src + (rowmax_idx * FLOAT_VECTOR_SIZE)),
+                sub_m,
+                1,
+                1,
+                1,
+                8,
+                8,
+                qk_round_n / FLOAT_BLOCK_SIZE
+            );
+            pipe_barrier(PIPE_V);
+            }
+            if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+                __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+                vmax((tempTensor), (tempTensor), (src + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)),
+                sub_m,
+                1,
+                1,
+                1,
+                8,
+                8,
+                qk_round_n / FLOAT_BLOCK_SIZE
+            );
+            }
+            pipe_barrier(PIPE_V);
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+            vcmax((dst), (tempTensor),
+                sub_m,
+                1,
+                1,
+                8,
+                static_cast<Order_t>(ORDER_ONLY_VALUE)
+            );
+        }
+        set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        pipe_barrier(PIPE_V);
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void ReduceSumRepeatM(
+        __ubuf__ float *__restrict__ dst,
+        __ubuf__ float *__restrict__ src,
+        uint32_t sub_m,
+        uint32_t qk_n,
+        uint32_t qk_round_n)
+    {
+        if (qk_n <= FLOAT_VECTOR_SIZE) {
+            __set_mask(qk_n);
+            vcadd((dst), (src),
+                sub_m,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                0
+            );
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        } else {
+            for (uint32_t rowsum_idx = 1; rowsum_idx < qk_n / FLOAT_VECTOR_SIZE; ++rowsum_idx) {
+                vadd((src), (src), (src + (rowsum_idx * FLOAT_VECTOR_SIZE)),
+                sub_m,
+                1,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE
+            );
+                pipe_barrier(PIPE_V);
+            }
+            if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+                __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+                vadd((src), (src), (src + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)),
+                sub_m,
+                1,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE
+            );
+                set_vector_mask((uint64_t)-1, (uint64_t)-1);
+            }
+            pipe_barrier(PIPE_V);
+
+            vcadd((dst), (src),
+                sub_m,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                0
+            );
+        }
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void TensorSubValueRepeatM(
+        __ubuf__ float *__restrict__ dst,
+        __ubuf__ float *__restrict__ src,
+        __ubuf__ float *__restrict__ MaxTensor,
+        __ubuf__ float *__restrict__ tempMaxTensor,
+        uint32_t sub_m,
+        uint32_t round_sub_m,
+        uint32_t qk_n,
+        uint32_t qk_round_n)
+    {
+        vbrcb((__ubuf__ uint32_t *)tempMaxTensor, (__ubuf__ uint32_t *)MaxTensor,
+            1,
+            8,
+            round_sub_m / FLOAT_BLOCK_SIZE
+        );
+        pipe_barrier(PIPE_V);
+        for (uint32_t sub_v_idx = 0; sub_v_idx < qk_n / FLOAT_VECTOR_SIZE; ++sub_v_idx) {
+            vsub((dst + (sub_v_idx * FLOAT_VECTOR_SIZE)), (src + (sub_v_idx * FLOAT_VECTOR_SIZE)), (tempMaxTensor),
+                sub_m,
+                1,
+                1,
+                0,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                1
+            );
+        }
+        if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+            __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+            vsub((dst + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (src + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (tempMaxTensor),
+                sub_m,
+                1,
+                1,
+                0,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                1
+            );
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void TensorDivRepeatM(
+        __ubuf__ float *__restrict__ dst,
+        __ubuf__ float *__restrict__ src,
+        __ubuf__ float *__restrict__ src1,
+        uint32_t sub_m, uint32_t qk_n, uint32_t qk_round_n)
+    {
+        pipe_barrier(PIPE_V);
+        for (uint32_t vadd_idx = 0; vadd_idx < qk_n / FLOAT_VECTOR_SIZE; ++vadd_idx) {
+            vdiv((dst + (vadd_idx * FLOAT_VECTOR_SIZE)), (src + (vadd_idx * FLOAT_VECTOR_SIZE)), (src1),
+                sub_m,
+                1,
+                1,
+                0,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                1
+            );
+        }
+        if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+            __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+            vdiv((dst + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (src + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (src1),
+                sub_m,
+                1,
+                1,
+                0,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                1
+            );
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+    __aicore__ __attribute__((always_inline)) inline void TensorMulRepeatM(
+        __ubuf__ float *__restrict__ dst,
+        __ubuf__ float *__restrict__ src,
+        __ubuf__ float *__restrict__ src1,
+        uint32_t sub_m, uint32_t qk_n, uint32_t qk_round_n, uint32_t src1BlockStride
+    ) {
+        pipe_barrier(PIPE_V);
+        for (uint32_t vadd_idx = 0; vadd_idx < qk_n / FLOAT_VECTOR_SIZE; ++vadd_idx) {
+            vmul((dst + (vadd_idx * FLOAT_VECTOR_SIZE)), (src + (vadd_idx * FLOAT_VECTOR_SIZE)), (src1),
+                sub_m,
+                1,
+                1,
+                src1BlockStride,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                1
+            );
+        }
+        if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+            __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+            vmul((dst + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (src + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (src1),
+                sub_m,
+                1,
+                1,
+                src1BlockStride,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                1
+            );
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        }
+        pipe_barrier(PIPE_V);
+    }
+
+
+   __aicore__ __attribute__((always_inline)) inline void SoftmaxStage1(
+        __gm__ IN_DTYPE *__restrict__ p_gm_tensor,
+        __gm__ mm1CopyType *__restrict__ s_gm_tensor,
+        __gm__ OUT_DTYPE *__restrict__ mask_gm_tensor,
+        __ubuf__ float *__restrict__ dm32_ubuf_tensor,
+        __ubuf__ float *__restrict__ ll_ubuf_tensor,
+        __ubuf__ float *__restrict__ pm32_ubuf_tensor,
+        uint32_t n_idx,
+        uint32_t qk_n,
+        uint32_t qk_round_n,
+        uint32_t sub_m,
+        uint32_t mask_offset,
+        const uint32_t sub_n_loop,
+        const uint32_t cur_batch,
+        const uint32_t start_kv,
+        const uint32_t real_n_loop,
+	    const uint32_t head_idx,
+        const uint32_t pm_flag_scalar
+    )
+    {
+        uint32_t sub_m_d128 = (sub_m + 127) / 128;  // up aligned to 128
+        uint32_t sub_m_d64 = (sub_m + 63) / 64;     // up aligned to 128
+        uint32_t round_sub_m = (sub_m + 15) / 16 * 16;
+        float quantMax = (float)1 / (float)127;
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+            copy_gm_to_ubuf(
+                ls32_ubuf_tensor,
+                s_gm_tensor,
+                0,                        // sid
+                1,                        // nBurst
+                sub_m * qk_round_n / FLOAT_BLOCK_SIZE,  // lenBurst
+                0,                        // srcGap
+                0                         // dstGap
+            );
+
+
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        for (uint32_t vadd_idx = 0; vadd_idx < qk_n / FLOAT_VECTOR_SIZE; ++vadd_idx) {
+            vmuls((ls32_ubuf_tensor + (vadd_idx * FLOAT_VECTOR_SIZE)), (ls32_ubuf_tensor + (vadd_idx * FLOAT_VECTOR_SIZE)), tor,
+                sub_m,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE
+            );
+        }
+        if (qk_n % FLOAT_VECTOR_SIZE > 0) {
+            __set_mask(qk_n % FLOAT_VECTOR_SIZE);
+            vmuls((ls32_ubuf_tensor + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (ls32_ubuf_tensor + (qk_n / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), tor,
+                sub_m,
+                1,
+                1,
+                qk_round_n / FLOAT_BLOCK_SIZE,
+                qk_round_n / FLOAT_BLOCK_SIZE
+            );
+            set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        }
+        pipe_barrier(PIPE_V);
+
+        if (max_context_len != 0) {
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            AddMask(mask_gm_tensor, mask_ubuf_tensor, mask32_ubuf_tensor, sub_m, qk_n, qk_round_n, mask_offset);
+            pipe_barrier(PIPE_V);
+            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+        }
+
+        // *** lm = rowmax(ls)
+        ReduceMaxRepeatM(lm32_ubuf_tensor, ls32_ubuf_tensor, lp32_ubuf_tensor, sub_m, qk_n, qk_round_n);
+        if (n_idx != 0) {
+            // *** hm = vmax(lm, gm)
+            vmax((hm32_ubuf_tensor), (lm32_ubuf_tensor), (gm32_ubuf_tensor),
+                sub_m_d64,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+            // *** dm = gm - hm
+            vsub((dm32_ubuf_tensor), (gm32_ubuf_tensor), (hm32_ubuf_tensor),
+                sub_m_d64,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+        } else {
+            // *** hm = lm
+            copy_ubuf_to_ubuf(
+                hm32_ubuf_tensor,
+                lm32_ubuf_tensor,
+                0,                         // sid
+                1,                         // nBurst
+                round_sub_m / FLOAT_BLOCK_SIZE,  // lenBurst
+                0,                         // srcGap
+                0                          // dstGap
+            );
+            pipe_barrier(PIPE_V);
+        }
+        // *** gm = hm
+        copy_ubuf_to_ubuf(
+            gm32_ubuf_tensor,
+            hm32_ubuf_tensor,
+            0,                         // sid
+            1,                         // nBurst
+            round_sub_m / FLOAT_BLOCK_SIZE,  // lenBurst
+            0,                         // srcGap
+            0                          // dstGap
+        );
+        pipe_barrier(PIPE_V);
+        // *** hm_block = expand_to_block(hm), materialized in tv
+        if constexpr (SplitKV) {
+            if (n_idx == 0) {
+                if (gl_flag_scalar == 1) {
+                    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2);
+                    gl_flag_scalar = 0;
+                }
+            }
+        }
+        // *** ls = ls - hm_block
+        TensorSubValueRepeatM(ls32_ubuf_tensor, ls32_ubuf_tensor,
+                           hm32_ubuf_tensor, tv32_ubuf_tensor,
+                           sub_m, round_sub_m, qk_n, qk_round_n);
+        // *** ls = exp(ls)
+        vexp((ls32_ubuf_tensor), (ls32_ubuf_tensor),
+                (sub_m * qk_round_n + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                8,
+                8
+            );
+        pipe_barrier(PIPE_V);
+            // *** lp = castfp32to16(ls)
+            conv_v<float, OUT_DTYPE>(lp_ubuf_tensor,
+                ls32_ubuf_tensor,
+                (sub_m * qk_round_n + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,  // repeat
+                1,                               // dstBlockStride
+                1,                               // srcBlockStride
+                4,                               // dstRepeatStride
+                8                                // srcRepeatStride
+            );
+        pipe_barrier(PIPE_V);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        copy_ubuf_to_gm(
+            p_gm_tensor,
+            lp_ubuf_tensor,
+            0,                        // sid
+            1,                        // nBurst
+            sub_m * qk_round_n * T_BLOCK_OFFSET / T_BLOCK_SIZE,  // lenBurst
+            0,                        // srcGap
+            0                         // dstGap
+        );
+        if (max_context_len != 0){
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        }
+        // *** ll = rowsum(ls32)
+        ReduceSumRepeatM(ll_ubuf_tensor, ls32_ubuf_tensor, sub_m, qk_n, qk_round_n);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
+        pipe_barrier(PIPE_V);
+    }
+
+
+    __aicore__ __attribute__((always_inline)) inline void SoftmaxStage2(
+        __gm__ mm2CopyType *__restrict__ o_tmp_gm_ptr,
+        __ubuf__ float *__restrict__ dm32_ubuf_tensor,
+        __ubuf__ float *__restrict__ ll_ubuf_tensor,
+        __ubuf__ float *__restrict__ pm32_ubuf_tensor,
+        uint32_t n_idx,
+        uint32_t n_loop,
+        uint32_t qk_n,
+        uint32_t qk_round_n,
+        uint32_t sub_m,
+        uint64_t l_offset,
+        uint64_t o_offset,
+        uint32_t head_idx,
+        uint32_t pm_flag_scalar)
+    {
+        uint32_t sub_m_d64 = (sub_m + 63) / 64;     // up aligned to 64
+        uint32_t round_sub_m = (sub_m + 15) / 16 * 16;
+        uint32_t round_k =  RoundUp<16>(__k);
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+            copy_gm_to_ubuf(
+                lo_ubuf_tensor,
+                o_tmp_gm_ptr,
+                0,                    // sid
+                1,                    // nBurst
+                sub_m * round_k / FLOAT_BLOCK_SIZE,  // lenBurst
+                0,                    // srcGap
+                0                     // dstGap
+            );
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        set_vector_mask((uint64_t)-1, (uint64_t)-1);
+        // *** Update L and O
+        if (n_idx != 0) {
+            // *** dm = exp(dm)
+            vexp((dm32_ubuf_tensor), (dm32_ubuf_tensor),
+                sub_m_d64,
+                1,
+                1,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+            // *** gl = dm * gl
+            vmul((gl32_ubuf_tensor), (dm32_ubuf_tensor), (gl32_ubuf_tensor),
+                sub_m_d64,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+            // *** gl = ll + gl
+            vadd((gl32_ubuf_tensor), (gl32_ubuf_tensor), (ll_ubuf_tensor),
+                sub_m_d64,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+            // *** dm_block = expand_to_block(dm), materialized in tv
+            vbrcb((__ubuf__ uint32_t *)tv32_ubuf_tensor, (__ubuf__ uint32_t *)dm32_ubuf_tensor,
+            1,
+            8,
+            round_sub_m / FLOAT_BLOCK_SIZE
+        );
+            pipe_barrier(PIPE_V);
+            if (go_flag_scalar == 1) {
+                wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+                go_flag_scalar = 0;
+            }
+            // *** go = go * dm_block
+            for (uint32_t vmul_idx = 0; vmul_idx < __k / FLOAT_VECTOR_SIZE; ++vmul_idx) {
+                vmul((go32_ubuf_tensor + (vmul_idx * FLOAT_VECTOR_SIZE)), (go32_ubuf_tensor + (vmul_idx * FLOAT_VECTOR_SIZE)), (tv32_ubuf_tensor),
+                sub_m,
+                1,
+                1,
+                0,
+                round_k / FLOAT_BLOCK_SIZE,
+                round_k / FLOAT_BLOCK_SIZE,
+                1
+            );
+            }
+            if (__k % FLOAT_VECTOR_SIZE > 0) {
+                __set_mask(__k % FLOAT_VECTOR_SIZE);
+                vmul((go32_ubuf_tensor + (__k / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (go32_ubuf_tensor + (__k / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (tv32_ubuf_tensor),
+                sub_m,
+                1,
+                1,
+                0,
+                round_k / FLOAT_BLOCK_SIZE,
+                round_k / FLOAT_BLOCK_SIZE,
+                1
+            );
+                set_vector_mask((uint64_t)-1, (uint64_t)-1);
+            }
+            pipe_barrier(PIPE_V);
+            // *** go = lo + go
+            vadd((go32_ubuf_tensor), (go32_ubuf_tensor), (lo_ubuf_tensor),
+                (sub_m * round_k + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+            pipe_barrier(PIPE_V);
+        } else {
+            // *** gl = ll
+            copy_ubuf_to_ubuf(
+                gl32_ubuf_tensor,
+                ll_ubuf_tensor,
+                0,                // sid
+                1,                // nBurst
+                round_sub_m / FLOAT_BLOCK_SIZE,  // lenBurst
+                0,                // srcGap
+                0                 // dstGap
+            );
+            pipe_barrier(PIPE_V);
+            if (go_flag_scalar == 1) {
+                wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+                go_flag_scalar = 0;
+            }
+            // *** go = lo
+            copy_ubuf_to_ubuf(
+                go32_ubuf_tensor,
+                lo_ubuf_tensor,
+                0,                    // sid
+                1,                    // nBurst
+                sub_m * round_k / FLOAT_BLOCK_SIZE,  // lenBurst
+                0,                    // srcGap
+                0                     // dstGap
+            );
+            pipe_barrier(PIPE_V);
+        }
+
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+
+        if (n_idx == n_loop - 1) {
+            // *** gl_block = expand_to_block(gl), materialized in tv
+            vbrcb((__ubuf__ uint32_t *)tv32_ubuf_tensor, (__ubuf__ uint32_t *)gl32_ubuf_tensor,
+            1,
+            8,
+            round_sub_m / FLOAT_BLOCK_SIZE
+        );
+            pipe_barrier(PIPE_V);
+            // *** go = go / gl_block
+            for (uint32_t vdiv_idx = 0; vdiv_idx < __k / FLOAT_VECTOR_SIZE; ++vdiv_idx) {
+                vdiv((go32_ubuf_tensor + (vdiv_idx * FLOAT_VECTOR_SIZE)), (go32_ubuf_tensor + (vdiv_idx * FLOAT_VECTOR_SIZE)), (tv32_ubuf_tensor),
+                sub_m,
+                1,
+                1,
+                0,
+                round_k / FLOAT_BLOCK_SIZE,
+                round_k / FLOAT_BLOCK_SIZE,
+                1
+            );
+            }
+            if (__k % FLOAT_VECTOR_SIZE > 0) {
+                __set_mask(__k % FLOAT_VECTOR_SIZE);
+                vdiv((go32_ubuf_tensor + (__k / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (go32_ubuf_tensor + (__k / FLOAT_VECTOR_SIZE * FLOAT_VECTOR_SIZE)), (tv32_ubuf_tensor),
+                sub_m,
+                1,
+                1,
+                0,
+                round_k / FLOAT_BLOCK_SIZE,
+                round_k / FLOAT_BLOCK_SIZE,
+                1
+            );
+                set_vector_mask((uint64_t)-1, (uint64_t)-1);  // fix hidden_size=96
+            }
+            pipe_barrier(PIPE_V);
+
+            if constexpr (SplitKV) {
+                // log(l)
+                    vln((tv32_ubuf_tensor), (tv32_ubuf_tensor),
+                sub_m,
+                1,
+                1,
+                8,
+                8
+            );
+                    pipe_barrier(PIPE_V);
+                    vbrcb((__ubuf__ uint32_t *)hm32_ubuf_tensor, (__ubuf__ uint32_t *)gm32_ubuf_tensor,
+            1,
+            8,
+            round_sub_m / FLOAT_BLOCK_SIZE
+        );
+                    pipe_barrier(PIPE_V);
+                    // logf(lse_sum) + lse_max
+                    vadd((tv32_ubuf_tensor), (tv32_ubuf_tensor), (hm32_ubuf_tensor),
+                sub_m,
+                1,
+                1,
+                1,
+                8,
+                8,
+                8
+            );
+                    CopyScale(sub_m, l_offset, o_offset);
+            } else {
+
+                // *** go = castfp32to16(go)
+                conv_v<float, OUT_DTYPE>(go_ubuf_tensor,
+                    go32_ubuf_tensor,
+                    (sub_m * round_k + FLOAT_VECTOR_SIZE - 1) / FLOAT_VECTOR_SIZE,  // repeat
+                    1,                            // dstBlockStride
+                    1,                            // srcBlockStride
+                    4,                            // dstRepeatStride
+                    8                             // srcRepeatStride
+                );
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+                copy_ubuf_to_gm_align_b16(
+                    o_gm + ((int64_t)o_offset),
+                    go_ubuf_tensor,
+                    0,        // sid
+                    sub_m,    // nBurst
+                    __k * 2,  // lenBurst
+                    0,        // leftPaddingNum
+                    0,        // rightPaddingNum
+                    0,        // srcGap
+                    0         // dstGap
+                );
+            }
+            // ********************* move O to GM ************************
+            if (go_flag_scalar == 0) {
+                set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+                go_flag_scalar = 1;
+            }
+        }
+    }
+
+
+    __aicore__ __attribute__((always_inline)) inline void InnerRunVector(uint32_t cur_batch, uint32_t start_head, uint32_t cur_nIndx, uint32_t cur_kv_seqlen, uint32_t cur_head_num,
+                                                                         uint32_t offset_tiling, uint32_t kv_seqlen, uint32_t embed_split_size_v, uint32_t embed_split_loop_v)
+    {
+        uint32_t kv_start_head = start_head / group_num; //30 ~ 32
+        uint32_t kv_end_head = (start_head + cur_head_num + group_num - 1) / group_num;
+        uint32_t cur_kvhead_num = kv_end_head - kv_start_head;
+        uint32_t kv_head_idx = kv_start_head + sub_block_idx * cur_kvhead_num / 2;
+        uint32_t head_idx = start_head + sub_block_idx * cur_head_num / 2;
+        uint32_t addr_o_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 6 + offset_tiling));
+        uint32_t addr_o_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 7 + offset_tiling));
+        uint64_t addr_o_scalar = (uint64_t)(((uint64_t)addr_o_high32) << 32 | addr_o_loww32);
+        uint32_t mask_high32 = (uint32_t)(*((__gm__ int32_t *)tiling_gm + 10 + offset_tiling));
+        uint32_t mask_loww32 = (uint32_t)(*((__gm__ int32_t *)tiling_gm + 14 + offset_tiling));
+        uint64_t mask_scalar = (uint64_t)(((uint64_t)mask_high32) << 32 | mask_loww32);
+        uint32_t addr_l_high32 = 0;
+        uint32_t addr_l_loww32 = 0;
+        uint64_t addr_l_scalar = 0;
+        uint64_t o_offset = 0;
+        uint32_t l_offset = 0;
+        // o #((num_tokens, num_heads, kvsplit, head_size))
+        // l  (numt_tokens, num_heads, kvsplit)
+        if constexpr (SplitKV) {
+            addr_l_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 11 + offset_tiling));
+            addr_l_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 12 + offset_tiling));
+            addr_l_scalar = (uint64_t)(((uint64_t)addr_l_high32) << 32 | addr_l_loww32);
+            uint32_t addr_o_fd_high32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 15 + offset_tiling));
+            uint32_t addr_o_fd_loww32 = (uint32_t)(*((__gm__ uint32_t *)tiling_gm + 16 + offset_tiling));
+            uint64_t addr_o_fd_scalar = (uint64_t)(((uint64_t)addr_o_fd_high32) << 32 | addr_o_fd_loww32);
+            o_offset = addr_o_fd_scalar * kv_split_core_num + head_idx * __k * kv_split_core_num + cur_nIndx * __k;
+            l_offset = addr_l_scalar + head_idx * kv_split_core_num + cur_nIndx;
+        } else {
+                o_offset = addr_o_scalar + head_idx * embedding_size;
+        }
+        uint32_t pp_n_scalar = block_size_calc;
+        uint32_t sub_n_loop = pp_n_scalar / block_size;
+        uint32_t real_n_loop = (cur_kv_seqlen + block_size - 1) / block_size;
+
+        uint32_t n_loop = (cur_kv_seqlen + pp_n_scalar - 1) / pp_n_scalar;
+        uint64_t mask_offset = cur_batch % modCoef / divCoef * batch_stride + head_idx * head_stride + (uint64_t)cur_nIndx * kv_split_per_core;
+        mask_offset += mask_scalar;
+
+        uint32_t qk_n = pp_n_scalar;
+        uint32_t qk_round_n = RoundUp<BLOCK_SIZE>(qk_n);
+
+        uint32_t qk_n_2 = pp_n_scalar;
+        uint32_t qk_round_n_2 = RoundUp<BLOCK_SIZE>(qk_n);
+
+        uint32_t sub_m = (sub_block_idx == 1) ? (cur_head_num - cur_head_num / 2) : cur_head_num / 2;
+        uint32_t sub_m_d128 = (sub_m + 127) / 128;  // up aligned to 128
+        uint32_t sub_m_d64 = (sub_m + 63) / 64;     // up aligned to 128
+        uint32_t round_sub_m = (sub_m + 15) / 16 * 16;
+
+        uint32_t start_kv = cur_nIndx * kv_split_per_core;
+
+
+        uint32_t hiddenSizeOffset = kv_head_idx * embedding_size;
+        uint32_t gm_scale_hidden_size = kv_head_idx * embedding_size;
+        uint32_t hiddenSizeOffset1 = k_bias_flag ? hiddenSizeOffset : 0;
+        uint32_t hiddenSizeOffset2 = v_bias_flag ? hiddenSizeOffset : 0;
+        uint32_t sub_m_kv = (sub_block_idx == 1) ? (cur_kvhead_num - cur_kvhead_num / 2) : cur_kvhead_num / 2;
+
+
+        for (uint32_t n_idx = 0; n_idx < n_loop; n_idx+=2) {
+            if (n_idx == (n_loop - 1)) {
+                qk_n = (cur_kv_seqlen - n_idx * pp_n_scalar);
+                qk_round_n = RoundUp<16>(qk_n);
+            }
+            if ((n_idx + 1) == (n_loop - 1)) {
+                qk_n_2 = (cur_kv_seqlen - (n_idx + 1) * pp_n_scalar);
+                qk_round_n_2 = RoundUp<16>(qk_n_2);
+            }
+            wait_flag_dev(QK_READY_DECODER);
+            /* ************ softmax1 stage1  ************* */
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
+            if (sub_m > 0) {
+                // input QK shape (sub_m, qk_round_n)
+                SoftmaxStage1(
+                    p_gm + ((uint64_t)block_idx * TMP_SIZE * T_BLOCK_OFFSET +
+                        (uint64_t)sub_block_idx * cur_head_num / 2 * qk_round_n * T_BLOCK_OFFSET),
+                    s_gm + ((int64_t)block_idx * TMP_SIZE_DECODER +
+                        (int64_t)sub_block_idx * cur_head_num / 2 * qk_round_n),
+                    mask_gm + (mask_offset + (uint64_t)n_idx * pp_n_scalar),
+                    dm32_ubuf_tensor, ll_ubuf_tensor, pm32_ubuf_tensor,
+                    n_idx, qk_n, qk_round_n, sub_m, mask_offset, sub_n_loop, cur_batch, start_kv, real_n_loop, head_idx, pm_flag_scalar1
+                );
+               // input QK shape (sub_m, qk_round_n)
+            }
+            pipe_barrier(PIPE_MTE3);
+            DdrBarrierBeforeFfts();
+            FftsCrossCoreSync<PIPE_MTE3, 2>(SOFTMAX_READY_DECODER);
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
+            /* ************ softmax1 stage2  ************* */
+            if (n_idx + 1 < n_loop) {
+                wait_flag_dev(QK_READY_STAGE2);
+                if (sub_m > 0) {
+                    SoftmaxStage1(
+                        p_gm + ((uint64_t)block_idx * TMP_SIZE * T_BLOCK_OFFSET  +
+                            (uint64_t)sub_block_idx * cur_head_num / 2 * qk_round_n_2 * T_BLOCK_OFFSET +
+                            TMP_SIZE * T_BLOCK_OFFSET / 2),
+                        s_gm + ((int64_t)block_idx * TMP_SIZE_DECODER +
+                            (int64_t)sub_block_idx * cur_head_num / 2 * qk_round_n_2 +
+                            TMP_SIZE_DECODER / 2),
+                        mask_gm + (mask_offset + (uint64_t)(n_idx + 1) * pp_n_scalar),
+                        dm32_stage2_ubuf_tensor, ll_stage2_ubuf_tensor, pm32_ubuf_stage2_tensor,
+                        (n_idx + 1), qk_n_2, qk_round_n_2, sub_m, mask_offset, sub_n_loop, cur_batch, start_kv, real_n_loop, head_idx, pm_flag_scalar2
+                    );
+
+                }
+                pipe_barrier(PIPE_MTE3);
+                DdrBarrierBeforeFfts();
+                FftsCrossCoreSync<PIPE_MTE3, 2>(SOFTMAX_READY_STAGE2);
+            }
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
+            /* ************ softmax2 stage1  ************* */
+            wait_flag_dev(UPDATE_READY_DECODER);
+            uint32_t embed_split_size = embed_split_size_v;
+            uint32_t round_embed_split_size = RoundUp<BLOCK_SIZE>(embed_split_size);
+            if (sub_m > 0) {
+                    SoftmaxStage2(
+                        o_tmp_gm + ((int64_t)block_idx * TMP_SIZE +
+                        sub_block_idx * cur_head_num / 2 * RoundUp<16>(__k)),
+                        dm32_ubuf_tensor, ll_ubuf_tensor, pm32_ubuf_tensor,
+                        n_idx, n_loop, qk_n, RoundUp<T_BLOCK_SIZE>(qk_round_n), sub_m, l_offset, o_offset, head_idx, pm_flag_scalar1);
+            }
+            /* ************ softmax2 stage2  ************* */
+            embed_split_size = embed_split_size_v;
+            round_embed_split_size = RoundUp<BLOCK_SIZE>(embed_split_size);
+            if (n_idx + 1 < n_loop) {
+                wait_flag_dev(UPDATE_READY_STAGE2);
+                if (sub_m > 0) {
+                        SoftmaxStage2(
+                            o_tmp_gm + ((int64_t)block_idx * TMP_SIZE +
+                            sub_block_idx * cur_head_num / 2 * RoundUp<16>(__k) +
+                                TMP_SIZE / 2),
+                            dm32_stage2_ubuf_tensor, ll_stage2_ubuf_tensor, pm32_ubuf_stage2_tensor,
+                            (n_idx + 1), n_loop, qk_n_2, RoundUp<T_BLOCK_SIZE>(qk_round_n_2), sub_m, l_offset, o_offset, head_idx, pm_flag_scalar2);
+                }
+            }
+        }
+    }
+
+private:
+
+    __gm__ mm1CopyType *__restrict__ s_gm{nullptr};
+    __gm__ IN_DTYPE *__restrict__ p_gm{nullptr};
+    __gm__ mm2CopyType *__restrict__ o_tmp_gm{nullptr};
+    __gm__ float *__restrict__ go_gm{nullptr};
+    __gm__ float *__restrict__ o_core_tmp_gm{nullptr};
+    __gm__ float *__restrict__ l_gm{nullptr};
+    __gm__ int32_t* __restrict__ gm_block_tables_{nullptr};
+
+    __gm__ OUT_DTYPE *__restrict__ o_gm{nullptr};
+    __gm__ OUT_DTYPE *__restrict__ mask_gm{nullptr};
+    __gm__ uint8_t *__restrict__ tiling_gm{nullptr};
+    __gm__ float *__restrict__ logN_gm{nullptr};
+
+    UbufAlloc<pagedAttnVariant> UbAllocator;
+    const uint32_t ls32_ubuf_offset = UbAllocator.ls32_ubuf_offset;
+    const uint32_t lp_ubuf_offset = UbAllocator.lp_ubuf_offset;
+    const uint32_t lp32_ubuf_offset = UbAllocator.lp32_ubuf_offset;
+    const uint32_t mask_ubuf_offset = UbAllocator.mask_ubuf_offset;
+    const uint32_t lo_ubuf_offset = UbAllocator.lo_ubuf_offset;
+    const uint32_t mask32_ubuf_offset = UbAllocator.mask32_ubuf_offset;
+    const uint32_t ls16_ubuf_offset = UbAllocator.ls16_ubuf_offset;
+
+    const uint32_t lm32_ubuf_offset = UbAllocator.lm32_ubuf_offset;
+    const uint32_t hm32_ubuf_offset = UbAllocator.hm32_ubuf_offset;
+    const uint32_t pm32_ubuf_offset = UbAllocator.pm32_ubuf_offset;
+    const uint32_t pm32_ubuf_stage2_offset = UbAllocator.pm32_ubuf_stage2_offset;
+    const uint32_t dm32_ubuf_offset = UbAllocator.dm32_ubuf_offset;
+    const uint32_t dm32_ubuf_stage2_offset = UbAllocator.dm32_ubuf_stage2_offset;
+    const uint32_t ll_ubuf_offset = UbAllocator.ll_ubuf_offset;
+    const uint32_t ll_ubuf_stage2_offset = UbAllocator.ll_ubuf_stage2_offset;
+    const uint32_t gm32_ubuf_offset = UbAllocator.gm32_ubuf_offset;
+    const uint32_t gl_ubuf_offset = UbAllocator.gl_ubuf_offset;
+    const uint32_t gl32_ubuf_offset = UbAllocator.gl32_ubuf_offset;
+    const uint32_t go_ubuf_offset = UbAllocator.go_ubuf_offset;
+    const uint32_t go32_ubuf_offset = UbAllocator.go32_ubuf_offset;
+    const uint32_t tv32_ubuf_offset = UbAllocator.tv32_ubuf_offset;
+
+
+
+
+
+    __ubuf__ float *ls32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)ls32_ubuf_offset);
+    __ubuf__ half *ls16_ubuf_tensor = reinterpret_cast<__ubuf__ half *>((uintptr_t)ls32_ubuf_offset);
+    __ubuf__ int32_t *lsint32_ubuf_tensor = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)ls32_ubuf_offset);
+    __ubuf__ IN_DTYPE *lp_ubuf_tensor = reinterpret_cast<__ubuf__ IN_DTYPE *>((uintptr_t)lp_ubuf_offset);
+    __ubuf__ float *lp32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)lp32_ubuf_offset);
+    __ubuf__ OUT_DTYPE *mask_ubuf_tensor = reinterpret_cast<__ubuf__ OUT_DTYPE *>((uintptr_t)mask_ubuf_offset);
+    __ubuf__ float *lo_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)lo_ubuf_offset);
+    __ubuf__ int32_t *loint32_ubuf_tensor = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)lo_ubuf_offset);
+    __ubuf__ float *mask32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)mask32_ubuf_offset);
+    __ubuf__ float *lm32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)lm32_ubuf_offset);
+    __ubuf__ float *hm32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)hm32_ubuf_offset);
+    __ubuf__ float *pm32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)pm32_ubuf_offset);
+    __ubuf__ float *pm32_ubuf_stage2_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)pm32_ubuf_stage2_offset);
+    __ubuf__ float *gm32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)gm32_ubuf_offset);
+    __ubuf__ float *dm32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)dm32_ubuf_offset);
+
+    __ubuf__ float *dm32_stage2_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)dm32_ubuf_stage2_offset);
+    __ubuf__ float *ll_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)ll_ubuf_offset);
+    __ubuf__ float *ll_stage2_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)ll_ubuf_stage2_offset);
+    __ubuf__ OUT_DTYPE *gl_ubuf_tensor = reinterpret_cast<__ubuf__ OUT_DTYPE *>((uintptr_t)gl_ubuf_offset);
+    __ubuf__ float *gl32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)gl32_ubuf_offset);
+    __ubuf__ float *tv32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)tv32_ubuf_offset);
+    __ubuf__ OUT_DTYPE *go_ubuf_tensor = reinterpret_cast<__ubuf__ OUT_DTYPE *>((uintptr_t)go_ubuf_offset);
+    __ubuf__ float *go32_ubuf_tensor = reinterpret_cast<__ubuf__ float *>((uintptr_t)go32_ubuf_offset);
+    __ubuf__ int32_t *goint32_ubuf_tensor = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)go32_ubuf_offset);
+
+
+    __gm__ OUT_DTYPE *gm_k16_ping_{nullptr};
+    __gm__ OUT_DTYPE *gm_k16_pong_{nullptr};
+    __gm__ OUT_DTYPE *gm_v16_ping_{nullptr};
+    __gm__ OUT_DTYPE *gm_v16_pong_{nullptr};
+
+    uint32_t k_bias_flag{0};
+    uint32_t v_bias_flag{0};
+
+    uint32_t go_flag_scalar{1};
+    uint32_t gl_flag_scalar{1};
+    uint32_t pm_flag_scalar1{1};
+    uint32_t pm_flag_scalar2{0};
+    ScaleType scaleType = ScaleType::SCALE_TOR;
+    float tor_logN{0};
+    uint32_t num_tokens{0};
+    uint32_t q_heads{0};
+    uint32_t num_kv_heads{0};
+    uint32_t embedding_size{0};
+    uint32_t embedding_size_v{0};
+    uint32_t block_size{0};
+    uint32_t max_context_len{0};
+    uint32_t start_head{0};
+    uint32_t cur_head_num{0};
+    uint32_t __k{0};
+    uint32_t round_k{0};
+    uint32_t __v{0};
+    uint32_t round_v{0};
+    uint32_t cur_batch{0};
+    float tor{0};
+    uint64_t sub_block_idx{0};
+    uint32_t batch_stride{0};
+    uint32_t head_stride{0};
+    uint64_t former_batch{0};
+    uint32_t former_head_split{0};
+    uint32_t split_size{0};
+    uint64_t tail_batch{0};
+    uint32_t tail_head_split{0};
+    uint32_t core_per_batch{0};
+    uint32_t process_num{0};
+    uint32_t block_idx{0};
+    uint32_t block_num{1};
+    uint32_t tiling_head_size{0};
+    uint32_t tiling_para_size{0};
+    uint32_t kv_split_per_core{0};
+    uint32_t kv_split_core_num{0};
+    uint32_t block_size_calc{0};
+    uint32_t former_group_num_move{1};
+    uint32_t tail_group_num_move{1};
+    uint32_t embed_split_size_v_former{0};
+    uint32_t embed_split_loop_v_former{1};
+    uint32_t embed_split_size_v_tail{0};
+    uint32_t embed_split_loop_v_tail{1};
+
+
+    uint32_t modCoef{0xffffffff}; // batch_idx % modCoef (multi-head adaptive compression tiling)
+    uint32_t divCoef{1}; // batch_idx / divCoef (multi-head adaptive compression tiling)
+    uint32_t q_head_original{0};
+    uint32_t compressHead{0};
+
+    uint32_t max_num_blocks_per_query{0};
+    uint32_t group_num{0};
+
+    uint32_t prefill_batch_size_;
+    uint32_t decoder_batch_size_;
+};
+#endif
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/orchestration/paged_attention_highperf_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/orchestration/paged_attention_highperf_orch.cpp
new file mode 100644
index 000000000..2c27e131a
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/orchestration/paged_attention_highperf_orch.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include <cinttypes>
+#include <cstdint>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+#define FUNC_PA_AIC 0
+#define FUNC_PA_AIV 1
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 16,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    int64_t block_dim = static_cast<int64_t>(orch_args.scalar(0));
+
+    LOG_INFO_V1("SPMD PA highperf: block_dim=%" PRId64, block_dim);
+
+    const Tensor &query = orch_args.tensor(0).ref();
+    const Tensor &key_cache = orch_args.tensor(1).ref();
+    const Tensor &value_cache = orch_args.tensor(2).ref();
+    const Tensor &block_table = orch_args.tensor(3).ref();
+    const Tensor &out = orch_args.tensor(4).ref();
+    const Tensor &s_gm = orch_args.tensor(5).ref();
+    const Tensor &p_gm = orch_args.tensor(6).ref();
+    const Tensor &o_tmp_gm = orch_args.tensor(7).ref();
+    const Tensor &go_gm = orch_args.tensor(8).ref();
+    const Tensor &o_core_tmp_gm = orch_args.tensor(9).ref();
+    const Tensor &l_gm = orch_args.tensor(10).ref();
+    const Tensor &gm_k16 = orch_args.tensor(11).ref();
+    const Tensor &gm_v16 = orch_args.tensor(12).ref();
+    const Tensor &tiling = orch_args.tensor(13).ref();
+    const Tensor &null_tensor = orch_args.tensor(14).ref();
+
+    L0TaskArgs args;
+    args.add_input(query);
+    args.add_input(key_cache);
+    args.add_input(value_cache);
+    args.add_input(block_table);
+    args.add_inout(out);
+    args.add_inout(s_gm);
+    args.add_inout(p_gm);
+    args.add_inout(o_tmp_gm);
+    args.add_inout(go_gm);
+    args.add_inout(o_core_tmp_gm);
+    args.add_inout(l_gm);
+    args.add_inout(gm_k16);
+    args.add_inout(gm_v16);
+    args.add_input(tiling);
+    args.add_input(null_tensor);
+    args.launch_spec.set_block_num(static_cast<int16_t>(block_dim));
+
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_PA_AIC;
+    mk.aiv0_kernel_id = FUNC_PA_AIV;
+    mk.aiv1_kernel_id = FUNC_PA_AIV;
+    rt_submit_task(mk, args);
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/pa_tiling.py b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/pa_tiling.py
new file mode 100644
index 000000000..b98e28ccb
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/pa_tiling.py
@@ -0,0 +1,484 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""
+Python port of the PagedAttention ND tiling logic from ascend-transformer.
+"""
+
+from __future__ import annotations
+
+import struct
+
+import torch
+
+TILING_HEAD_SIZE = 44
+TILING_PARA_SIZE = 17
+
+
+TILING_BATCH = 0
+TILING_NUMHEADS = 1
+TILING_HEADDIM = 2
+TILING_NUMBLOKS = 3
+TILING_BLOCKSIZE = 4
+TILING_MAXBLOCKS = 5
+TILING_TOR = 6
+TILING_KVHEADS = 7
+TILING_FORMER_BATCH = 8
+TILING_FORMER_HEAD = 9
+TILING_TAIL_BATCH = 10
+TILING_TAIL_HEAD = 11
+TILING_HEADNUM_MOVE = 12
+TILING_MASK_MAX_LEN = 13
+TILING_BATCH_STRIDE = 14
+TILING_HEAD_STRIDE = 15
+TILING_KEY = 16
+TILING_HEADSIZE = 17
+TILING_PARASIZE = 18
+TILING_GROUPNUM = 19
+TILING_FORMER_GROUP_MOVE = 20
+TILING_TAIL_GROUP_MOVE = 21
+TILING_MAX_KVSEQLEN = 22
+TILING_KVSPLIT = 23
+TILING_KVCORENUM = 24
+TILING_BLOCKSIZE_CALC = 25
+TILING_TOTAL_BLOCK_NUM = 26
+TILING_PREFILL_BS = 27
+TILING_DECODER_BS = 28
+TILING_HEADDIM_V = 29
+TILING_MODCOEF = 30
+TILING_DIVCOEF = 31
+TILING_QHEADORIGINAL = 32
+TILING_COMPRESSHEAD = 33
+TILING_QUANTYPE = 34
+TILING_DATA_SHAPE_TYPE = 35
+TILING_SCALETYPE = 36
+TILING_MASK_TYPE_ND = 37
+TILING_HEADDIM_K_SPLIT = 38
+TILING_HEADDIM_V_SPLIT = 39
+TILING_HEADDIM_V_SPLIT_VECTOR_FORMER = 40
+TILING_HEADDIM_V_SPLIT_VECTOR_TAIL = 41
+
+
+WORKSPACE_BLOCK_SIZE_DB = 65536
+BLOCK_SIZE_ALIGN = 16
+SPLITKV_RATIO = 0.8
+SPLITHEAD_RATIO = 0.9
+HEADNUM_LIMIT = 128
+HEADNUM_LIMIT_REGU = 32
+EMBEDDING_LIMIT = 128
+MLA_THRESHOLD = 256
+KV_SEQLEN_SLICE = 128
+KV_SEQLEN_SLICE_256 = 256
+KV_SEQLEN_SLICE_512 = 512
+BLOCK_LIMIT = 128 * 128
+BLOCK_LIMIT_NO_PINGPONG_UINT8 = 128 * 256 * 2
+PP_MM = [16, 32, 48, 64, 80, 96, 112, 128]
+PP_BLOCK_BUFFER_SIZE = 128 * 128
+SPECIAL_NUM_TOKENS = 16
+SPECIAL_NUM_HEADS = 32
+
+
+def _round_up(v: int, align: int) -> int:
+    return ((v + align - 1) // align) * align
+
+
+def _ceil_div(a: int, b: int) -> int:
+    return (a + b - 1) // b
+
+
+def _f32_bits(f: float) -> int:
+    return struct.unpack("I", struct.pack("f", float(f)))[0]
+
+
+def _hi32(v: int) -> int:
+    return (v >> 32) & 0xFFFFFFFF
+
+
+def _lo32(v: int) -> int:
+    return v & 0xFFFFFFFF
+
+
+def _u32_to_i32(v: int) -> int:
+    v &= 0xFFFFFFFF
+    return v - 0x100000000 if v & 0x80000000 else v
+
+
+def _calcu_head_nd(num_heads: int, kv_heads: int, former_head_split: int, tail_head_split: int):
+    """CalcuHeadNd: compute group move factors."""
+    kv_real = kv_heads if kv_heads > 0 else num_heads
+    group_num = num_heads // kv_real
+
+    former_group_move = 1
+    if former_head_split % group_num == 0:
+        former_group_move = group_num
+    elif former_head_split < group_num and (kv_real == 1 or group_num % former_head_split == 0):
+        former_group_move = former_head_split
+
+    tail_group_move = 1
+    if tail_head_split > 0:
+        if tail_head_split % group_num == 0:
+            tail_group_move = group_num
+        elif tail_head_split < group_num and (kv_real == 1 or group_num % tail_head_split == 0):
+            tail_group_move = tail_head_split
+
+    return group_num, former_group_move, tail_group_move
+
+
+def _split_core_bn_nd(
+    num_heads: int,
+    kv_heads: int,
+    decoder_batch: int,
+    block_dim: int,
+    max_kv_seq_len: int,
+    block_size: int,
+    is_mla: bool,
+    is_quant: bool,
+):
+    """SplitCoreBNND: split by (Batch, Head) dimensions."""
+    kv_real = kv_heads if kv_heads > 0 else num_heads
+    core_per_batch = _ceil_div(block_dim, decoder_batch)
+
+    if block_dim * SPLITKV_RATIO <= decoder_batch <= block_dim and is_quant and kv_real == 1:
+        core_per_batch = 1
+
+    head_split = _ceil_div(num_heads, core_per_batch)
+    head_split = min(head_split, HEADNUM_LIMIT_REGU)
+
+    if decoder_batch == SPECIAL_NUM_TOKENS and num_heads == SPECIAL_NUM_HEADS:
+        head_split = 8
+
+    loop_len = _ceil_div(num_heads, head_split)
+    block = loop_len * decoder_batch
+
+    former_batch = decoder_batch
+    tail_batch = 0
+    former_head_split = head_split
+    tail_head_split = 0
+
+    if block > block_dim:
+        process_loop = block // block_dim
+        former_batch = process_loop * block_dim // loop_len
+        tail_batch = decoder_batch - former_batch
+        process_remain = tail_batch * loop_len
+        adj_last_head = (process_remain < SPECIAL_NUM_TOKENS) and (tail_batch > 0)
+        if (num_heads != kv_real) and not (kv_real == 1):
+            adj_last_head = adj_last_head and (tail_batch <= block_dim // 2)
+        if adj_last_head:
+            if is_mla and is_quant:
+                core_per_batch2 = block_dim // tail_batch
+            else:
+                core_per_batch2 = _ceil_div(block_dim, tail_batch)
+            tail_head_split = _ceil_div(num_heads, core_per_batch2)
+            tail_head_split = min(tail_head_split, HEADNUM_LIMIT_REGU)
+        else:
+            former_batch = decoder_batch
+            tail_batch = 0
+
+    eff_block_dim = min(block_dim, block)
+    kv_split_per_core = _round_up(max_kv_seq_len, block_size)
+    kv_split_core_num = 1
+
+    group_num, former_gm, tail_gm = _calcu_head_nd(num_heads, kv_real, former_head_split, tail_head_split)
+    return (
+        eff_block_dim,
+        former_batch,
+        former_head_split,
+        tail_batch,
+        tail_head_split,
+        kv_split_per_core,
+        kv_split_core_num,
+        group_num,
+        former_gm,
+        tail_gm,
+    )
+
+
+def _split_core_bns_nd(
+    num_heads: int,
+    kv_heads: int,
+    decoder_batch: int,
+    block_dim: int,
+    max_kv_seq_len: int,
+    block_size: int,
+    is_long_seq: bool,
+):
+    """SplitCoreBNSND: split by (Batch, Head, KVseq) dimensions."""
+    kv_real = kv_heads if kv_heads > 0 else num_heads
+    kv_seq_aligned = _round_up(max_kv_seq_len, block_size)
+    kv_seq_block_num = kv_seq_aligned // block_size
+
+    if is_long_seq:
+        kv_block_per_core = _ceil_div(kv_seq_block_num, block_dim)
+    else:
+        core_per_batch = _ceil_div(block_dim, decoder_batch)
+        kv_block_per_core = _ceil_div(kv_seq_block_num, core_per_batch)
+
+    kv_split_per_core = kv_block_per_core * block_size
+    kv_split_core_num = _ceil_div(kv_seq_aligned, kv_split_per_core)
+
+    core_per_kv = 1
+    if decoder_batch * kv_split_core_num < block_dim:
+        core_per_kv = _ceil_div(block_dim, decoder_batch * kv_split_core_num)
+
+    head_split = _ceil_div(num_heads, core_per_kv)
+    head_split = min(head_split, HEADNUM_LIMIT_REGU)
+
+    head_core_num = _ceil_div(num_heads, head_split)
+    block = head_core_num * decoder_batch * kv_split_core_num
+    eff_block_dim = min(block_dim, block)
+
+    former_batch = decoder_batch
+    tail_batch = 0
+    former_head_split = head_split
+    tail_head_split = 0
+
+    group_num, former_gm, tail_gm = _calcu_head_nd(num_heads, kv_real, former_head_split, tail_head_split)
+    return (
+        eff_block_dim,
+        former_batch,
+        former_head_split,
+        tail_batch,
+        tail_head_split,
+        kv_split_per_core,
+        kv_split_core_num,
+        group_num,
+        former_gm,
+        tail_gm,
+    )
+
+
+def make_pa_nd_decode_tiling(  # noqa: PLR0913, PLR0915
+    batch: int,
+    kv_seq_lens: list[int],
+    num_heads: int,
+    kv_heads: int,
+    head_dim: int,
+    head_dim_v: int,
+    num_blocks: int,
+    block_size: int,
+    max_blocks_per_query: int,
+    scale: float,
+    block_dim: int,
+    device: str = "npu",
+    dtype: torch.dtype = torch.float16,
+) -> tuple[torch.Tensor, int]:
+    """
+    Build PAGED_ATTENTION_MASK_ND tiling for decode-only GQA.
+
+    Args:
+        batch:               number of sequences
+        kv_seq_lens:         KV context length per sequence
+        num_heads:           number of Q attention heads
+        kv_heads:            number of KV heads (GQA), 0 means = num_heads
+        head_dim:            head dimension for QK
+        head_dim_v:          head dimension for V  (== head_dim for standard GQA)
+        num_blocks:          total number of KV cache blocks
+        block_size:          tokens per KV cache block
+        max_blocks_per_query: max blocks in block_table row
+        scale:               softmax scale (1/sqrt(head_dim) typically)
+        block_dim:           number of cube cores (from device properties)
+        device:              torch device string
+        dtype:               fp16 or bf16 (selects tiling key 0 or 1)
+
+    Returns:
+        (tiling_tensor, effective_block_dim)
+    """
+    kv_real = kv_heads if kv_heads > 0 else num_heads
+    max_kv = max(kv_seq_lens)
+    is_mla = head_dim > MLA_THRESHOLD or head_dim_v > MLA_THRESHOLD or head_dim != head_dim_v
+    is_quant = False  # fp16/bf16 only
+
+    indices: list[int] = sorted(range(batch), key=lambda i: kv_seq_lens[i])
+
+    decoder_batch = batch
+    is_long_seq = max_kv >= KV_SEQLEN_SLICE_512 * 8
+
+    use_bn = is_mla or (decoder_batch * num_heads >= block_dim * SPLITKV_RATIO and not is_long_seq)
+
+    if use_bn:
+        (eff_bd, fB, fH, tB, tH, kvSplit, kvCN, gN, fGM, tGM) = _split_core_bn_nd(
+            num_heads,
+            kv_real,
+            decoder_batch,
+            block_dim,
+            max_kv,
+            block_size,
+            is_mla,
+            is_quant,
+        )
+    else:
+        (eff_bd, fB, fH, tB, tH, kvSplit, kvCN, gN, fGM, tGM) = _split_core_bns_nd(
+            num_heads,
+            kv_real,
+            decoder_batch,
+            block_dim,
+            max_kv,
+            block_size,
+            is_long_seq,
+        )
+
+    if (
+        head_dim % 16 == 0
+        and head_dim <= EMBEDDING_LIMIT
+        and head_dim_v % 16 == 0
+        and head_dim_v <= EMBEDDING_LIMIT
+        and kv_real == num_heads
+        and not is_quant
+    ):
+        head_num_move = 2
+    else:
+        head_num_move = 1
+
+    head_dim_k_split = min(head_dim, MLA_THRESHOLD)
+    head_dim_v_split = min(head_dim_v, MLA_THRESHOLD)
+    head_dim_v_split_former = min(head_dim_v, MLA_THRESHOLD) if fGM <= 64 else min(head_dim_v, EMBEDDING_LIMIT)
+    head_dim_v_split_tail = min(head_dim_v, MLA_THRESHOLD) if tGM <= 64 else min(head_dim_v, EMBEDDING_LIMIT)
+
+    if (
+        block_size <= KV_SEQLEN_SLICE // 2
+        and block_size * 2 * head_dim_k_split <= BLOCK_LIMIT
+        and block_size * 2 * head_dim_v_split <= BLOCK_LIMIT
+    ):
+        block_size_calc = block_size * 2
+    elif block_size >= KV_SEQLEN_SLICE and head_dim == KV_SEQLEN_SLICE_256 and head_dim_v == KV_SEQLEN_SLICE_256:
+        block_size_calc = KV_SEQLEN_SLICE
+    else:
+        block_size_calc = block_size
+
+    is_split_key = int(kvCN > 1)
+    is_split_block = int(
+        block_size >= KV_SEQLEN_SLICE and head_dim == KV_SEQLEN_SLICE_256 and head_dim_v == KV_SEQLEN_SLICE_256
+    )
+    type_key = 0 if dtype == torch.float16 else 1
+    tiling_key = (is_split_block << 7) + (is_split_key << 4) + type_key
+
+    total_words = TILING_HEAD_SIZE + batch * TILING_PARA_SIZE
+    tiling = [0] * total_words
+
+    tiling[TILING_BATCH] = batch
+    tiling[TILING_NUMHEADS] = num_heads
+    tiling[TILING_HEADDIM] = head_dim
+    tiling[TILING_NUMBLOKS] = num_blocks
+    tiling[TILING_BLOCKSIZE] = block_size
+    tiling[TILING_MAXBLOCKS] = max_blocks_per_query
+    tiling[TILING_TOR] = _f32_bits(scale)
+    tiling[TILING_KVHEADS] = kv_real
+    tiling[TILING_FORMER_BATCH] = fB
+    tiling[TILING_FORMER_HEAD] = fH
+    tiling[TILING_TAIL_BATCH] = tB
+    tiling[TILING_TAIL_HEAD] = tH
+    tiling[TILING_HEADNUM_MOVE] = head_num_move
+    tiling[TILING_MASK_MAX_LEN] = 0
+    tiling[TILING_BATCH_STRIDE] = 0
+    tiling[TILING_HEAD_STRIDE] = 0
+    tiling[TILING_KEY] = tiling_key
+    tiling[TILING_HEADSIZE] = TILING_HEAD_SIZE
+    tiling[TILING_PARASIZE] = TILING_PARA_SIZE
+    tiling[TILING_GROUPNUM] = gN
+    tiling[TILING_FORMER_GROUP_MOVE] = fGM
+    tiling[TILING_TAIL_GROUP_MOVE] = tGM
+    tiling[TILING_MAX_KVSEQLEN] = max_kv
+    tiling[TILING_KVSPLIT] = kvSplit
+    tiling[TILING_KVCORENUM] = kvCN
+    tiling[TILING_BLOCKSIZE_CALC] = block_size_calc
+    tiling[TILING_TOTAL_BLOCK_NUM] = 0
+    tiling[TILING_PREFILL_BS] = 0
+    tiling[TILING_DECODER_BS] = batch
+    tiling[TILING_HEADDIM_V] = head_dim_v
+    tiling[TILING_MODCOEF] = 0xFFFFFFFF
+    tiling[TILING_DIVCOEF] = 1
+    tiling[TILING_QHEADORIGINAL] = num_heads
+    tiling[TILING_COMPRESSHEAD] = 0
+    tiling[TILING_QUANTYPE] = 0
+    tiling[TILING_DATA_SHAPE_TYPE] = 0
+    tiling[TILING_SCALETYPE] = 0
+    tiling[TILING_MASK_TYPE_ND] = 0
+    tiling[TILING_HEADDIM_K_SPLIT] = head_dim_k_split
+    tiling[TILING_HEADDIM_V_SPLIT] = head_dim_v_split
+    tiling[TILING_HEADDIM_V_SPLIT_VECTOR_FORMER] = head_dim_v_split_former
+    tiling[TILING_HEADDIM_V_SPLIT_VECTOR_TAIL] = head_dim_v_split_tail
+
+    addr_q = 0
+    addr_o = 0
+    total_q_blk = 0
+
+    for seq_idx in range(batch):
+        kv_seqlen = kv_seq_lens[seq_idx]
+        q_seqlen = 1
+
+        q_aligned = _round_up(q_seqlen, BLOCK_SIZE_ALIGN)
+        m_raw = (PP_BLOCK_BUFFER_SIZE // max(head_dim, block_size) // BLOCK_SIZE_ALIGN) * BLOCK_SIZE_ALIGN
+        m_ubd = min(m_raw, q_aligned)
+        m_ubd = max(m_ubd, BLOCK_SIZE_ALIGN)
+        m_idx = min(7, max(0, m_ubd // 16 - 1))
+        m_ubd = PP_MM[m_idx]
+
+        base = TILING_HEAD_SIZE + seq_idx * TILING_PARA_SIZE
+        tiling[base + 0] = q_seqlen
+        tiling[base + 1] = kv_seqlen
+        tiling[base + 2] = m_ubd
+        tiling[base + 3] = block_size
+        tiling[base + 4] = _hi32(addr_q)
+        tiling[base + 5] = _lo32(addr_q)
+        tiling[base + 6] = _hi32(addr_o)
+        tiling[base + 7] = _lo32(addr_o)
+        tiling[base + 8] = seq_idx
+        tiling[base + 9] = total_q_blk
+        tiling[base + 10] = 0
+        tiling[base + 13] = indices[seq_idx]
+        tiling[base + 14] = 0
+
+        addr_q += num_heads * head_dim * q_seqlen
+        addr_o += num_heads * head_dim_v * q_seqlen
+
+    addr_l = 0
+    addr_ofd = 0
+
+    for seq_idx in range(batch):
+        kv_seqlen = kv_seq_lens[seq_idx]
+        if kv_seqlen == 0:
+            continue
+        q_seqlen = 1
+        base = TILING_HEAD_SIZE + seq_idx * TILING_PARA_SIZE
+        tiling[base + 11] = _hi32(addr_l)
+        tiling[base + 12] = _lo32(addr_l)
+        tiling[base + 15] = _hi32(addr_ofd)
+        tiling[base + 16] = _lo32(addr_ofd)
+        addr_l += kvCN * num_heads * q_seqlen
+        addr_ofd += num_heads * head_dim * q_seqlen
+
+    tiling_i32 = [_u32_to_i32(word) for word in tiling]
+    tiling_tensor = torch.tensor(tiling_i32, dtype=torch.int32, device=device)
+
+    return tiling_tensor, eff_bd
+
+
+def workspace_sizes(
+    batch: int,
+    num_heads: int,
+    head_dim: int,
+    head_dim_v: int,
+    block_dim: int,
+) -> dict[str, int]:
+    """Return byte sizes for each workspace tensor (from PagedAttentionTiling scratch sizes)."""
+    basic_half = block_dim * WORKSPACE_BLOCK_SIZE_DB * 2
+    basic_float = block_dim * WORKSPACE_BLOCK_SIZE_DB * 4
+    o_core = int(block_dim * SPLITKV_RATIO) * num_heads * block_dim * head_dim * 4
+    l_size = int(block_dim * SPLITKV_RATIO) * num_heads * block_dim * 4
+    k16 = 2 * block_dim * 256 * num_heads * head_dim * 2
+    v16 = 2 * block_dim * 256 * num_heads * head_dim_v * 2
+    return {
+        "s": basic_float,
+        "p": basic_half,
+        "o_tmp": basic_float * 2,
+        "go": basic_float,
+        "o_core_tmp": max(16, o_core),
+        "l": max(16, l_size),
+        "k16": max(16, k16),
+        "v16": max(16, v16),
+    }
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/tiling/pa_tiling_struct.h b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/tiling/pa_tiling_struct.h
new file mode 100644
index 000000000..4938fafbf
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/tiling/pa_tiling_struct.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef PAGED_HATTENTION_H
+#define PAGED_HATTENTION_H
+
+#include <cstdint>
+
+namespace AtbOps {
+constexpr int32_t BLOCK_SIZE = 16;
+constexpr int32_t BLOCK_SIZE_32 = 32;
+constexpr int32_t TILING_PARA_SIZE = 17;
+constexpr int32_t TILING_HEAD_SIZE = 44;
+constexpr int32_t TILING_HEAD_SIZE_NZ = 128;
+constexpr int32_t TILING_HEAD_SIZE_910A = 192;
+constexpr int32_t TILING_PARA_SIZE_NZ = 8;
+constexpr int32_t M_LIMIT = 128;
+constexpr int32_t FLOAT_LIMIT = 64;
+constexpr int32_t MAX_EMBEDDING = 576;
+constexpr int32_t ND_BATCH_LIMIT = INT32_MAX;
+constexpr int32_t BLOCK_LIMIT = 128 * 128;
+constexpr int32_t BLOCK_LIMIT_NO_PINGPONG = 128 * 256;
+constexpr int32_t BLOCK_LIMIT_NO_PINGPONG_UINT8 = 128 * 256 * 2;
+constexpr int32_t NZ_BLOCK_SIZE = 16;
+constexpr int32_t TILING_KEY_ID = 16;
+constexpr int32_t MLA_BLOCK_SIZE_LIMIT = 128;
+constexpr int32_t MLA_THRESHOLD = 256;
+constexpr int32_t PREFILL_BATCH = 27;
+constexpr int32_t PARALLEL_MAX_HEAD = 256;
+constexpr int32_t PARALLEL_MAX_BLK_SIZE = 128;
+constexpr int32_t PARALLEL_MAX_BATCH = 2000;
+constexpr int32_t WORKSPACE_BLOCK_SIZE_DB = 65536;  // 128 * 256 * 2
+
+enum class TilingKeyType {
+    TILING_HALF_DATA = 0,
+    TILING_BF16_DATA = 1,
+    TILING_INT8_DATA = 2,
+    TILING_INT8_CUBE_QUANT = 4,
+    TILING_INT8_VEC_QUANT = 8,
+    TILING_INT8_VEC_QUANTBF16 = 9,
+    TILING_QUANT_FP16OUT = 12,
+    TILING_QUANT_BF16OUT = 14
+};
+
+enum class CalcType { CALC_TYPE_DEFAULT = 0, CALC_TYPE_MIX = 1, CALC_TYPE_PREFILL = 2 };
+
+enum class DataShapeType { BSND = 0, BNSD = 1 };
+
+enum class CompressType { COMPRESS_TYPE_UNDEFINED = 0, COMPRESS_TYPE_KVHEAD = 1 };
+
+enum class PagedAttnVariant { DEFAULT = 0, MULTI_LATENT = 1 };
+
+using PagedAttentionInfo = struct PagedAttentionTilingParams {
+    int32_t numTokens = 0;
+    int32_t numHeads = 0;
+    int32_t embeddingSize = 0;
+    int32_t embeddingSizeV = 0;
+    int32_t numBlocks = 0;
+    int32_t blockSize = 0;
+    int32_t maxNumBlocksPerQuery = 0;
+    float tor = 0;
+    int32_t kvHeads = 0;
+    int32_t maxPromptLen = 0;
+    int32_t batchStride = 0;
+    int32_t headStride = 0;
+    TilingKeyType type = TilingKeyType::TILING_HALF_DATA;
+    int32_t batch = 0;
+    int32_t isMaskSquare = 0;
+    int32_t *batchRunStatus{nullptr};
+    int32_t *kvSeqLen{nullptr};
+    int32_t modCoef{-1};
+    int32_t divCoef{1};
+    int32_t *qSeqLen{nullptr};
+    int32_t qHeadOriginal = 0;
+    int32_t compressHead = 0;
+    int32_t tBlockAlign = 16;  // L1 tile alignment: 16 for fp16, 32 for int8
+    int32_t dataShapeType = 0;
+};
+
+using AddrOffsets = struct AddressOffsetInfo {
+    uint64_t addrQSeqOffset = 0;
+    uint64_t addrOSeqOffset = 0;
+    uint64_t addrOFdSeqOffset = 0;
+    uint64_t addrLSeqOffset = 0;
+};
+
+}  // namespace AtbOps
+
+#endif
+// PAGED_HATTENTION_H
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/test_spmd_paged_attention_highperf.py b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/test_spmd_paged_attention_highperf.py
new file mode 100644
index 000000000..8b2ac094c
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/test_spmd_paged_attention_highperf.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""High-performance SPMD paged attention."""
+
+import ctypes
+import math
+import sys
+from pathlib import Path
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+KERNEL_DIR = Path(__file__).resolve().parent / "kernels"
+sys.path.insert(0, str(KERNEL_DIR))
+
+from pa_tiling import make_pa_nd_decode_tiling, workspace_sizes  # noqa: E402
+
+
+def _pack_kv_to_paged(
+    k_dense: torch.Tensor,
+    v_dense: torch.Tensor,
+    num_kv_heads: int,
+    head_dim: int,
+    block_size: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    batch, seq_len, _ = k_dense.shape
+    num_blocks = seq_len // block_size
+    k_page = (
+        k_dense.view(batch, seq_len, num_kv_heads, head_dim)
+        .view(batch, num_blocks, block_size, num_kv_heads, head_dim)
+        .reshape(batch * num_blocks, block_size, num_kv_heads, head_dim)
+        .contiguous()
+    )
+    v_page = (
+        v_dense.view(batch, seq_len, num_kv_heads, head_dim)
+        .view(batch, num_blocks, block_size, num_kv_heads, head_dim)
+        .reshape(batch * num_blocks, block_size, num_kv_heads, head_dim)
+        .contiguous()
+    )
+    block_table = (
+        torch.arange(num_blocks, dtype=torch.int32).unsqueeze(0).expand(batch, -1).clone()
+        + torch.arange(batch, dtype=torch.int32).unsqueeze(1) * num_blocks
+    )
+    return k_page, v_page, block_table
+
+
+def _compute_gqa_golden(
+    q: torch.Tensor,
+    k_page: torch.Tensor,
+    v_page: torch.Tensor,
+    block_table: torch.Tensor,
+    context_lens: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    batch, num_heads, head_dim = q.shape
+    _, block_size, num_kv_heads, _ = k_page.shape
+    heads_per_kv = num_heads // num_kv_heads
+    out = torch.empty(batch, num_heads, head_dim, dtype=q.dtype)
+
+    for batch_idx in range(batch):
+        seq_len = int(context_lens[batch_idx].item())
+        block_count = (seq_len + block_size - 1) // block_size
+        blocks = block_table[batch_idx, :block_count]
+        for head_idx in range(num_heads):
+            kv_head = head_idx // heads_per_kv
+            keys = []
+            values = []
+            remaining = seq_len
+            for block in blocks:
+                valid = min(block_size, remaining)
+                block_id = int(block.item())
+                keys.append(k_page[block_id, :valid, kv_head, :])
+                values.append(v_page[block_id, :valid, kv_head, :])
+                remaining -= valid
+            key = torch.cat(keys, dim=0).float()
+            value = torch.cat(values, dim=0).float()
+            scores = torch.mv(key, q[batch_idx, head_idx].float()) * scale
+            probs = torch.softmax(scores, dim=0)
+            out[batch_idx, head_idx] = torch.mv(value.t(), probs).to(q.dtype)
+
+    return out
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdPagedAttentionHighPerf(SceneTestCase):
+    RTOL = 5e-3
+    ATOL = 2e-2
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_highperf_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+                D.IN,
+            ],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "PA_HIGHPERF_AIC",
+                "source": "kernels/aic/paged_attention_highperf.cpp",
+                "core_type": "aic",
+                "signature": [
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.OUT,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                ],
+            },
+            {
+                "func_id": 1,
+                "name": "PA_HIGHPERF_AIV",
+                "source": "kernels/aic/paged_attention_highperf.cpp",
+                "core_type": "aiv",
+                "signature": [
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.OUT,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                    D.IN,
+                ],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "b1_h32_kv8_s128_bs128_fp16",
+            # onboard a2a3 enabled: the 'out' golden mismatch is closed by the
+            # producer-side DdrBarrierBeforeFfts cross-core DDR fence, validated
+            # over 19 st-onboard-a2a3 rounds.
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 128,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b4_h32_kv8_s512_bs128_fp16",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 4,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 512,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b1_h32_kv8_s16384_bs128_fp16",
+            "manual": True,
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 16384,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b1_h32_kv8_s4096_bs128_fp16",
+            "manual": True,
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 4096,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b1_h32_kv8_s6144_bs128_fp16",
+            "manual": True,
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 6144,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b1_h32_kv8_s8192_bs128_fp16",
+            # enabled in CI to guard the long-sequence fix onboard.
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 8192,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b2_h32_kv8_s4096_bs128_fp16",
+            "manual": True,
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 2,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 4096,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "b2_h32_kv8_s8192_bs128_fp16",
+            "manual": True,
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 2,
+                "num_heads": 32,
+                "num_kv_heads": 8,
+                "head_dim": 128,
+                "kv_seq": 8192,
+                "block_size": 128,
+                "block_dim": 24,
+                "dtype": "float16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        batch = params["batch"]
+        num_heads = params["num_heads"]
+        num_kv_heads = params["num_kv_heads"]
+        head_dim = params["head_dim"]
+        kv_seq = params["kv_seq"]
+        block_size = params["block_size"]
+        block_dim = params["block_dim"]
+        dtype = getattr(torch, params["dtype"])
+        scale = 1.0 / math.sqrt(float(head_dim))
+
+        torch.manual_seed(42)
+        q = torch.randn(batch, num_heads, head_dim, dtype=dtype)
+        k_dense = torch.randn(batch, kv_seq, num_kv_heads * head_dim, dtype=dtype)
+        v_dense = torch.randn(batch, kv_seq, num_kv_heads * head_dim, dtype=dtype)
+        k_page, v_page, block_table = _pack_kv_to_paged(k_dense, v_dense, num_kv_heads, head_dim, block_size)
+        context_lens = torch.tensor([kv_seq] * batch, dtype=torch.int32)
+
+        tiling, effective_block_dim = make_pa_nd_decode_tiling(
+            batch=batch,
+            kv_seq_lens=context_lens.tolist(),
+            num_heads=num_heads,
+            kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            head_dim_v=head_dim,
+            num_blocks=k_page.shape[0],
+            block_size=block_size,
+            max_blocks_per_query=block_table.shape[1],
+            scale=scale,
+            block_dim=block_dim,
+            device="cpu",
+            dtype=dtype,
+        )
+        ws = workspace_sizes(batch, num_heads, head_dim, head_dim, block_dim)
+
+        return TaskArgsBuilder(
+            Tensor("query", q),
+            Tensor("key_cache", k_page),
+            Tensor("value_cache", v_page),
+            Tensor("block_table", block_table),
+            Tensor("out", torch.zeros(batch, num_heads, head_dim, dtype=dtype)),
+            Tensor("s_gm", torch.zeros(ws["s"], dtype=torch.uint8)),
+            Tensor("p_gm", torch.zeros(ws["p"], dtype=torch.uint8)),
+            Tensor("o_tmp_gm", torch.zeros(ws["o_tmp"], dtype=torch.uint8)),
+            Tensor("go_gm", torch.zeros(ws["go"], dtype=torch.uint8)),
+            Tensor("o_core_tmp_gm", torch.zeros(ws["o_core_tmp"], dtype=torch.uint8)),
+            Tensor("l_gm", torch.zeros(ws["l"], dtype=torch.uint8)),
+            Tensor("gm_k16", torch.zeros(ws["k16"], dtype=torch.uint8)),
+            Tensor("gm_v16", torch.zeros(ws["v16"], dtype=torch.uint8)),
+            Tensor("tiling", tiling),
+            Tensor("null", torch.zeros(1, dtype=torch.uint8)),
+            Scalar("effective_block_dim", ctypes.c_int64(effective_block_dim)),
+        )
+
+    def compute_golden(self, args, params):
+        batch = params["batch"]
+        num_heads = params["num_heads"]
+        num_kv_heads = params["num_kv_heads"]
+        head_dim = params["head_dim"]
+        kv_seq = params["kv_seq"]
+        block_size = params["block_size"]
+        scale = 1.0 / math.sqrt(float(head_dim))
+        context_lens = torch.tensor([kv_seq] * batch, dtype=torch.int32)
+        args.out[:] = _compute_gqa_golden(
+            args.query.reshape(batch, num_heads, head_dim),
+            args.key_cache.reshape(-1, block_size, num_kv_heads, head_dim),
+            args.value_cache.reshape(-1, block_size, num_kv_heads, head_dim),
+            args.block_table,
+            context_lens,
+            scale,
+        )
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp
new file mode 100644
index 000000000..f1e66c425
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Starvation Prevention Orchestration
+ *
+ * Submits a large wave of normal MIX tasks followed by sync_start tasks,
+ * then another wave of normal tasks.  The drain mechanism must ensure the
+ * sync_start tasks are not indefinitely delayed by the surrounding load.
+ *
+ * Layout: 3 waves × 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6)
+ *
+ * Normal task: block_num=4, require_sync_start=false  → 4 blocks × 3 slots = 12 CL each
+ * Sync task:   block_num=6, require_sync_start=true   → 6 blocks × 3 slots = 18 CL each
+ *
+ * Total CL: 3×6×12 + 2×18 = 216 + 36 = 252
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_MIX_AIC 0
+#define FUNC_SPMD_MIX_AIV0 1
+#define FUNC_SPMD_MIX_AIV1 2
+
+static constexpr int32_t SLOTS_PER_BLOCK = 3;  // AIC, AIV0, AIV1
+static constexpr int32_t NORMAL_BLOCK_NUM = 4;
+static constexpr int32_t SYNC_BLOCK_NUM = 6;
+static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK;  // 12
+static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK;      // 18
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_mix(const Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_SPMD_MIX_AIC;
+    mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0;
+    mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1;
+
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    args.launch_spec.set_require_sync_start(sync_start);
+    rt_submit_task(mk, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    int64_t cl = 0;
+
+    // Wave 1: 6 normal MIX tasks
+    for (int i = 0; i < 6; i++, cl += NORMAL_CL)
+        submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false);
+
+    // Sync-start task 0: must not be starved by wave 1 or wave 2
+    submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true);
+    cl += SYNC_CL;
+
+    // Wave 2: 6 normal MIX tasks
+    for (int i = 0; i < 6; i++, cl += NORMAL_CL)
+        submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false);
+
+    // Sync-start task 1: must not be starved by wave 2 or wave 3
+    submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true);
+    cl += SYNC_CL;
+
+    // Wave 3: 6 normal MIX tasks
+    for (int i = 0; i < 6; i++, cl += NORMAL_CL)
+        submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false);
+
+    LOG_INFO_V9("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_starvation/test_spmd_starvation.py b/tests/st/a2a3/fully_distributed_within_core/spmd_starvation/test_spmd_starvation.py
new file mode 100644
index 000000000..d21b9c7ab
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_starvation/test_spmd_starvation.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD starvation prevention: 18 normal MIX + 2 sync_start MIX tasks.
+
+Total: 252 CL = 4032 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+SLOTS_PER_BLOCK = 3
+NORMAL_BN, SYNC_BN = 4, 6
+NORMAL_CL, SYNC_CL = NORMAL_BN * SLOTS_PER_BLOCK, SYNC_BN * SLOTS_PER_BLOCK
+
+
+def _build_tasks():
+    tasks, cl = [], 0
+    for _ in range(6):
+        tasks.append((NORMAL_BN, cl))
+        cl += NORMAL_CL
+    tasks.append((SYNC_BN, cl))
+    cl += SYNC_CL
+    for _ in range(6):
+        tasks.append((NORMAL_BN, cl))
+        cl += NORMAL_CL
+    tasks.append((SYNC_BN, cl))
+    cl += SYNC_CL
+    for _ in range(6):
+        tasks.append((NORMAL_BN, cl))
+        cl += NORMAL_CL
+    return tasks
+
+
+TASKS = _build_tasks()
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdStarvation(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_starvation_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp
new file mode 100644
index 000000000..e13fb23a4
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Sync-Start Orchestration
+ *
+ * Submits MIX tasks with require_sync_start=true to verify that the scheduler
+ * atomically launches all blocks before any can run.
+ *
+ * Tasks:
+ *   T0: block_num=2,  require_sync_start=true   (basic sync launch)
+ *   T1: block_num=8,  require_sync_start=true   (larger batch)
+ *   T2: block_num=2,  require_sync_start=false  (normal, as baseline)
+ *   T3: block_num=12, require_sync_start=true   (cross-thread batch)
+ *
+ * Each block writes float(block_idx) to its allocated cache-line slot,
+ * identical to spmd_multiblock_mix so the same kernel binaries can be reused.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_MIX_AIC 0
+#define FUNC_SPMD_MIX_AIV0 1
+#define FUNC_SPMD_MIX_AIV1 2
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_mix(const Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_SPMD_MIX_AIC;
+    mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0;
+    mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1;
+
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    args.launch_spec.set_require_sync_start(sync_start);
+    rt_submit_task(mk, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    // T0: 2 blocks, sync_start=true  (6 CL)
+    submit_mix(ext_output, 2, 0, true);
+    // T1: 8 blocks, sync_start=true  (24 CL)
+    submit_mix(ext_output, 8, 6, true);
+    // T2: 2 blocks, sync_start=false (6 CL, baseline)
+    submit_mix(ext_output, 2, 30, false);
+    // T3: 12 blocks, sync_start=true (36 CL)
+    submit_mix(ext_output, 12, 36, true);
+
+    LOG_INFO_V9("[spmd_sync_start] Submitted 4 tasks (3 sync_start + 1 baseline)");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/test_spmd_sync_start.py
new file mode 100644
index 000000000..61b394873
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/test_spmd_sync_start.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start: 4 MIX tasks (3 sync_start + 1 baseline). Output: 72 CL = 1152 float32."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+SLOTS_PER_BLOCK = 3
+TASKS = [(2, 0), (8, 6), (2, 30), (12, 36)]
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStart(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):
+                    out[(base_cl + block_idx * SLOTS_PER_BLOCK + slot) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp
new file mode 100644
index 000000000..c1f936c07
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Sync-Start AIV Orchestration
+ *
+ * Submits AIV-only tasks with require_sync_start=true to exercise:
+ *   - AIV fast path: count_idle_aiv_cores() >= block_num (small block_num)
+ *   - AIV drain path: block_num exceeds local AIV cores (cross-thread drain)
+ *
+ * Tasks:
+ *   T0: block_num=4,  require_sync_start=true   (fast path)
+ *   T1: block_num=16, require_sync_start=true   (saturate one thread: 8 clusters × 2 AIV)
+ *   T2: block_num=4,  require_sync_start=false  (baseline)
+ *   T3: block_num=24, require_sync_start=true   (cross-thread drain)
+ *
+ * Each block writes float(block_idx) at (base_cl + block_idx) × FLOATS_PER_CACHE_LINE,
+ * reusing the kernel from spmd_multiblock_aiv.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_WRITE_AIV 0
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_aiv(const Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    args.launch_spec.set_require_sync_start(sync_start);
+    rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    // T0: 4 blocks, sync_start=true (fast path: 4 <= idle AIV cores on one thread)
+    submit_aiv(ext_output, 4, 0, true);
+    // T1: 16 blocks, sync_start=true (saturate: 8 clusters × 2 AIV = 16 cores)
+    submit_aiv(ext_output, 16, 4, true);
+    // T2: 4 blocks, sync_start=false (baseline)
+    submit_aiv(ext_output, 4, 20, false);
+    // T3: 24 blocks, sync_start=true (cross-thread drain)
+    submit_aiv(ext_output, 24, 24, true);
+
+    LOG_INFO_V9("[spmd_sync_start_aiv] Submitted 4 AIV tasks (3 sync_start + 1 baseline)");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
new file mode 100644
index 000000000..0af847616
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start AIV: 4 AIV tasks testing fast path and drain. Output: 48 CL = 768 float32."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+TASKS = [(4, 0), (16, 4), (4, 20), (24, 24)]
+TOTAL_CL = sum(bn for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStartAiv(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_aiv_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_WRITE_AIV",
+                "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+                # Single-AIV task: one INOUT tensor in args[]. Declare it so the
+                # tensor dump's per-subtask sum matches the payload (1).
+                "signature": [D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp
new file mode 100644
index 000000000..77a54fd6c
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Sync-Start Boundary Orchestration
+ *
+ * Tests edge-case block_num values relative to per-thread cluster capacity
+ * (8 clusters per sched thread, 24 total clusters).
+ *
+ * Tasks:
+ *   T0: block_num=1,  sync_start=true   (degenerate: always fast path)
+ *   T1: block_num=8,  sync_start=true   (exactly one thread's capacity)
+ *   T2: block_num=9,  sync_start=true   (one over: must enter drain)
+ *   T3: block_num=23, sync_start=true   (max valid: total_clusters - 1)
+ *   T4: block_num=1,  sync_start=false  (baseline)
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_MIX_AIC 0
+#define FUNC_SPMD_MIX_AIV0 1
+#define FUNC_SPMD_MIX_AIV1 2
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,
+    };
+}
+
+static void submit_mix(const Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_SPMD_MIX_AIC;
+    mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0;
+    mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1;
+
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    args.launch_spec.set_require_sync_start(sync_start);
+    rt_submit_task(mk, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    // T0: block_num=1, sync_start=true (degenerate: always fast path, 3 CL)
+    submit_mix(ext_output, 1, 0, true);
+    // T1: block_num=8, sync_start=true (exactly one thread's cluster capacity, 24 CL)
+    submit_mix(ext_output, 8, 3, true);
+    // T2: block_num=9, sync_start=true (one over single thread → must drain, 27 CL)
+    submit_mix(ext_output, 9, 27, true);
+    // T3: block_num=23, sync_start=true (max valid = total_clusters - 1, 69 CL)
+    submit_mix(ext_output, 23, 54, true);
+    // T4: block_num=1, sync_start=false (baseline, 3 CL)
+    submit_mix(ext_output, 1, 123, false);
+
+    LOG_INFO_V9("[spmd_sync_start_edge] Submitted 5 tasks: block_num=1,8,9,23 (sync) + 1 (baseline)");
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/test_spmd_sync_start_edge.py
new file mode 100644
index 000000000..e1fa9021d
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/test_spmd_sync_start_edge.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start edge: boundary block_num values. Output: 126 CL = 2016 float32."""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+SLOTS_PER_BLOCK = 3
+TASKS = [(1, 0), (8, 3), (9, 27), (23, 54), (1, 123)]
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStartEdge(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_edge_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):
+                    out[(base_cl + block_idx * SLOTS_PER_BLOCK + slot) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp
new file mode 100644
index 000000000..8822d719f
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Sync-Start Stress Orchestration (mixed shapes)
+ *
+ * Submits 6 rounds of mixed MIX + AIV tasks to stress-test:
+ *   - Drain CAS contention (multiple sync_start tasks per round)
+ *   - Ack barrier correctness (normal tasks occupy clusters during drain entry)
+ *   - State cleanup between consecutive drain cycles
+ *
+ * Each round (9 tasks):
+ *   4 × normal MIX  (block_num=4,  sync=false) -> 4 × 4 × 3 = 48 CL
+ *   2 × sync   MIX  (block_num=12, sync=true)  -> 2 × 12 × 3 = 72 CL
+ *   2 × sync   AIV  (block_num=8,  sync=true)  -> 2 × 8 × 1 = 16 CL
+ *   1 × normal AIV  (block_num=4,  sync=false) -> 1 × 4 × 1 = 4 CL
+ *   Round total: 140 CL
+ *
+ * 6 rounds → 54 tasks total, 840 CL grand total.
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_MIX_AIC 0
+#define FUNC_SPMD_MIX_AIV0 1
+#define FUNC_SPMD_MIX_AIV1 2
+#define FUNC_SPMD_WRITE_AIV 3
+
+static constexpr int32_t MIX_SLOTS = 3;
+static constexpr int32_t NORMAL_MIX_BN = 4;
+static constexpr int32_t SYNC_MIX_BN = 12;
+static constexpr int32_t SYNC_AIV_BN = 8;
+static constexpr int32_t NORMAL_AIV_BN = 4;
+static constexpr int32_t ROUNDS = 6;
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{.expected_arg_count = 1};
+}
+
+static void submit_mix(const Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    MixedKernels mk;
+    mk.aic_kernel_id = FUNC_SPMD_MIX_AIC;
+    mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0;
+    mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1;
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    args.launch_spec.set_require_sync_start(sync_start);
+    rt_submit_task(mk, args);
+}
+
+static void submit_aiv(const Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    L0TaskArgs args;
+    args.add_inout(out);
+    args.add_scalar(base_cl);
+    args.launch_spec.set_block_num(block_num);
+    args.launch_spec.set_require_sync_start(sync_start);
+    rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args);
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &ext_output = orch_args.tensor(0).ref();
+
+    int64_t cl = 0;
+
+    for (int32_t r = 0; r < ROUNDS; r++) {
+        // 4 × normal MIX
+        for (int i = 0; i < 4; i++, cl += NORMAL_MIX_BN * MIX_SLOTS)
+            submit_mix(ext_output, NORMAL_MIX_BN, cl, false);
+
+        // 2 × sync MIX — CAS contention: second sync task may arrive while first is draining
+        for (int i = 0; i < 2; i++, cl += SYNC_MIX_BN * MIX_SLOTS)
+            submit_mix(ext_output, SYNC_MIX_BN, cl, true);
+
+        // 2 × sync AIV — cross-shape drain contention with the MIX drain above
+        for (int i = 0; i < 2; i++, cl += SYNC_AIV_BN)
+            submit_aiv(ext_output, SYNC_AIV_BN, cl, true);
+
+        // 1 × normal AIV
+        submit_aiv(ext_output, NORMAL_AIV_BN, cl, false);
+        cl += NORMAL_AIV_BN;
+    }
+
+    LOG_INFO_V9("[spmd_sync_start_stress] Submitted %d tasks over %d rounds", 9 * ROUNDS, ROUNDS);
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/test_spmd_sync_start_stress.py
new file mode 100644
index 000000000..85e88fa2d
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/test_spmd_sync_start_stress.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start stress: 54 tasks over 6 rounds with mixed shapes (MIX + AIV).
+
+Grand total: 840 CL = 13440 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16
+ROUNDS = 6
+SHAPE_MIX, SHAPE_AIV = "MIX", "AIV"
+MIX_SLOTS, AIV_SLOTS = 3, 1
+NORMAL_MIX_BN, SYNC_MIX_BN, SYNC_AIV_BN, NORMAL_AIV_BN = 4, 12, 8, 4
+
+
+def _build_tasks():
+    tasks, cl = [], 0
+    for _ in range(ROUNDS):
+        for _ in range(4):
+            tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX))
+            cl += NORMAL_MIX_BN * MIX_SLOTS
+        for _ in range(2):
+            tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX))
+            cl += SYNC_MIX_BN * MIX_SLOTS
+        for _ in range(2):
+            tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV))
+            cl += SYNC_AIV_BN * AIV_SLOTS
+        tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV))
+        cl += NORMAL_AIV_BN * AIV_SLOTS
+    return tasks
+
+
+TASKS = _build_tasks()
+TOTAL_CL = sum(bn * (MIX_SLOTS if s == SHAPE_MIX else AIV_SLOTS) for bn, _, s in TASKS)
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStartStress(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_stress_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+                # Cooperative MIX (AIC+AIV0+AIV1 share one args[]). Declare the
+                # payload signature on exactly ONE subtask so the tensor dump's
+                # per-subtask sum equals the payload (1 INOUT tensor); the AIVs
+                # stay empty or the sum would triple and the dump is skipped.
+                "signature": [D.INOUT],
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 3,
+                "name": "SPMD_WRITE_AIV",
+                "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+                # Separate single-AIV task (not part of the MIX above): its own
+                # args[] has one INOUT tensor, so declare it here.
+                "signature": [D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        return TaskArgsBuilder(Tensor("output", torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        out = args.output
+        for block_num, base_cl, shape in TASKS:
+            for block_idx in range(block_num):
+                if shape == SHAPE_MIX:
+                    for slot in range(MIX_SLOTS):
+                        out[(base_cl + block_idx * MIX_SLOTS + slot) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+                else:
+                    out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/test_l3_dependency.py b/tests/st/a2a3/fully_distributed_within_core/test_l3_dependency.py
new file mode 100644
index 000000000..f2fd8ebf9
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/test_l3_dependency.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""L3 ChipTask → SubTask dependency via TensorMap.
+
+Worker(level=3) submits a ChipTask then a SubTask that depends on it.
+Verifies: TensorMap dependency inference, cross-fork data visibility,
+SubWorker reads result produced by ChipWorker.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+from simpler.task_interface import TaskArgs, TensorArgType
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, make_tensor_arg, scene_test
+from simpler_setup.scene_test import _build_l3_task_args
+
+KERNELS_BASE = "../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+
+
+def verify(args):
+    """SubCallable — dependency target, runs after ChipTask completes."""
+
+
+def run_dag(orch, callables, task_args, config):
+    """L3 orchestration: ChipTask → SubTask dependency."""
+    # ChipTask: tags inside chip_args drive deps (INPUT → lookup; OUTPUT_EXISTING → insert).
+    chip_args, _ = _build_l3_task_args(task_args, callables.vector_kernel_sig)
+    callables.keep(chip_args)  # prevent GC before drain
+
+    orch.submit_next_level(callables.vector_kernel, chip_args, config)
+
+    # SubTask: tag the chip output as INPUT — Orchestrator wires the dep via TensorMap.
+    sub_args = TaskArgs()
+    sub_args.add_tensor(make_tensor_arg(task_args.f), TensorArgType.INPUT)
+    orch.submit_sub(callables.verify, sub_args)
+
+
+@scene_test(level=3, runtime="tensormap_and_ringbuffer")
+class TestL3Dependency(SceneTestCase):
+    """L3: ChipTask produces output, SubTask depends on it via TensorMap."""
+
+    CALLABLE = {
+        "orchestration": run_dag,
+        "callables": [
+            {
+                "name": "vector_kernel",
+                "orchestration": {
+                    "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+                    "function_name": "aicpu_orchestration_entry",
+                    "signature": [D.IN, D.IN, D.OUT],
+                },
+                "incores": [
+                    {
+                        "func_id": 0,
+                        "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                        "core_type": "aiv",
+                        "signature": [D.IN, D.IN, D.OUT],
+                    },
+                    {
+                        "func_id": 1,
+                        "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                        "core_type": "aiv",
+                        "signature": [D.IN, D.OUT],
+                    },
+                    {
+                        "func_id": 2,
+                        "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                        "core_type": "aiv",
+                        "signature": [D.IN, D.IN, D.OUT],
+                    },
+                ],
+            },
+            {"name": "verify", "callable": verify},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"device_count": 1, "num_sub_workers": 1, "block_dim": 3, "aicpu_thread_num": 4},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32).share_memory_()),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32).share_memory_()),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32).share_memory_()),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/fully_distributed_within_core/test_l3_group.py b/tests/st/a2a3/fully_distributed_within_core/test_l3_group.py
new file mode 100644
index 000000000..5dfe1e659
--- /dev/null
+++ b/tests/st/a2a3/fully_distributed_within_core/test_l3_group.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""L3 group task — 2 ChipWorkers (process-isolated) on 1 DAG node.
+
+Each chip runs the same kernel with its own args (different tensors).
+A downstream SubTask depends on the group output.
+Verifies: fork+shm process isolation, 2-chip concurrent execution,
+group completion aggregation, downstream dependency waits for group.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+from simpler.task_interface import TaskArgs, TensorArgType
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, make_tensor_arg, scene_test
+
+KERNELS_BASE = "../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+
+
+def verify(args):
+    """SubCallable — runs after group completes."""
+
+
+def _chip_args(in_a, in_b, out_f) -> TaskArgs:
+    """Build per-chip TaskArgs with INPUT/INPUT/OUTPUT_EXISTING tags."""
+    a = TaskArgs()
+    a.add_tensor(make_tensor_arg(in_a), TensorArgType.INPUT)
+    a.add_tensor(make_tensor_arg(in_b), TensorArgType.INPUT)
+    a.add_tensor(make_tensor_arg(out_f), TensorArgType.OUTPUT_EXISTING)
+    return a
+
+
+def run_dag(orch, callables, task_args, config):
+    """L3 orchestration: group of 2 chips → SubTask dependency."""
+    args0 = _chip_args(task_args.a0, task_args.b0, task_args.f0)
+    args1 = _chip_args(task_args.a1, task_args.b1, task_args.f1)
+    callables.keep(args0, args1)  # prevent GC before drain
+
+    orch.submit_next_level_group(callables.vector_kernel, [args0, args1], config)
+
+    # SubTask depends on both group outputs (f0, f1) — tag both as INPUT.
+    sub_args = TaskArgs()
+    sub_args.add_tensor(make_tensor_arg(task_args.f0), TensorArgType.INPUT)
+    sub_args.add_tensor(make_tensor_arg(task_args.f1), TensorArgType.INPUT)
+    orch.submit_sub(callables.verify, sub_args)
+
+
+@scene_test(level=3, runtime="tensormap_and_ringbuffer")
+class TestL3Group(SceneTestCase):
+    """L3: Group of 2 ChipWorkers as 1 DAG node, SubTask depends on group."""
+
+    CALLABLE = {
+        "orchestration": run_dag,
+        "callables": [
+            {
+                "name": "vector_kernel",
+                "orchestration": {
+                    "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+                    "function_name": "aicpu_orchestration_entry",
+                    "signature": [D.IN, D.IN, D.OUT],
+                },
+                "incores": [
+                    {
+                        "func_id": 0,
+                        "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                        "core_type": "aiv",
+                        "signature": [D.IN, D.IN, D.OUT],
+                    },
+                    {
+                        "func_id": 1,
+                        "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                        "core_type": "aiv",
+                        "signature": [D.IN, D.OUT],
+                    },
+                    {
+                        "func_id": 2,
+                        "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                        "core_type": "aiv",
+                        "signature": [D.IN, D.IN, D.OUT],
+                    },
+                ],
+            },
+            {"name": "verify", "callable": verify},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"device_count": 2, "num_sub_workers": 1, "block_dim": 3, "aicpu_thread_num": 4},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a0", torch.full((SIZE,), 2.0, dtype=torch.float32).share_memory_()),
+            Tensor("b0", torch.full((SIZE,), 3.0, dtype=torch.float32).share_memory_()),
+            Tensor("f0", torch.zeros(SIZE, dtype=torch.float32).share_memory_()),
+            Tensor("a1", torch.full((SIZE,), 2.0, dtype=torch.float32).share_memory_()),
+            Tensor("b1", torch.full((SIZE,), 3.0, dtype=torch.float32).share_memory_()),
+            Tensor("f1", torch.zeros(SIZE, dtype=torch.float32).share_memory_()),
+        )
+
+    def compute_golden(self, args, params):
+        args.f0[:] = (args.a0 + args.b0 + 1) * (args.a0 + args.b0 + 2) + (args.a0 + args.b0)
+        args.f1[:] = (args.a1 + args.b1 + 1) * (args.a1 + args.b1 + 2) + (args.a1 + args.b1)
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)