From df205c974fed4177a96726015df9e180de1f4a97 Mon Sep 17 00:00:00 2001
From: Chao Wang <26245345+ChaoWao@users.noreply.github.com>
Date: Mon, 29 Jun 2026 11:12:37 +0800
Subject: [PATCH] Add: host_build_graph runtime (host-orchestration variant of
 tensormap)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces the host_build_graph (hbg) runtime: the host-orchestration
variant of tensormap_and_ringbuffer. The orchestrator runs on the host to
completion — building the whole task graph, relocating it to device
addresses, and H2D'ing the image — then the device boots scheduler-only.
Only the orchestration timing/location differs from tensormap; the data
structures and dispatch are shared.

Pipeline (host): stage tensors (device alloc + H2D) and SVM-map them so the
host can read control tensors directly; dlopen the orchestration .so and run
it; wire fanout inline during submit (build dep_pool/fanout_head + seed the
ready queue on the host); relocate every cross-task pointer to its final
device address (range-based, two-delta: SM region + arena region); H2D the
populated SM + arena. Device: attach the already-device-addressed image and
dispatch from the seeded ready queue; no on-device orchestrator, no pointer
fixup, no execution-time reclaim. Completion is tracked by completed_tasks_,
independent of last_task_alive.

Simplifications enabled by host-orchestration:
- No device-orch path (the device boots scheduler-only).
- No execution-time reclaim: the whole graph is host-resident and runs once,
  so advance_ring_pointers / reset_for_reuse / the COMPLETED->CONSUMED flip
  are removed. Consumer waits key on fanout_refcount.
- Single ring: with no reclaim and a whole-graph-resident ring, the
  per-scope-depth multi-ring split is gone (PTO2_MAX_RING_DEPTH == 1);
  rings[]/ring_id are physically removed. All scope depths map to ring 0.
- Host reads device control tensors (e.g. paged_attention's context_lens /
  block_table) via SVM (a2a3 halHostRegister identity map; sim is already a
  host pointer), so get_tensor_data dereferences buffer.addr directly.

The SVM map/unmap is exposed through the per-thread DeviceRunner bridge, not
the Runtime.host_api struct, so the HostApi ABI shared with
tensormap_and_ringbuffer / a5 is untouched. tensormap_and_ringbuffer and a5
runtimes are not modified.

Verified on a2a3 onboard: 12 passed / 1 skipped (host_build_graph suite,
incl. paged_attention reading control tensors via SVM).
---
 .../platform/include/common/platform_config.h |   13 +
 .../platform/include/common/tensor_dump.h     |    5 +-
 .../platform/onboard/host/device_runner.cpp   |   37 +
 .../platform/onboard/host/device_runner.h     |    8 +
 .../aicore/aicore_executor.cpp                |  202 ++-
 .../host_build_graph/aicpu/aicpu_executor.cpp | 1426 +++-------------
 .../runtime/host_build_graph/build_config.py  |   28 +-
 .../host_build_graph/common/intrinsic.h       |  199 +++
 .../common/pto_runtime_status.h               |   53 +
 .../host_build_graph/docs/RUNTIME_LOGIC.md    |  822 ++++++++-
 .../docs/SCALAR_DATA_ACCESS.md                |  137 ++
 .../docs/SUBMIT_BY_CLUSTER.md                 |  222 +++
 .../docs/device_log_profiling.md              |  175 ++
 .../host_build_graph/docs/profiling_levels.md |  492 ++++++
 .../host_build_graph/host/dep_gen_replay.cpp  |  787 +++++++++
 .../host_build_graph/host/dep_gen_replay.h    |  106 ++
 .../host/host_orch_compat_stubs.cpp           |   47 +
 .../host/runtime_compile_info.cpp             |    3 +-
 .../host_build_graph/host/runtime_maker.cpp   | 1195 +++++++++----
 .../host_build_graph/orchestration/common.cpp |  197 +++
 .../orchestration/orchestration_api.h         |  103 --
 .../orchestration/pto_arg_with_deps.h         |  140 ++
 .../orchestration/pto_orchestration_api.h     |  385 +++++
 .../runtime/aicore_completion_mailbox.h       |  189 +++
 .../runtime/aicore_completion_mailbox_types.h |   67 +
 .../backend/sdma/sdma_completion_kernel.h     |  143 ++
 .../backend/sdma/sdma_completion_scheduler.h  |   66 +
 .../runtime/host_build_graph/runtime/common.h |   39 +
 .../orchestrator_core/pto_orchestrator.cpp    | 1120 +++++++++++++
 .../orchestrator_core/pto_ring_buffer.cpp     |  185 ++
 .../orchestrator_core/pto_runtime2.cpp        |  308 ++++
 .../runtime/pto2_dispatch_payload.h           |   97 ++
 .../runtime/pto_async_kernel_api.h            |  157 ++
 .../host_build_graph/runtime/pto_async_wait.h |  303 ++++
 .../runtime/pto_completion_token.h            |   45 +
 .../host_build_graph/runtime/pto_constants.h  |   19 +
 .../runtime/pto_dep_compute.h                 |  179 ++
 .../runtime/pto_orchestrator.h                |  189 +++
 .../runtime/pto_ring_buffer.h                 |  774 +++++++++
 .../host_build_graph/runtime/pto_runtime2.h   |  290 ++++
 .../runtime/pto_runtime2_types.h              |  532 +++++-
 .../runtime/pto_shared_memory.h               |  296 ++++
 .../runtime/pto_submit_types.h                |  161 ++
 .../host_build_graph/runtime/pto_tensormap.h  |  730 ++++++++
 .../host_build_graph/runtime/pto_types.h      |  614 +++++++
 .../host_build_graph/runtime/runtime.cpp      |  221 ---
 .../host_build_graph/runtime/runtime.h        |  508 ++----
 .../runtime/scheduler/pto_scheduler.cpp       |  109 ++
 .../runtime/scheduler/pto_scheduler.h         | 1493 +++++++++++++++++
 .../runtime/scheduler/scheduler_cold_path.cpp | 1105 ++++++++++++
 .../scheduler/scheduler_completion.cpp        |  614 +++++++
 .../runtime/scheduler/scheduler_context.h     |  415 +++++
 .../runtime/scheduler/scheduler_dispatch.cpp  | 1473 ++++++++++++++++
 .../runtime/scheduler/scheduler_types.h       |  479 ++++++
 .../runtime/shared/pto_runtime2_init.cpp      |  400 +++++
 .../runtime/shared/pto_shared_memory.cpp      |  258 +++
 .../runtime/shared/pto_tensormap.cpp          |  246 +++
 .../runtime/shared/runtime.cpp                |  158 ++
 .../runtime/tensor_create_info.h              |  147 ++
 .../host_build_graph/runtime/tensor_info.h    |   60 -
 .../platform/include/common/platform_config.h |   13 +
 src/a5/platform/include/common/tensor_dump.h  |    5 +-
 .../platform/onboard/host/c_api_shared.cpp    |   19 +
 .../onboard/host/device_runner_base.h         |   18 +
 src/common/platform/sim/host/c_api_shared.cpp |   17 +
 .../platform/sim/host/device_runner_base.h    |   10 +
 .../bgemm/kernels/aic/kernel_gemm_tile.cpp    |   23 +-
 .../bgemm/kernels/aiv/kernel_tile_add.cpp     |   41 +-
 .../kernels/orchestration/bgemm_orch.cpp      |  179 +-
 .../a2a3/host_build_graph/bgemm/test_bgemm.py |    4 +-
 .../dump_tensor/kernels/aiv/kernel_add.cpp    |   22 +-
 .../kernels/aiv/kernel_add_scalar_inplace.cpp |   17 +-
 .../orchestration/dump_tensor_orch.cpp        |   82 +-
 .../dump_tensor/test_dump_tensor_example.py   |    4 +-
 .../matmul/kernels/aic/kernel_matmul.cpp      |   43 +-
 .../matmul/kernels/aiv/kernel_add_exp.cpp     |   38 +-
 .../matmul/kernels/aiv/kernel_log_sqrt.cpp    |   32 +-
 .../kernels/orchestration/matmul_orch.cpp     |  210 +--
 .../host_build_graph/matmul/test_matmul.py    |    4 +-
 .../kernels/aic/aic_pv_matmul.cpp             |   40 +-
 .../kernels/aic/aic_qk_matmul.cpp             |   42 +-
 .../kernels/aiv/aiv_online_update.cpp         |  251 +--
 .../kernels/aiv/aiv_softmax_prepare.cpp       |   66 +-
 .../orchestration/paged_attention_orch.cpp    |  477 +++---
 .../paged_attention/test_paged_attention.py   |   17 +-
 .../test_prepared_callable.py                 |    4 +-
 .../vector_example/kernels/aiv/kernel_add.cpp |   37 +-
 .../kernels/aiv/kernel_add_scalar.cpp         |   36 +-
 .../vector_example/kernels/aiv/kernel_mul.cpp |   37 +-
 .../kernels/orchestration/example_orch.cpp    |  195 +--
 .../vector_example/test_vector_example.py     |    4 +-
 91 files changed, 19587 insertions(+), 3371 deletions(-)
 create mode 100644 src/a2a3/runtime/host_build_graph/common/intrinsic.h
 create mode 100644 src/a2a3/runtime/host_build_graph/common/pto_runtime_status.h
 create mode 100644 src/a2a3/runtime/host_build_graph/docs/SCALAR_DATA_ACCESS.md
 create mode 100644 src/a2a3/runtime/host_build_graph/docs/SUBMIT_BY_CLUSTER.md
 create mode 100644 src/a2a3/runtime/host_build_graph/docs/device_log_profiling.md
 create mode 100644 src/a2a3/runtime/host_build_graph/docs/profiling_levels.md
 create mode 100644 src/a2a3/runtime/host_build_graph/host/dep_gen_replay.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/host/dep_gen_replay.h
 create mode 100644 src/a2a3/runtime/host_build_graph/host/host_orch_compat_stubs.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/orchestration/common.cpp
 delete mode 100644 src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h
 create mode 100644 src/a2a3/runtime/host_build_graph/orchestration/pto_arg_with_deps.h
 create mode 100644 src/a2a3/runtime/host_build_graph/orchestration/pto_orchestration_api.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox_types.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_kernel.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_scheduler.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/common.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_orchestrator.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_ring_buffer.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_runtime2.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto2_dispatch_payload.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_async_kernel_api.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_async_wait.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_completion_token.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_constants.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_dep_compute.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_orchestrator.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_ring_buffer.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_runtime2.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_shared_memory.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_submit_types.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_tensormap.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/pto_types.h
 delete mode 100644 src/a2a3/runtime/host_build_graph/runtime/runtime.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_cold_path.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_completion.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_context.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_dispatch.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_types.h
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/shared/pto_runtime2_init.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/shared/pto_shared_memory.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/shared/pto_tensormap.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/shared/runtime.cpp
 create mode 100644 src/a2a3/runtime/host_build_graph/runtime/tensor_create_info.h
 delete mode 100644 src/a2a3/runtime/host_build_graph/runtime/tensor_info.h

diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h
index 757db37a5..12abb0ebe 100644
--- a/src/a2a3/platform/include/common/platform_config.h
+++ b/src/a2a3/platform/include/common/platform_config.h
@@ -248,6 +248,19 @@ constexpr int PLATFORM_DUMP_READYQUEUE_SIZE = PLATFORM_MAX_AICPU_THREADS * PLATF
  */
 constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30;
 
+/**
+ * Dump-args mask pool dimensions. The pool is keyed by (ring_id, slot) packed
+ * from a PTO2 task_id, so it must span the largest ring depth and task window
+ * any runtime built against this platform can use. The dump infra is shared by
+ * every runtime (device-orch tensormap_and_ringbuffer at ring depth 4 and the
+ * single-ring host-orch host_build_graph), so these are sized to the maximum
+ * rather than coupled to one runtime's pto_runtime2_types.h — a runtime that
+ * lowers its own PTO2_MAX_RING_DEPTH must not shrink the pool other runtimes
+ * rely on (see set_dump_args_task_mask's ring_id bound check).
+ */
+constexpr uint32_t PLATFORM_DUMP_MASK_POOL_MAX_RINGS = 4;
+constexpr uint32_t PLATFORM_DUMP_MASK_POOL_MAX_SLOTS = 16384;
+
 // =============================================================================
 // PMU Profiling Configuration
 // =============================================================================
diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
index 3be0952f1..86699d60d 100644
--- a/src/a2a3/platform/include/common/tensor_dump.h
+++ b/src/a2a3/platform/include/common/tensor_dump.h
@@ -44,7 +44,6 @@
 #include <cstdint>
 
 #include "common/platform_config.h"
-#include "host_build_graph/runtime/pto_runtime2_types.h"
 
 // =============================================================================
 // Constants
@@ -84,8 +83,8 @@ using TensorDumpArgMask = uint64_t;
 // Zero preserves legacy "dump all tasks" behavior unless selective mode is enabled.
 constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0;
 constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64;
-constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH;
-constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PLATFORM_DUMP_MASK_POOL_MAX_RINGS;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PLATFORM_DUMP_MASK_POOL_MAX_SLOTS;
 constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1;
 constexpr uint8_t TENSOR_DUMP_RECORD_FLAG_ARG_INDEX_AMBIGUOUS = 1u << 0;
 
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 941fcc100..565fd2a55 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -968,6 +968,43 @@ int DeviceRunner::init_scope_stats(int num_threads, int device_id) {
     return 0;
 }
 
+void *DeviceRunner::svm_register(void *dev_ptr, std::size_t bytes) {
+    if (dev_ptr == nullptr || bytes == 0) {
+        return nullptr;
+    }
+    if (load_hal_if_needed() != 0) {
+        LOG_ERROR("svm_register: failed to load ascend_hal: %s", dlerror());
+        return nullptr;
+    }
+    HalHostRegisterFn fn = get_halHostRegister();
+    if (fn == nullptr) {
+        LOG_ERROR("svm_register: halHostRegister symbol not found: %s", dlerror());
+        return nullptr;
+    }
+    void *host_va = nullptr;
+    int rc = fn(dev_ptr, bytes, DEV_SVM_MAP_HOST, device_id_, &host_va);
+    if (rc != 0) {
+        LOG_ERROR("svm_register: halHostRegister failed for dev_ptr %p (rc=%d)", dev_ptr, rc);
+        return nullptr;
+    }
+    return host_va;
+}
+
+void DeviceRunner::svm_unregister(void *dev_ptr) {
+    if (dev_ptr == nullptr) {
+        return;
+    }
+    // halHostUnregister is keyed by the device pointer (mirrors the profiling
+    // finalize path); the HAL maps it back to the host VA internally.
+    HalHostUnregisterFn fn = get_halHostUnregister();
+    if (fn != nullptr) {
+        int rc = fn(dev_ptr, device_id_);
+        if (rc != 0) {
+            LOG_ERROR("svm_unregister: halHostUnregister failed for dev_ptr %p (rc=%d)", dev_ptr, rc);
+        }
+    }
+}
+
 void DeviceRunner::finalize_collectors() {
     auto unregister_cb = [](void *dev_ptr, int device_id) -> int {
         HalHostUnregisterFn fn = get_halHostUnregister();
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 1df2a832f..9ec911fe6 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -114,6 +114,14 @@ class DeviceRunner : public DeviceRunnerBase {
      */
     int run(Runtime &runtime, const CallConfig &config) override;
 
+    // SVM map/unmap a device buffer into host address space via
+    // halHostRegister(DEV_SVM_MAP_HOST) / halHostUnregister. host_build_graph
+    // uses these so its host-side orchestrator can read control tensors whose
+    // buffer.addr is a device address. The returned host VA may differ from
+    // dev_ptr — callers must use it for host access.
+    void *svm_register(void *dev_ptr, std::size_t bytes) override;
+    void svm_unregister(void *dev_ptr) override;
+
     /**
      * a2a3-only `dep_gen` enablement setter. The shared
      * `set_l2_swimlane_enabled`, `set_dump_tensor_enabled`,
diff --git a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
index 30417ae8e..f10737969 100644
--- a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
+++ b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
@@ -14,26 +14,65 @@
 #include "aicore/l2_swimlane_collector_aicore.h"
 #include "aicore/pmu_collector_aicore.h"
 #include "common/l2_swimlane_profiling.h"
-#include "common/platform_config.h"  // Platform configuration (C/C++ compatible)
+#include "common/platform_config.h"  // Register-based communication
+#include "pto2_dispatch_payload.h"
 #include "runtime.h"
 
-typedef void (*KernelFunc)(__gm__ int64_t *);
+/**
+ * Unified function pointer type for kernel dispatch
+ *
+ * All kernels follow the same signature: void kernel(__gm__ int64_t* args)
+ * This enables simple, switch-free dispatch.
+ */
+typedef void (*UnifiedKernelFunc)(__gm__ int64_t *);
 
-__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task *task) {
-    if (task->function_bin_addr == 0) {
+/**
+ * Execute task from PTO2DispatchPayload.
+ *
+ * Reads function_bin_addr and args from the dispatch payload.
+ *
+ * @param payload Pointer to PTO2DispatchPayload in global memory
+ */
+__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2DispatchPayload *payload) {
+    if (payload == nullptr || payload->function_bin_addr == 0) {
         return;
     }
-    KernelFunc kernel = (KernelFunc)task->function_bin_addr;
-    kernel(reinterpret_cast<__gm__ int64_t *>(task->args));
+
+    UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr;
+    kernel(reinterpret_cast<__gm__ int64_t *>(payload->args));
     OUT_OF_ORDER_STORE_BARRIER();
 }
 
+/**
+ * AICore main execution loop
+ *
+ * Implements the AICPU-AICore register-based dispatch protocol:
+ * 1. Wait for AICPU ready signal via handshake buffer
+ * 2. Report physical core ID and core type, signal AICore ready
+ * 3. Cache per-core PTO2DispatchPayload pointer from my_hank->task
+ * 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal
+ *
+ * AICPU writes &s_payload_per_core[i] to my_hank->task before setting
+ * aicpu_ready=1. AICore caches this pointer and reads function_bin_addr +
+ * args pointer from it on each dispatch. reg_val is a monotonically
+ * increasing task ID used only for dispatch signaling and ACK/FIN protocol.
+ *
+ * Profiling state (enable flag, L2 swimlane rotation channel) is published into the platform
+ * via set_aicore_profiling_flag / set_aicore_l2_swimlane_ring at kernel entry —
+ * this routine reads it through the matching getters, so neither Handshake
+ * nor this signature carries profiling fields.
+ *
+ * @param runtime Pointer to Runtime in global memory
+ * @param block_idx Block index (core ID)
+ * @param core_type Core type (AIC or AIV)
+ */
 __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type) {
     __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[block_idx]);
 
     // Phase 1: Wait for AICPU initialization signal
     while (my_hank->aicpu_ready == 0) {
         dcci(my_hank, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
     }
 
     // Phase 2: Report physical core ID, signal ready
@@ -43,6 +82,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
     dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT);
     while (my_hank->aicpu_regs_ready == 0) {
         dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE);
+        SPIN_WAIT_HINT();
     }
     // Report initial idle status via register
     write_reg(RegId::COND, AICORE_IDLE_VALUE);
@@ -50,66 +90,131 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
     // Phase 3: Report core type, signal ready
     my_hank->core_type = core_type;
     OUT_OF_ORDER_STORE_BARRIER();
-    my_hank->aicore_done = block_idx + 1;
+    my_hank->aicore_done = block_idx + 1;  // Signal ready (use block_idx + 1 to avoid 0)
 
     dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);
 
+    // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready)
+    __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task);
+
     uint32_t enable_profiling_flag = get_aicore_profiling_flag();
     bool l2_swimlane_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
     bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
     bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);
 
-    // Per-core L2SwimlaneActiveHead channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp.
-    // Deferred until first task so AICPU's init has populated the rotation
-    // table (the dispatch itself proves init is done).
-    __gm__ L2SwimlaneActiveHead *l2_swimlane_head = nullptr;
+    // Per-core L2SwimlaneActiveHead channel. AICPU completes
+    // `l2_swimlane_aicpu_init` before writing `aicpu_ready = 1` in
+    // `handshake_all_cores`, and Phase 1 above has already observed
+    // `aicpu_ready == 1`, so the rotation-table slot is populated and the
+    // first deref is safe here — off the dispatch→start critical path.
+    __gm__ L2SwimlaneActiveHead *l2_swimlane_head = l2_swimlane_enabled ? get_l2_swimlane_aicore_head() : nullptr;
     // cached_buf_seq must start != AICPU's initial head.current_buf_seq (0)
     // so the first record_task observes a mismatch and loads the buffer ptr.
     L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, UINT32_MAX, 0};
 
-    volatile uint32_t task_id = AICPU_IDLE_TASK_ID;
-    volatile uint32_t last_task_id = AICPU_IDLE_TASK_ID;
+    // Phase 4: Main execution loop - poll register for tasks until exit signal
+    // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit
+    uint32_t reg_val = AICPU_IDLE_TASK_ID;
+    uint32_t last_reg_val = AICPU_IDLE_TASK_ID;
+    bool exiting = false;
 
     while (true) {
-        task_id = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
-        if (task_id == AICORE_EXIT_SIGNAL) {
+        reg_val = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
+        if (reg_val == AICORE_EXIT_SIGNAL) {
             // Signal exit acknowledgment to AICPU
             write_reg(RegId::COND, AICORE_EXITED_VALUE);
             break;
         }
 
-        if (task_id == AICPU_IDLE_TASK_ID || task_id == last_task_id) {
+        // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task)
+        if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) {
             SPIN_WAIT_HINT();
             continue;
         }
 
         {
-            // receive_time captures the instant DATA_MAIN_BASE returned a new
-            // task_id, BEFORE the ack write. Paired with start_time (captured
-            // after task_ptr resolve) it lets DFX split head_OH into the
-            // AICPU→AICore NoC propagation (dispatch_ts → receive_time,
-            // hardware-bound) and the AICore-local ack + task_ptr resolve
-            // (receive_time → start_time). host_build_graph has no per-task
-            // dcci so the local-setup span is naturally tighter than the
-            // tensormap_and_ringbuffer runtime; the field still records it.
+            // receive_time marks the moment AICPU's full "task is ready to
+            // execute" signal landed on this core. Paired with start_time
+            // (captured after the per-task dcci + ack pair) it lets DFX split
+            // head_OH into the AICPU→AICore-ready propagation (dispatch_ts →
+            // receive_time, hardware + scheduling-bound) and the AICore-local
+            // critical-path prep (receive_time → start_time, software-tunable).
+            // Stored in the record as a 32-bit delta `start_time - receive_time`.
+            //
+            // For the common path (not_ready == 0) the new task_id on
+            // DATA_MAIN_BASE is itself the ready signal, so receive_time is
+            // stamped immediately and local_setup covers dcci + ack.
+            //
+            // For the speculative early-dispatch path (not_ready == 1) the
+            // dcci ran BEFORE the dependency-wait spin, so its cost is hidden
+            // behind the doorbell-wait — not on the critical path between
+            // "task genuinely ready" and "kernel begins". receive_time is
+            // re-stamped after the doorbell arrives, so propagation absorbs
+            // both the original NoC delivery AND any speculation overshoot,
+            // while local_setup stays the pure ack-on-critical-path cost. This
+            // makes local_setup the clean "AICore prep we can't hide" figure
+            // for both paths.
             uint64_t receive_time = get_sys_cnt_aicore();
 
-            uint32_t actual_task_id = task_id;
-            write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id));
+            uint32_t task_id = reg_val;  // Decode: register holds task_id directly
+
+            // Select dual-buffer slot: same bit as AICPU used when writing payload
+            __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);
 
-            // First-task lazy resolve of the rotation channel.
-            if (l2_swimlane_enabled && l2_swimlane_head == nullptr) {
-                l2_swimlane_head = get_l2_swimlane_aicore_head();
+            // Invalidate payload buffer (AICPU updates its content each dispatch)
+            dcci(exec_payload, ENTIRE_DATA_CACHE);
+
+            // Speculative early-dispatch gate. A not-ready task was staged on
+            // this core before its dependencies resolved; wait until AICPU rings
+            // the doorbell (DATA_MAIN_BASE high 32 == task_id) before executing.
+            // The ACK is deferred until AFTER the gate so the scheduler keeps the
+            // core off-limits (pending_occupied stays set, no ACK->pending_freed)
+            // while the task is gated — preventing a real task from being
+            // dual-issued behind it. The kernel's own input dcci runs inside
+            // execute_task() below — strictly AFTER this gate — so predecessor
+            // outputs are visible. not_ready == 0 (the common path) skips this.
+            if (exec_payload->not_ready) {
+                while (true) {
+                    // Honor teardown: shutdown overwrites the low half with EXIT.
+                    // Check it on the doorbell-match iteration too, so an EXIT that
+                    // races in right after the matching doorbell still wins over
+                    // executing the gated task.
+                    if (read_dmb_high32() == task_id) {
+                        if (static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE)) == AICORE_EXIT_SIGNAL) {
+                            exiting = true;
+                        }
+                        break;
+                    }
+                    if (static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE)) == AICORE_EXIT_SIGNAL) {
+                        exiting = true;
+                        break;
+                    }
+                    SPIN_WAIT_HINT();
+                }
+                if (exiting) {
+                    write_reg(RegId::COND, AICORE_EXITED_VALUE);
+                    break;
+                }
+                // Re-stamp receive_time at the moment the doorbell landed: the
+                // dcci above ran during the speculative-staging window
+                // (overlapped with the dependency wait, off the critical path).
+                // Propagation now absorbs the speculation overshoot; local_setup
+                // = start - receive stays the pure ack-on-critical-path cost.
+                receive_time = get_sys_cnt_aicore();
             }
 
-            __gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]);
+            write_reg(RegId::COND, MAKE_ACK_VALUE(task_id));
+
+            // Performance profiling: record start time
             uint64_t start_time = get_sys_cnt_aicore();
 
+            // PMU: start counting window around kernel execution
             if (pmu_enabled) {
                 pmu_aicore_begin();
             }
 
-            execute_task(task_ptr);
+            // Execute the task
+            execute_task(exec_payload);
 
             if (pmu_enabled) {
                 pmu_aicore_end();
@@ -119,22 +224,35 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
                 pipe_barrier(PIPE_ALL);
             }
 
+            // Performance profiling: record task execution.
+            // Two identity fields go into the record (different roles):
+            //   - task_token_raw (PTO2 ring/local) is pulled from the dispatch
+            //     payload's LocalContext.async_ctx — already in AICore cache
+            //     from the just-completed task, no extra GM load. Host uses
+            //     it as the canonical task identity for JSON output / ring
+            //     decoding.
+            //   - reg_task_id is `task_id` (= reg_val, the per-core dispatch
+            //     token AICore just read from DATA_MAIN_BASE). Per-dispatch
+            //     unique within this core; host uses it as the join key
+            //     against the AICPU record stream. Required for correctness
+            //     under SPMD (block_num > num_cores) and MIX cluster spread,
+            //     where multiple dispatches of the same task share the same
+            //     task_token_raw.
+            last_reg_val = reg_val;
+            write_reg(RegId::COND, MAKE_FIN_VALUE(task_id));
+
+            // Sample end_time AFTER the FIN write so the op-event end marks the
+            // moment the AICPU can first observe completion — any compute-end ->
+            // FIN gap (epilogue / write-back) shows directly on the bar instead
+            // of being inferred. The record write itself stays off the critical
+            // path (it runs after FIN, so it no longer delays completion).
             if (l2_swimlane_enabled) {
                 uint64_t end_time = get_sys_cnt_aicore();
-                // host_build_graph uses plain task indices; zero-extend into
-                // the task_token_raw slot (identity) AND pass as reg_task_id
-                // (join key). With block_num always == 1 in this runtime
-                // there is no dispatch fan-out per task, so identity and
-                // dispatch token coincide and a single value covers both.
+                uint64_t task_token_raw = exec_payload->local_context.async_ctx.task_token.raw;
                 l2_swimlane_aicore_record_task(
-                    l2_swimlane_head, &l2_swimlane_local, static_cast<uint64_t>(actual_task_id),
-                    static_cast<uint32_t>(actual_task_id), receive_time, start_time, end_time
+                    l2_swimlane_head, &l2_swimlane_local, task_token_raw, task_id, receive_time, start_time, end_time
                 );
             }
-
-            last_task_id = task_id;
-
-            write_reg(RegId::COND, MAKE_FIN_VALUE(actual_task_id));
         }
     }
 
diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
index 6ab91ecf6..e290edfa3 100644
--- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
@@ -8,308 +8,113 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
+#include <unistd.h>
 
 #include <atomic>
+#include <cinttypes>
 #include <cstdint>
 #include <cstdio>
-#include <mutex>
+#include <cstdlib>
+#include <cstring>
+#ifdef __linux__
+#include <sys/mman.h>
+#endif
 
-#include "aicpu/device_log.h"
 #include "aicpu/device_time.h"
+#include "callable_protocol.h"
+#include "pto2_dispatch_payload.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE)
+#include "pto_runtime2.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// Performance profiling headers
 #include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/platform_aicpu_affinity.h"
-#include "aicpu/platform_regs.h"
-#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/scope_stats_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
-#include "callable.h"
-#include "common/memory_barrier.h"
+#include "aicpu/dep_gen_collector_aicpu.h"
 #include "common/l2_swimlane_profiling.h"
-#include "common/platform_config.h"
 #include "common/unified_log.h"
-#include "runtime.h"
-#include "spin_hint.h"
 
-#ifndef unlikely
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#endif
+// Register-based communication
+#include "aicpu/platform_aicpu_affinity.h"
+#include "aicpu/platform_regs.h"
+#include "common/platform_config.h"
 
-constexpr int MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
-constexpr int MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD;
-constexpr int MAX_CORES = PLATFORM_MAX_CORES;
+// Core type definitions
+#include "common/core_type.h"
 
-// Core information for discovery
-struct CoreInfo {
-    int worker_id;              // Index in runtime.workers[]
-    uint32_t physical_core_id;  // Hardware physical core ID (from AICore)
-    uint64_t reg_addr;          // Cached register address for fast access
-    CoreType core_type;
-};
+// CoreCallable for resolved dispatch address
+#include "callable.h"
+
+// Scheduler data structures (CoreExecState, CoreTracker, etc.)
+#include "scheduler/scheduler_types.h"
+
+// Scheduler context class
+#include "scheduler/scheduler_context.h"
+
+// From orchestration/common.cpp linked into this DSO — updates g_current_runtime
+// here (cleared on teardown before runtime_destroy).
+extern "C" void framework_bind_runtime(PTO2Runtime *rt);
+
+static int32_t read_pto2_runtime_status(Runtime *runtime) {  // Fold the SM header's error codes into one status
+    if (runtime == nullptr) {
+        return 0;  // no runtime to inspect: report 0
+    }
+
+    void *sm = runtime->get_gm_sm_ptr();
+    if (sm == nullptr) {
+        return 0;  // no shared-memory region attached: report 0
+    }
+
+    auto *header = static_cast<PTO2SharedMemoryHeader *>(sm);  // the SM region is read as a PTO2SharedMemoryHeader
+    int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire);
+    int32_t sched_error_code = header->sched_error_code.load(std::memory_order_acquire);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);  // helper combines both codes
+}
+
+static PTO2Runtime *rt{nullptr};
 
 struct AicpuExecutor {
+    int32_t sched_thread_num_;
+    bool orch_to_sched_{false};
+
     // ===== Thread management state =====
-    std::atomic<int> thread_idx_{0};
+    std::atomic<int32_t> thread_idx_{0};
     std::atomic<bool> initialized_{false};
     std::atomic<bool> init_done_{false};
     std::atomic<bool> init_failed_{false};
     std::atomic<bool> finished_{false};
 
-    int aicpu_thread_num_{0};
-    int cores_total_num_{0};
-    int thread_cores_num_[MAX_AICPU_THREADS]{};  // Total cores (AIC+AIV) assigned to each thread
-    int aic_per_thread_{0};                      // Max AIC cores per thread (ceil), used as local queue cap
-    int aiv_per_thread_{0};                      // Max AIV cores per thread (ceil), used as local queue cap
-    int core_assignments_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD];
-
-    // Core discovery arrays (space-time tradeoff: avoid sorting)
-    CoreInfo aic_cores_[MAX_CORES_PER_THREAD];
-    CoreInfo aiv_cores_[MAX_CORES_PER_THREAD];
-    int aic_count_{0};
-    int aiv_count_{0};
-
-#if PTO2_PROFILING
-    // Logical core_id -> hardware physical core id, collected during handshake.
-    // Handed to pmu_aicpu_init() so the platform can resolve per-core PMU MMIO bases.
-    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER];
-#endif
-
-    // Fast lookup: core_id -> reg_addr
-    uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD];
+    int32_t aicpu_thread_num_{0};
 
-    // Platform register base address array (set via get_platform_regs())
-    uint64_t regs_{0};
+    // ===== Task queue state (managed by scheduler ready queues) =====
 
-    // volatile required to prevent compiler from caching in registers during polling loops
-    volatile int pending_task_ids_[MAX_CORES];  // Task waiting for ACK
-    volatile int running_task_ids_[MAX_CORES];  // Task executing after ACK
+    std::atomic<int32_t> finished_count_{0};
+    std::atomic<bool> runtime_init_ready_{false};
 
-    bool core_first_dispatch_[MAX_CORES];
+    // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox
+    // sub-regions (created in runtime_create_from_sm, released in runtime_destroy).
+    // Default-constructed: libc-backed backend, no ctx.
+    DeviceArena runtime_arena_;
 
-    // Per-thread local ready queues
-    int cur_ready_queue_aic_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD];
-    int cur_ready_queue_aiv_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD];
-    int cur_ready_queue_aic_head_[MAX_AICPU_THREADS];
-    int cur_ready_queue_aic_tail_[MAX_AICPU_THREADS];
-    int cur_ready_queue_aiv_head_[MAX_AICPU_THREADS];
-    int cur_ready_queue_aiv_tail_[MAX_AICPU_THREADS];
-
-    // ===== Task queue state =====
-    std::mutex ready_queue_aic_mutex_;
-    int ready_queue_aic_[RUNTIME_MAX_TASKS];
-    std::atomic<int> ready_count_aic_{0};
-    int ready_queue_aic_head_{0};  // Circular queue: read position (front)
-    int ready_queue_aic_tail_{0};  // Circular queue: write position (back)
-
-    std::mutex ready_queue_aiv_mutex_;
-    int ready_queue_aiv_[RUNTIME_MAX_TASKS];
-    std::atomic<int> ready_count_aiv_{0};
-    int ready_queue_aiv_head_{0};  // Circular queue: read position (front)
-    int ready_queue_aiv_tail_{0};  // Circular queue: write position (back)
-
-    // Task execution tracking
-    std::atomic<int> completed_tasks_{0};
-    std::atomic<int> total_tasks_{0};
-    std::atomic<int> finished_count_{0};
-
-    // ===== Performance profiling state =====
-    uint64_t dispatch_timestamps_[RUNTIME_MAX_WORKER];  // Per-core AICPU dispatch timestamp
+    // ===== Scheduler context (owns all dispatch/completion/drain state) =====
+    SchedulerContext sched_ctx_;
 
     // ===== Methods =====
-    int init(Runtime *runtime);
-    int handshake_all_cores(Runtime *runtime);
-    void assign_cores_to_threads();
-    void classify_and_distribute_initial_tasks(Runtime *runtime);
-    int resolve_and_dispatch(Runtime &runtime, int thread_idx, const int *cur_thread_cores, int core_num);
-    int shutdown_aicore(Runtime *runtime, int thread_idx, const int *cur_thread_cores);
-    int run(Runtime *runtime);
+    int32_t init(Runtime *runtime);
+    int32_t run(Runtime *runtime);
     void deinit(Runtime *runtime);
-    void emergency_shutdown(Runtime *runtime);
-    void
-    diagnose_stuck_state(Runtime &runtime, int thread_idx, const int *cur_thread_cores, int core_num, Handshake *hank);
-
-    // Helper functions (inline to avoid linker issues, not always_inline to preserve barriers)
-    //
-    // resolve_task_dependencies also handles post-completion profiling hooks
-    // (AFTER_COMPLETION tensor dump + per-task PMU record) so that callers
-    // walk one boundary instead of sprinkling three #if PTO2_PROFILING blocks
-    // after every resolve site. core_id / core_type are only read when the
-    // relevant profiling flag is enabled.
-    inline void resolve_task_dependencies(
-        Task *task, Runtime &runtime, int thread_idx, int core_id, CoreType core_type, int *cur_ready_queue_aic,
-        int &cur_aic_tail, int &cur_aic_ready_count, int *cur_ready_queue_aiv, int &cur_aiv_tail,
-        int &cur_aiv_ready_count
-    );
-
-    inline bool try_dispatch_task(
-        int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head,
-        int &ready_count, bool l2_swimlane_enabled, Runtime &runtime
-    );
 };
 
 static AicpuExecutor g_aicpu_executor;
 
-#if PTO2_PROFILING
-static int
-collect_task_tensor_buffer_addrs(const Runtime &runtime, const Task &task, uint64_t *buffer_addrs, int max_count) {
-    int found = 0;
-    for (int arg_idx = 0; arg_idx < task.num_args; arg_idx++) {
-        uint64_t arg = task.args[arg_idx];
-        if (!runtime.is_tensor_buffer_addr(arg)) {
-            continue;
-        }
-        if (found < max_count) {
-            buffer_addrs[found] = arg;
-        }
-        found++;
-    }
-    return found;
-}
-#endif
-
-// ===== Helper Function Implementations =====
-
-// Resolve dependencies: decrement fanin and enqueue newly ready tasks.
-// Also handles post-completion profiling hooks (AFTER_COMPLETION tensor dump
-// + per-task PMU record) so callers don't need to re-check profiling flags.
-inline void AicpuExecutor::resolve_task_dependencies(
-    Task *task, Runtime &runtime, int thread_idx, int core_id, CoreType core_type, int *cur_ready_queue_aic,
-    int &cur_aic_tail, int &cur_aic_ready_count, int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count
-) {
-    if (task == nullptr) {
-        return;
-    }
-
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        uint64_t callable_addr = runtime.get_function_bin_addr(task->func_id);
-        if (callable_addr != 0) {
-            const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
-            int tensor_info_count = 0;
-            const TensorInfo *tensor_info = runtime.get_tensor_info(task->task_id, &tensor_info_count);
-            uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {};
-            int tensor_buffer_count =
-                collect_task_tensor_buffer_addrs(runtime, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS);
-            dump_args_for_task(
-                thread_idx, static_cast<uint64_t>(task->task_id), task->num_args, *callable, tensor_info,
-                tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, TensorDumpStage::AFTER_COMPLETION
-            );
-        }
-    }
-    if (is_pmu_enabled()) {
-        pmu_aicpu_record_task(core_id, thread_idx, static_cast<uint64_t>(task->task_id), task->func_id, core_type);
-    }
-#else
-    (void)thread_idx;
-    (void)core_id;
-    (void)core_type;
-#endif
-
-    for (int j = 0; j < task->fanout_count; j++) {
-        int dep_id = task->fanout[j];
-        Task *dep = runtime.get_task(dep_id);
-        int prev_fanin = dep->fanin.fetch_sub(1, std::memory_order_acq_rel);
-
-        if (prev_fanin == 1) {
-            if (dep->core_type == CoreType::AIC) {
-                if (cur_aic_ready_count < aic_per_thread_) {
-                    cur_ready_queue_aic[cur_aic_tail] = dep_id;
-                    cur_aic_tail = (cur_aic_tail + 1) % MAX_CORES_PER_THREAD;
-                    cur_aic_ready_count++;
-                } else {
-                    std::scoped_lock lock(ready_queue_aic_mutex_);
-                    ready_queue_aic_[ready_queue_aic_tail_] = dep_id;
-                    ready_queue_aic_tail_ = (ready_queue_aic_tail_ + 1) % RUNTIME_MAX_TASKS;
-                    ready_count_aic_.fetch_add(1, std::memory_order_release);
-                }
-            } else {
-                if (cur_aiv_ready_count < aiv_per_thread_) {
-                    cur_ready_queue_aiv[cur_aiv_tail] = dep_id;
-                    cur_aiv_tail = (cur_aiv_tail + 1) % MAX_CORES_PER_THREAD;
-                    cur_aiv_ready_count++;
-                } else {
-                    std::scoped_lock lock(ready_queue_aiv_mutex_);
-                    ready_queue_aiv_[ready_queue_aiv_tail_] = dep_id;
-                    ready_queue_aiv_tail_ = (ready_queue_aiv_tail_ + 1) % RUNTIME_MAX_TASKS;
-                    ready_count_aiv_.fetch_add(1, std::memory_order_release);
-                }
-            }
-        }
-    }
-}
-
-// Try to dispatch a task from thread-local queue to a core
-inline bool AicpuExecutor::try_dispatch_task(
-    int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, int &ready_count,
-    bool l2_swimlane_enabled, [[maybe_unused]] Runtime &runtime
-) {
-    if (ready_count <= 0) {
-        return false;
-    }
-
-    // Dequeue task from thread-local queue
-    int task_id = local_queue[head];
-    head = (head + 1) % MAX_CORES_PER_THREAD;
-    ready_count--;
-
-    const char *core_type_str = (core_type == CoreType::AIC) ? "AIC" : "AIV";
-    LOG_INFO_V0(
-        "Thread %d: Dispatching %s task %d to core %d (running_id=%d)", thread_idx, core_type_str, task_id, core_id,
-        running_task_ids_[core_id]
-    );
-
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        Task *task = runtime.get_task(task_id);
-        if (task != nullptr) {
-            uint64_t callable_addr = runtime.get_function_bin_addr(task->func_id);
-            if (callable_addr != 0) {
-                const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
-                int tensor_info_count = 0;
-                const TensorInfo *tensor_info = runtime.get_tensor_info(task_id, &tensor_info_count);
-                uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {};
-                int tensor_buffer_count =
-                    collect_task_tensor_buffer_addrs(runtime, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS);
-                dump_args_for_task(
-                    thread_idx, static_cast<uint64_t>(task_id), task->num_args, *callable, tensor_info,
-                    tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, TensorDumpStage::BEFORE_DISPATCH
-                );
-            }
-        }
-    }
-#endif
-
-    // Set state before writing register to avoid race with AICore ACK
-    pending_task_ids_[core_id] = task_id;
-
-    // AICore buffer rotation: count this dispatch and rotate before write_reg
-    // when crossing a BUFFER_SIZE boundary. The completion-before-dispatch
-    // invariant makes this race-free (all prior tasks on this core have FIN'd,
-    // so AICore has dcci'd their records out of the old buffer).
-    if (l2_swimlane_enabled) {
-        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
-    }
-
-    // Publish task data before AICore can observe the dispatched task_id.
-    // ARM64 needs an explicit store-store fence across Normal-cacheable ->
-    // Device-nGnRnE; the old write_reg() helper carried this implicitly via
-    // __sync_synchronize.
-    wmb();
-
-    // Capture dispatch timestamp at the latest possible moment — after wmb,
-    // immediately before the DATA_MAIN_BASE write. Anything earlier
-    // (LOG_INFO_V0, on_aicore_dispatch's per-BUFFER_SIZE rotation work, wmb
-    // itself) would charge AICPU-internal cost to (dispatch_time → start_time).
-    if (l2_swimlane_enabled && get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) {
-        dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
-    }
-
-    write_reg(reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(task_id));
-
-    return true;
-}
-
 // ===== AicpuExecutor Method Implementations =====
 
-int AicpuExecutor::init(Runtime *runtime) {
+int32_t AicpuExecutor::init(Runtime *runtime) {
     bool expected = false;
     if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) {
         return 0;
@@ -323,892 +128,174 @@ int AicpuExecutor::init(Runtime *runtime) {
         return -1;
     }
 
-    // Read execution parameters from runtime
+    // Read execution parameters from runtime. The 0 → 1 fixup runs before the
+    // sched_thread_num_ derivation so a zero input doesn't leave the scheduler
+    // count at -1.
     aicpu_thread_num_ = runtime->aicpu_thread_num;
+    if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
+    sched_thread_num_ = aicpu_thread_num_ - 1;
+    orch_to_sched_ = runtime->orch_to_sched;
 
-    // Simplified defensive check
     if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
-        LOG_ERROR("Invalid aicpu_thread_num: %d (valid range: 1-%d)", aicpu_thread_num_, MAX_AICPU_THREADS);
+        LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_);
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
 
-    // Initialize core_id_to_reg_addr_ array to 0 before handshake
-    for (int i = 0; i < MAX_CORES_PER_THREAD; i++) {
-        core_id_to_reg_addr_[i] = 0;
-    }
-
-    if (is_l2_swimlane_enabled()) {
-        l2_swimlane_aicpu_init(runtime->worker_count);
-    }
-
-    // Perform core discovery: handshake with all cores and collect core type information
-    int rc = handshake_all_cores(runtime);
-    if (rc != 0) {
-        LOG_ERROR("Core discovery failed");
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
 
-    LOG_INFO_V0("Config: threads=%d, cores=%d", aicpu_thread_num_, cores_total_num_);
-
-    for (int i = 0; i < cores_total_num_; i++) {
-        pending_task_ids_[i] = AICPU_TASK_INVALID;
-        running_task_ids_[i] = AICPU_TASK_INVALID;
-        core_first_dispatch_[i] = true;
-    }
-
-    assign_cores_to_threads();
-    classify_and_distribute_initial_tasks(runtime);
-
-    total_tasks_.store(runtime->get_task_count(), std::memory_order_release);
-    completed_tasks_.store(0, std::memory_order_release);
     finished_count_.store(0, std::memory_order_release);
 
-    for (int i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        dispatch_timestamps_[i] = 0;
-    }
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        dump_args_init(aicpu_thread_num_);
-    }
-    if (is_pmu_enabled()) {
-        pmu_aicpu_init(physical_core_ids_, cores_total_num_);
-        LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
-    }
-#endif
-
     init_done_.store(true, std::memory_order_release);
     LOG_INFO_V0("AicpuExecutor: Init complete");
     return 0;
 }
 
 /**
- * Handshake with all AICore workers and discover core types
- *
- * This function performs centralized handshaking with all cores and collects
- * their type information. By doing this in a single thread, we avoid redundant
- * handshakes and enable dynamic core assignment.
- *
- * Protocol:
- * 1. Send aicpu_ready=1 to all cores
- * 2. Wait for each core's aicore_done response
- * 3. Read core_type reported by each core
- * 4. Classify cores into aic_cores_[] and aiv_cores_[] arrays
- *
- * @param runtime Runtime pointer
- * @return 0 on success, -1 on failure
+ * Per-thread entry point: the boot thread attaches the host-built image; scheduler threads then dispatch.
  */
-int AicpuExecutor::handshake_all_cores(Runtime *runtime) {
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    cores_total_num_ = runtime->worker_count;
-
-    // Validate cores_total_num_ before using as array index
-    if (cores_total_num_ == 0 || cores_total_num_ > MAX_CORES_PER_THREAD) {
-        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, MAX_CORES_PER_THREAD);
-        return -1;
-    }
-
-    aic_count_ = 0;
-    aiv_count_ = 0;
-
-    LOG_INFO_V0("Core Discovery: Handshaking with %d cores", cores_total_num_);
-
-    // Step 1: Send handshake signal to all cores
-    for (int i = 0; i < cores_total_num_; i++) {
-        all_handshakes[i].aicpu_ready = 1;
-    }
-    OUT_OF_ORDER_STORE_BARRIER();
-
-    // Get platform physical cores count for validation
-    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
-
-    // Step 2: Wait for all cores to respond and collect core type information
-    bool handshake_failed = false;
-    for (int i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-
-        // Wait for aicore_regs_ready signal
-        while (hank->aicore_regs_ready == 0) {
-            // Busy wait for core response
-        }
-
-        uint32_t physical_core_id = hank->physical_core_id;
-
-        // Validate physical_core_id before using as array index
-        if (physical_core_id >= max_physical_cores_count) {
-            LOG_ERROR(
-                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
-                max_physical_cores_count
-            );
-            handshake_failed = true;
-            continue;
-        }
-
-        // Get register address using physical_core_id
-        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
-        uint64_t reg_addr = regs[physical_core_id];
-
-        // Initialize AICore registers after discovery (first round)
-        platform_init_aicore_regs(reg_addr);
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-
-        OUT_OF_ORDER_STORE_BARRIER();
-
-        while (hank->aicore_done == 0) {}
-
-        CoreType type = hank->core_type;
-
-        if (type == CoreType::AIC) {
-            aic_cores_[aic_count_].worker_id = i;
-            aic_cores_[aic_count_].physical_core_id = physical_core_id;
-            aic_cores_[aic_count_].reg_addr = reg_addr;
-            aic_cores_[aic_count_].core_type = type;
-            aic_count_++;
-        } else if (type == CoreType::AIV) {
-            aiv_cores_[aiv_count_].worker_id = i;
-            aiv_cores_[aiv_count_].physical_core_id = physical_core_id;
-            aiv_cores_[aiv_count_].reg_addr = reg_addr;
-            aiv_cores_[aiv_count_].core_type = type;
-            aiv_count_++;
-        } else {
-            LOG_ERROR("Unknown core type from core %d", i);
-            handshake_failed = true;
-        }
-
-        core_id_to_reg_addr_[i] = reg_addr;
-
-#if PTO2_PROFILING
-        physical_core_ids_[i] = physical_core_id;
-#endif
-
-        LOG_INFO_V0(
-            "  Core %d: type=%s, physical_id=%u, reg_addr=0x%lx", i, core_type_to_string(type), physical_core_id,
-            reg_addr
+int32_t AicpuExecutor::run(Runtime *runtime) {
+    int32_t affinity_exec_idx = platform_aicpu_affinity_thread_idx();
+    int32_t thread_idx = (affinity_exec_idx >= 0) ? affinity_exec_idx : (thread_idx_++);
+    if (thread_idx < 0 || thread_idx >= aicpu_thread_num_ || thread_idx >= MAX_AICPU_THREADS) {
+        LOG_ERROR(
+            "Thread index %d out of bounds (active=%d max=%d exec_idx=%d)", thread_idx, aicpu_thread_num_,
+            MAX_AICPU_THREADS, affinity_exec_idx
         );
-    }
-
-    if (handshake_failed) {
-        emergency_shutdown(runtime);
         return -1;
     }
+    int32_t run_rc = 0;
+    LOG_INFO_V0("Thread %d: Start (exec_idx=%d)", thread_idx, affinity_exec_idx);
 
-    LOG_INFO_V0("Discovery complete: AIC=%d, AIV=%d, Total=%d", aic_count_, aiv_count_, cores_total_num_);
-    return 0;
-}
-
-// Assign discovered cores to threads using round-robin
-void AicpuExecutor::assign_cores_to_threads() {
-    // Round-robin: AIC core i → thread (i % aicpu_thread_num_), AIV core i → thread (i % aicpu_thread_num_).
-    // AIC and AIV are assigned independently; no cluster pairing is required.
-    // aic_per_thread_ / aiv_per_thread_ store the ceiling value and serve as local queue caps.
-    aic_per_thread_ = (aic_count_ + aicpu_thread_num_ - 1) / aicpu_thread_num_;
-    aiv_per_thread_ = (aiv_count_ + aicpu_thread_num_ - 1) / aicpu_thread_num_;
-
-    LOG_INFO_V0(
-        "Core Assignment: %d AIC cores, %d AIV cores across %d threads (max %d AIC/thread, %d AIV/thread)", aic_count_,
-        aiv_count_, aicpu_thread_num_, aic_per_thread_, aiv_per_thread_
-    );
-
-    for (int t = 0; t < aicpu_thread_num_; t++) {
-        int core_idx = 0;
-
-        // Assign AIC cores: cores at indices t, t+aicpu_thread_num_, t+2*aicpu_thread_num_, ...
-        for (int i = t; i < aic_count_; i += aicpu_thread_num_) {
-            core_assignments_[t][core_idx++] = aic_cores_[i].worker_id;
-        }
-
-        // Assign AIV cores after AIC cores
-        for (int i = t; i < aiv_count_; i += aicpu_thread_num_) {
-            core_assignments_[t][core_idx++] = aiv_cores_[i].worker_id;
-        }
-
-        thread_cores_num_[t] = core_idx;
-
-        char log_buffer[256];
-        int offset = 0;
-
-        offset += snprintf(
-            log_buffer + offset, sizeof(log_buffer) - offset, "Thread %d: assigned %d cores - AIC[", t, core_idx
-        );
-
-        for (int k = 0, i = t; i < aic_count_; i += aicpu_thread_num_, k++) {
-            if (k > 0) offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, ",");
-            offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "%d", aic_cores_[i].worker_id);
-        }
-
-        offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "] AIV[");
-
-        for (int k = 0, i = t; i < aiv_count_; i += aicpu_thread_num_, k++) {
-            if (k > 0) offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, ",");
-            offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "%d", aiv_cores_[i].worker_id);
-        }
-
-        offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "]");
-
-        LOG_INFO_V0("%s", log_buffer);
-    }
-}
-
-// Classify and distribute initial ready tasks to thread-local and shared queues
-void AicpuExecutor::classify_and_distribute_initial_tasks(Runtime *runtime) {
-    ready_queue_aic_head_ = 0;
-    ready_queue_aic_tail_ = 0;
-    ready_queue_aiv_head_ = 0;
-    ready_queue_aiv_tail_ = 0;
-
-    for (int t = 0; t < MAX_AICPU_THREADS; t++) {
-        cur_ready_queue_aic_head_[t] = 0;
-        cur_ready_queue_aic_tail_[t] = 0;
-        cur_ready_queue_aiv_head_[t] = 0;
-        cur_ready_queue_aiv_tail_[t] = 0;
-    }
-
-    int initial_count = 0;
-    int initial_aic_count = 0;
-    int initial_aiv_count = 0;
-    int aic_shared_count = 0;
-    int aiv_shared_count = 0;
-    int next_aic_thread = 0;
-    int next_aiv_thread = 0;
-
-    auto enqueue_initial_task = [&](int task_id, CoreType core_type, int &next_thread_idx, int &shared_count) {
-        int thread_idx = next_thread_idx;
-        int *head_ptr = (core_type == CoreType::AIC) ? &cur_ready_queue_aic_head_[thread_idx] :
-                                                       &cur_ready_queue_aiv_head_[thread_idx];
-        int *tail_ptr = (core_type == CoreType::AIC) ? &cur_ready_queue_aic_tail_[thread_idx] :
-                                                       &cur_ready_queue_aiv_tail_[thread_idx];
-        int cur_size = (*tail_ptr - *head_ptr + MAX_CORES_PER_THREAD) % MAX_CORES_PER_THREAD;
-        int local_capacity = (core_type == CoreType::AIC) ? aic_per_thread_ : aiv_per_thread_;
-
-        if (cur_size < local_capacity) {
-            if (core_type == CoreType::AIC) {
-                cur_ready_queue_aic_[thread_idx][*tail_ptr] = task_id;
-            } else {
-                cur_ready_queue_aiv_[thread_idx][*tail_ptr] = task_id;
-            }
-            *tail_ptr = (*tail_ptr + 1) % MAX_CORES_PER_THREAD;
-            LOG_INFO_V0(
-                "Init: %s task %d -> Thread %d local queue (size=%d)", core_type == CoreType::AIC ? "AIC" : "AIV",
-                task_id, thread_idx, cur_size + 1
-            );
-        } else if (core_type == CoreType::AIC) {
-            ready_queue_aic_[ready_queue_aic_tail_] = task_id;
-            ready_queue_aic_tail_ = (ready_queue_aic_tail_ + 1) % RUNTIME_MAX_TASKS;
-            shared_count++;
-        } else {
-            ready_queue_aiv_[ready_queue_aiv_tail_] = task_id;
-            ready_queue_aiv_tail_ = (ready_queue_aiv_tail_ + 1) % RUNTIME_MAX_TASKS;
-            shared_count++;
+    // Boot thread (thread N-1): host_build_graph host-orch boot. The
+    // orchestrator already ran on the host, which also relocated every
+    // cross-task pointer to its final device address before H2D — so the
+    // SM/arena this thread sees are already fully device-addressed. This thread
+    // attaches the prebuilt arena, points the SM handle's ring-header pointers
+    // at the device SM WITHOUT resetting the host-populated data, releases the
+    // scheduler threads, and hands the host-computed task count to the
+    // scheduler. It owns no AICore cores, so it does not dispatch.
+    if (thread_idx >= sched_thread_num_) {
+        void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+        size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+        if (prebuilt_arena == nullptr) {
+            LOG_ERROR("Thread %d: host-orch: prebuilt_arena_base is null", thread_idx);
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
         }
-
-        next_thread_idx = (thread_idx + 1) % aicpu_thread_num_;
-    };
-
-    int task_count = runtime->get_task_count();
-    for (int task_id = 0; task_id < task_count; task_id++) {
-        Task *task = runtime->get_task(task_id);
-        if (task == nullptr || task->fanin.load(std::memory_order_acquire) != 0) {
-            continue;
+        runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+        rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+        runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+
+        void *sm_ptr = runtime->get_gm_sm_ptr();
+        uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes);
+        memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+        if (!rt->sm_handle->attach_populated(sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes)) {
+            LOG_ERROR("Thread %d: host-orch: sm_handle->attach_populated failed", thread_idx);
+            rt = nullptr;
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
         }
 
-        initial_count++;
-        if (task->core_type == CoreType::AIC) {
-            initial_aic_count++;
-            enqueue_initial_task(task_id, CoreType::AIC, next_aic_thread, aic_shared_count);
-        } else {
-            initial_aiv_count++;
-            enqueue_initial_task(task_id, CoreType::AIV, next_aiv_thread, aiv_shared_count);
+        memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+        runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
+        runtime->set_slot_states_ptr(nullptr);
+
+        sched_ctx_.bind_runtime(rt);
+
+        // Publish orchestration completion (sets total_tasks_ + orchestrator_done_)
+        // BEFORE releasing the scheduler threads. Otherwise they unblock with
+        // total_tasks_=0 and can race to an early exit before the host task
+        // count is published (host-orch has no concurrent orchestrator to
+        // keep them alive).
+        // NOTE: do NOT call rt_orchestration_done(rt) here. The HOST already
+        // called it in run_host_orchestration; the orchestrator's own
+        // task-allocator pointers are intentionally NOT relocated (only the
+        // SM cross-task pointers and the host-built fanout adjacency —
+        // dep_pool / ready queues / fanout_head — were), so they still hold
+        // host addresses and mark_done()'s active_count() read would
+        // dereference host memory and fault the AICPU. on_orchestration_done
+        // only needs total_tasks and the scalar
+        // orchestrator.inline_completed_tasks, both already valid.
+        sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, runtime->host_total_tasks);
+
+        runtime_init_ready_.store(true, std::memory_order_release);
+        LOG_INFO_V0("Thread %d: host-orch boot complete (%d tasks)", thread_idx, runtime->host_total_tasks);
+    }
+
+    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
+    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+        // Host orchestration: wait until the booting thread has attached the SM image (or failed)
+        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
         }
-    }
-
-    LOG_INFO_V0("Init: Found %d initially ready tasks", initial_count);
-    LOG_INFO_V0("Init: Initial ready tasks by type: AIC=%d, AIV=%d", initial_aic_count, initial_aiv_count);
-    ready_count_aic_.store(aic_shared_count, std::memory_order_release);
-    ready_count_aiv_.store(aiv_shared_count, std::memory_order_release);
-
-    LOG_INFO_V0(
-        "Init: Task distribution complete - AIC: %d in local queues, %d in shared queue",
-        initial_aic_count - aic_shared_count, aic_shared_count
-    );
-    LOG_INFO_V0(
-        "Init: Task distribution complete - AIV: %d in local queues, %d in shared queue",
-        initial_aiv_count - aiv_shared_count, aiv_shared_count
-    );
-
-    for (int t = 0; t < aicpu_thread_num_; t++) {
-        int aic_size =
-            (cur_ready_queue_aic_tail_[t] - cur_ready_queue_aic_head_[t] + MAX_CORES_PER_THREAD) % MAX_CORES_PER_THREAD;
-        int aiv_size =
-            (cur_ready_queue_aiv_tail_[t] - cur_ready_queue_aiv_head_[t] + MAX_CORES_PER_THREAD) % MAX_CORES_PER_THREAD;
-        LOG_INFO_V0("Init: Thread %d local queues - AIC: %d tasks, AIV: %d tasks", t, aic_size, aiv_size);
-    }
-}
-
-/**
- * Shutdown AICore - Send quit signal to all AICore kernels
- */
-int AicpuExecutor::shutdown_aicore(Runtime *runtime, int thread_idx, const int *cur_thread_cores) {
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-
-    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, thread_cores_num_[thread_idx]);
-
-    for (int i = 0; i < thread_cores_num_[thread_idx]; i++) {
-        int core_id = cur_thread_cores[i];
-        Handshake *hank = &all_handshakes[core_id];
-        LOG_INFO_V0("Thread %d: AICPU hank addr = 0x%lx", thread_idx, reinterpret_cast<uint64_t>(hank));
-
-        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
-        if (reg_addr != 0) {
-            platform_deinit_aicore_regs(reg_addr);
+        if (rt == nullptr) {
+            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
         } else {
-            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
-        }
-    }
-    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
-    return 0;
-}
-
-/**
- * Resolve dependencies and dispatch tasks using fast-path scheduling
- */
-int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int *cur_thread_cores, int core_num) {
-    Handshake *hank = reinterpret_cast<Handshake *>(runtime.workers);
-
-    LOG_INFO_V0("Thread %d: Starting execution with %d cores", thread_idx, core_num);
-
-    int cur_thread_completed = 0;
-    int task_count = total_tasks_.load(std::memory_order_acquire);
-
-    // Timeout detection using idle iteration counting
-    int idle_iterations = 0;
-    const int MAX_IDLE_ITERATIONS = 50000000;
-    const int WARN_INTERVAL = 1000000;
-    bool made_progress = false;
-
-    int verification_warning_count = 0;
-    const int MAX_VERIFICATION_WARNINGS = 10;
-    bool l2_swimlane_enabled = is_l2_swimlane_enabled();
-    L2SwimlaneLevel l2_swimlane_level = get_l2_swimlane_level();
-    // PMU runs require single-issue dispatch — overlapping in-flight tasks
-    // pollute per-task PMU counters. Cached at function scope:
-    // is_pmu_enabled() is extern "C" and the compiler cannot hoist it
-    // across the dispatch loop on its own.
-    const bool pmu_active = is_pmu_enabled();
-
-    // Extract array pointers as local variables for better readability and performance
-    int *cur_ready_queue_aic = cur_ready_queue_aic_[thread_idx];
-    int *cur_ready_queue_aiv = cur_ready_queue_aiv_[thread_idx];
-
-    // Initialize local circular queue pointers from member variables (set by init())
-    // After this point, only use local variables for lock-free performance
-    int cur_aic_head = cur_ready_queue_aic_head_[thread_idx];
-    int cur_aic_tail = cur_ready_queue_aic_tail_[thread_idx];
-    int cur_aiv_head = cur_ready_queue_aiv_head_[thread_idx];
-    int cur_aiv_tail = cur_ready_queue_aiv_tail_[thread_idx];
-
-    // Calculate initial queue sizes
-    int cur_aic_ready_count = (cur_aic_tail - cur_aic_head + MAX_CORES_PER_THREAD) % MAX_CORES_PER_THREAD;
-    int cur_aiv_ready_count = (cur_aiv_tail - cur_aiv_head + MAX_CORES_PER_THREAD) % MAX_CORES_PER_THREAD;
-
-    LOG_INFO_V0(
-        "Thread %d: Initial state - local queue: %d AIC, %d AIV", thread_idx, cur_aic_ready_count, cur_aiv_ready_count
-    );
-
-    // Initialize dispatch timestamps for all cores (only needed at level >= 2)
-    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
-        uint64_t dispatch_start_time = get_sys_cnt_aicpu();
-        for (int i = 0; i < core_num; i++) {
-            int core_id = cur_thread_cores[i];
-            dispatch_timestamps_[core_id] = dispatch_start_time;
-        }
-    }
-
-    // Main execution loop with unified scheduling
-    while (true) {
-        for (int i = 0; i < core_num; i++) {
-            int core_id = cur_thread_cores[i];
-            uint64_t reg_addr = core_id_to_reg_addr_[core_id];
-            Handshake *h = &hank[core_id];
-
-            uint64_t reg_val = read_reg(reg_addr, RegId::COND);
-            // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder;
-            // the rmb() pins any AICore-published cacheable reads downstream
-            // of the FIN observation. Replaces the post-`__sync_synchronize`
-            // that the old read_reg() helper carried implicitly.
-            rmb();
-            int reg_task_id = EXTRACT_TASK_ID(reg_val);
-            int reg_state = EXTRACT_TASK_STATE(reg_val);
-
-            // Case 1: Pending task finished directly
-            if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
-                // Capture finish_ts at FIN-observation — after rmb() above,
-                // BEFORE any logging or completion processing. Both completions
-                // below (implicit prev_running + explicit pending) happen at this
-                // same observation moment, so they share one timestamp.
-                uint64_t finish_ts = (l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ?
-                                         get_sys_cnt_aicpu() :
-                                         0;
-
-                LOG_INFO_V0(
-                    "Thread %d: Core %d completed task %d (running_id=%d)", thread_idx, core_id,
-                    pending_task_ids_[core_id], running_task_ids_[core_id]
-                );
-
-                int completed_task_id = pending_task_ids_[core_id];
-                int prev_running_id = running_task_ids_[core_id];
-
-                // Profiling: when prev_running_id exists, its AICore timing was
-                // published to the ring slot first, so complete it BEFORE the
-                // pending task's record to maintain buffer ordering.
-                // Level gate: AICPU contributes only at AICPU_TIMING+; level=1
-                // is satisfied by the AICore record alone (carries task_token).
-                if (l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
-                    if (prev_running_id != AICPU_TASK_INVALID) {
-                        if (l2_swimlane_aicpu_complete_task(
-                                core_id, thread_idx, static_cast<uint32_t>(prev_running_id),
-                                dispatch_timestamps_[core_id], finish_ts
-                            ) != 0) {
-                            LOG_ERROR(
-                                "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id,
-                                prev_running_id
-                            );
-                        }
-                    }
-
-                    if (l2_swimlane_aicpu_complete_task(
-                            core_id, thread_idx, static_cast<uint32_t>(completed_task_id),
-                            dispatch_timestamps_[core_id], finish_ts
-                        ) != 0) {
-                        LOG_ERROR(
-                            "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id
-                        );
-                    }
-                }
-
-                cur_thread_completed++;
-                completed_tasks_.fetch_add(1, std::memory_order_release);
-
-                pending_task_ids_[core_id] = AICPU_TASK_INVALID;
-                running_task_ids_[core_id] = AICPU_TASK_INVALID;
-
-                // Try dispatch BEFORE resolve_dependencies
-                // This allows the core to start next task immediately
-                if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
-                    try_dispatch_task(
-                        core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                        cur_aic_ready_count, l2_swimlane_enabled, runtime
-                    );
-                } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
-                    try_dispatch_task(
-                        core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                        cur_aiv_ready_count, l2_swimlane_enabled, runtime
-                    );
-                }
-
-                // Resolve old running task dependencies (if exists)
-                // When pending task FINs directly, the running task was implicitly
-                // completed (AICore overwrote COND before we could read its FIN).
-                // Count it here to avoid losing completion.
-                if (prev_running_id != AICPU_TASK_INVALID) {
-                    cur_thread_completed++;
-                    completed_tasks_.fetch_add(1, std::memory_order_release);
-
-                    Task *prev_running_task = runtime.get_task(prev_running_id);
-                    resolve_task_dependencies(
-                        prev_running_task, runtime, thread_idx, core_id, h->core_type, cur_ready_queue_aic,
-                        cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count
-                    );
-
-                    LOG_INFO_V0(
-                        "Thread %d: Core %d resolved old running task %d", thread_idx, core_id, prev_running_id
-                    );
-                }
-
-                Task *task = runtime.get_task(completed_task_id);
-                resolve_task_dependencies(
-                    task, runtime, thread_idx, core_id, h->core_type, cur_ready_queue_aic, cur_aic_tail,
-                    cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count
-                );
-
-                made_progress = true;
-            } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) {
-                // Case 2: Pending task received ACK. The ACK observation is
-                // also the implicit-completion observation point for any prior
-                // running task (AICore overwrote COND before we could read
-                // its FIN). Capture finish_ts here, before LOG_INFO_V0.
-                uint64_t finish_ts = (l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ?
-                                         get_sys_cnt_aicpu() :
-                                         0;
-
-                LOG_INFO_V0(
-                    "Thread %d: Core %d ACKed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id],
-                    running_task_ids_[core_id]
-                );
-
-                int prev_running_id = running_task_ids_[core_id];
-
-                // Move pending to running
-                running_task_ids_[core_id] = pending_task_ids_[core_id];
-                pending_task_ids_[core_id] = AICPU_TASK_INVALID;
-                made_progress = true;
-
-                // When pending task ACKs, the old running task was implicitly
-                // completed (AICore overwrote COND before we could read its FIN).
-                // Count it here to avoid losing completion.
-                if (prev_running_id != AICPU_TASK_INVALID) {
-                    // Profiling: complete the implicit task's AICore record.
-                    // Level gate: see complete-task comment at the matching
-                    // block ~100 lines above.
-                    if (l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
-                        if (l2_swimlane_aicpu_complete_task(
-                                core_id, thread_idx, static_cast<uint32_t>(prev_running_id),
-                                dispatch_timestamps_[core_id], finish_ts
-                            ) != 0) {
-                            LOG_ERROR(
-                                "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id,
-                                prev_running_id
-                            );
-                        }
-                    }
-
-                    cur_thread_completed++;
-                    completed_tasks_.fetch_add(1, std::memory_order_release);
-
-                    Task *prev_running_task = runtime.get_task(prev_running_id);
-                    resolve_task_dependencies(
-                        prev_running_task, runtime, thread_idx, core_id, h->core_type, cur_ready_queue_aic,
-                        cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count
-                    );
-
-                    LOG_INFO_V0(
-                        "Thread %d: Core %d resolved old running task %d", thread_idx, core_id, prev_running_id
-                    );
-                }
-
-                // Core can accept new task now (pipeline!)
-                // Continue to Case 4 to dispatch next task
-            } else if (reg_task_id == running_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
-                // Case 3: Running task finished. Capture finish_ts at the FIN
-                // observation point, BEFORE LOG_INFO_V0 / completion work.
-                uint64_t finish_ts = (l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ?
-                                         get_sys_cnt_aicpu() :
-                                         0;
-
-                LOG_INFO_V0(
-                    "Thread %d: Core %d completed task %d (pending_id=%d)", thread_idx, core_id,
-                    running_task_ids_[core_id], pending_task_ids_[core_id]
-                );
-
-                int completed_task_id = running_task_ids_[core_id];
-
-                // Level gate: see complete-task comment at the matching block above.
-                if (l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
-                    if (l2_swimlane_aicpu_complete_task(
-                            core_id, thread_idx, static_cast<uint32_t>(completed_task_id),
-                            dispatch_timestamps_[core_id], finish_ts
-                        ) != 0) {
-                        LOG_ERROR(
-                            "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id
-                        );
-                    }
-                }
-
-                cur_thread_completed++;
-                completed_tasks_.fetch_add(1, std::memory_order_release);
-
-                running_task_ids_[core_id] = AICPU_TASK_INVALID;
-
-                if (pending_task_ids_[core_id] == AICPU_TASK_INVALID) {
-                    if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
-                        try_dispatch_task(
-                            core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                            cur_aic_ready_count, l2_swimlane_enabled, runtime
-                        );
-                    } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
-                        try_dispatch_task(
-                            core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                            cur_aiv_ready_count, l2_swimlane_enabled, runtime
-                        );
-                    }
-                }
-
-                Task *task = runtime.get_task(completed_task_id);
-                resolve_task_dependencies(
-                    task, runtime, thread_idx, core_id, h->core_type, cur_ready_queue_aic, cur_aic_tail,
-                    cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count
-                );
-
-                made_progress = true;
-            }
-
-            // Case 4: Dispatch new task if pending slot is available. When PMU
-            // is active we additionally require the running slot to be empty —
-            // see pmu_active comment above the dispatch loop.
-            if (pending_task_ids_[core_id] == AICPU_TASK_INVALID &&
-                (!unlikely(pmu_active) || running_task_ids_[core_id] == AICPU_TASK_INVALID)) {
-                if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
-                    if (try_dispatch_task(
-                            core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                            cur_aic_ready_count, l2_swimlane_enabled, runtime
-                        )) {
-                        made_progress = true;
-                    }
-                } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
-                    if (try_dispatch_task(
-                            core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                            cur_aiv_ready_count, l2_swimlane_enabled, runtime
-                        )) {
-                        made_progress = true;
-                    }
-                }
-            }
-        }
-
-        // Refill local queues from shared queues
-        if (cur_aic_ready_count == 0) {
-            if (ready_count_aic_.load(std::memory_order_acquire) > 0) {
-                std::scoped_lock lock(ready_queue_aic_mutex_);
-                int available = ready_count_aic_.load(std::memory_order_relaxed);
-                int to_grab = (available < aic_per_thread_) ? available : aic_per_thread_;
-
-                for (int i = 0; i < to_grab; i++) {
-                    int task_id = ready_queue_aic_[ready_queue_aic_head_];
-                    ready_queue_aic_head_ = (ready_queue_aic_head_ + 1) % RUNTIME_MAX_TASKS;
-                    cur_ready_queue_aic[cur_aic_tail] = task_id;
-                    cur_aic_tail = (cur_aic_tail + 1) % MAX_CORES_PER_THREAD;
-                }
-                ready_count_aic_.fetch_sub(to_grab, std::memory_order_release);
-                cur_aic_ready_count += to_grab;
-
-                LOG_INFO_V0(
-                    "Thread %d: Grabbed %d AIC tasks from shared queue (available=%d)", thread_idx, to_grab, available
-                );
-            }
-        }
-
-        if (cur_aiv_ready_count == 0) {
-            if (ready_count_aiv_.load(std::memory_order_acquire) > 0) {
-                std::scoped_lock lock(ready_queue_aiv_mutex_);
-                int available = ready_count_aiv_.load(std::memory_order_relaxed);
-                int to_grab = (available < aiv_per_thread_) ? available : aiv_per_thread_;
-
-                for (int i = 0; i < to_grab; i++) {
-                    int task_id = ready_queue_aiv_[ready_queue_aiv_head_];
-                    ready_queue_aiv_head_ = (ready_queue_aiv_head_ + 1) % RUNTIME_MAX_TASKS;
-                    cur_ready_queue_aiv[cur_aiv_tail] = task_id;
-                    cur_aiv_tail = (cur_aiv_tail + 1) % MAX_CORES_PER_THREAD;
-                }
-                ready_count_aiv_.fetch_sub(to_grab, std::memory_order_release);
-                cur_aiv_ready_count += to_grab;
-
-                LOG_INFO_V0(
-                    "Thread %d: Grabbed %d AIV tasks from shared queue (available=%d)", thread_idx, to_grab, available
-                );
-            }
-        }
-
-        // Check completion
-        if (completed_tasks_.load(std::memory_order_acquire) >= task_count) {
-            bool all_cores_idle = true;
-
-            for (int i = 0; i < core_num; i++) {
-                int core_id = cur_thread_cores[i];
-                if (pending_task_ids_[core_id] != AICPU_TASK_INVALID ||
-                    running_task_ids_[core_id] != AICPU_TASK_INVALID) {
-                    all_cores_idle = false;
-
-                    if (verification_warning_count == 0) {
-                        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
-                        uint64_t reg_val = read_reg(reg_addr, RegId::COND);
-                        LOG_WARN(
-                            "Thread %d: Counter reached %d/%d but core %d still has work (COND=0x%lx, pending_id=%d, "
-                            "running_id=%d)",
-                            thread_idx, completed_tasks_.load(std::memory_order_acquire), task_count, core_id, reg_val,
-                            pending_task_ids_[core_id], running_task_ids_[core_id]
-                        );
-                    }
-                    break;
-                }
-            }
-
-            if (all_cores_idle) {
-                // Truly complete: counter reached and all cores idle
-                int aic_remaining = ready_count_aic_.load(std::memory_order_acquire);
-                int aiv_remaining = ready_count_aiv_.load(std::memory_order_acquire);
-                if (aic_remaining > 0 || aiv_remaining > 0) {
-                    LOG_WARN(
-                        "Thread %d: Queues not empty after completion! AIC=%d, AIV=%d", thread_idx, aic_remaining,
-                        aiv_remaining
-                    );
-                }
-                break;  // Exit main loop
-            }
-
-            verification_warning_count++;
-            if (verification_warning_count > MAX_VERIFICATION_WARNINGS) {
-                LOG_ERROR(
-                    "Thread %d: Counter reached but cores still working after %d checks!", thread_idx,
-                    verification_warning_count
-                );
-                diagnose_stuck_state(runtime, thread_idx, cur_thread_cores, core_num, hank);
-                return -1;
-            }
-        }
-
-        // Timeout detection
-        if (!made_progress) {
-            idle_iterations++;
-            if (idle_iterations % WARN_INTERVAL == 0) {
-                int current = completed_tasks_.load(std::memory_order_acquire);
-                LOG_WARN(
-                    "Thread %d: %d idle iterations, progress %d/%d tasks", thread_idx, idle_iterations, current,
-                    task_count
-                );
-            }
-            if (idle_iterations > MAX_IDLE_ITERATIONS) {
-                LOG_ERROR("Thread %d: Timeout after %d idle iterations!", thread_idx, idle_iterations);
-                diagnose_stuck_state(runtime, thread_idx, cur_thread_cores, core_num, hank);
-                return -1;
+            sched_ctx_.bind_runtime(rt);
+            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx);
+            if (completed < 0) {
+                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed);
+                run_rc = completed;
             } else {
-                SPIN_WAIT_HINT();
+                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
             }
-        } else {
-            idle_iterations = 0;
         }
-        made_progress = false;
     }
 
-    LOG_INFO_V0("Thread %d: Execution complete, completed %d tasks", thread_idx, cur_thread_completed);
-    return cur_thread_completed;
-}
-
-int AicpuExecutor::run(Runtime *runtime) {
-    int affinity_exec_idx = platform_aicpu_affinity_thread_idx();
-    int thread_idx = (affinity_exec_idx >= 0) ? affinity_exec_idx : (thread_idx_++);
-    if (thread_idx < 0 || thread_idx >= aicpu_thread_num_ || thread_idx >= MAX_AICPU_THREADS) {
-        LOG_ERROR(
-            "Thread index %d out of bounds (active=%d max=%d exec_idx=%d)", thread_idx, aicpu_thread_num_,
-            MAX_AICPU_THREADS, affinity_exec_idx
-        );
-        return -1;
-    }
-
-    LOG_INFO_V0("Thread %d: Start (exec_idx=%d)", thread_idx, affinity_exec_idx);
-
-    const int *cur_thread_cores = core_assignments_[thread_idx];
-
-    LOG_INFO_V0("Thread %d: Runtime has %d tasks", thread_idx, runtime->get_task_count());
-    int completed = resolve_and_dispatch(*runtime, thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
-    LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
-
-    // Flush performance buffers for cores managed by this thread
-    if (is_l2_swimlane_enabled()) {
-        l2_swimlane_aicpu_flush(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
-    }
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_flush_buffers(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
-    }
-    if (is_dump_args_enabled()) {
-        dump_args_flush(thread_idx);
-    }
-#endif
-
-#if PTO2_PROFILING
-    // Restore PMU CTRL registers for this thread's cores before AICore shutdown
-    if (is_pmu_enabled()) {
-        pmu_aicpu_finalize(cur_thread_cores, thread_cores_num_[thread_idx]);
-    }
-#endif
-
-    int rc = shutdown_aicore(runtime, thread_idx, cur_thread_cores);
-    if (rc != 0) {
-        return rc;
+    // Always shut down AICore — even if sched_ctx_.completed_ was already true.
+    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
+    // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
+    int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
+    if (shutdown_rc != 0 && run_rc == 0) {
+        run_rc = shutdown_rc;
     }
 
     LOG_INFO_V0("Thread %d: Completed", thread_idx);
 
-    int prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
+    // Check if this is the last thread to finish
+    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
     if (prev_finished + 1 == aicpu_thread_num_) {
         finished_.store(true, std::memory_order_release);
-        LOG_INFO_V0("Thread %d: Last thread, marking executor finished", thread_idx);
+        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
+        // always tear them down here.
+        if (rt != nullptr) {
+            // Clear g_current_runtime in this DSO before destroying rt.
+            framework_bind_runtime(nullptr);
+            runtime_destroy(rt, runtime_arena_);
+            rt = nullptr;
+        }
     }
 
-    return 0;
+    return run_rc;
 }
 
 void AicpuExecutor::deinit(Runtime *runtime) {
-    // === Exit cleanup: reset all inter-round state ===
-
     // 1. Invalidate AICPU cache for Runtime address range.
     //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
     //    bypasses this cache. Invalidating now ensures next round reads from HBM.
-    //    Invalidate exactly the uploaded prefix (offsetof(tasks) + populated
-    //    tasks): the device buffer is allocated at that size (see
-    //    runtime_device_copy_size / init_runtime_args), so invalidating
-    //    sizeof(Runtime) would iterate cache lines past the allocation into
-    //    unrelated memory. A short invalidate is also sufficient — every run
-    //    reads only tasks[0..next_task_id) for its OWN next_task_id (get_task()
-    //    bounds-checks), and the H2D DMA wrote exactly that, so no run ever
-    //    observes a task line beyond its own prefix. cache_invalidate_range
-    //    rounds the end up to a 64-byte line, covering a partial trailing line.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-    const size_t runtime_prefix_bytes =
-        offsetof(Runtime, tasks) + static_cast<size_t>(runtime->get_task_count()) * sizeof(Task);
-#pragma GCC diagnostic pop
-    cache_invalidate_range(runtime, runtime_prefix_bytes);
-    if (runtime->get_tensor_info_storage() != nullptr && runtime->get_tensor_info_storage_bytes() > 0) {
-        cache_invalidate_range(
-            runtime->get_tensor_info_storage(), static_cast<size_t>(runtime->get_tensor_info_storage_bytes())
-        );
-    }
-    if (runtime->get_tensor_allocation_storage() != nullptr && runtime->get_tensor_allocation_storage_bytes() > 0) {
-        cache_invalidate_range(
-            runtime->get_tensor_allocation_storage(),
-            static_cast<size_t>(runtime->get_tensor_allocation_storage_bytes())
-        );
-    }
+    cache_invalidate_range(runtime, sizeof(Runtime));
 
-    // === Existing reset logic ===
-    ready_count_aic_.store(0, std::memory_order_release);
-    ready_count_aiv_.store(0, std::memory_order_release);
+    // Reset all SchedulerContext-owned state in one place.
+    sched_ctx_.deinit();
 
-    ready_queue_aic_head_ = 0;
-    ready_queue_aic_tail_ = 0;
-    ready_queue_aiv_head_ = 0;
-    ready_queue_aiv_tail_ = 0;
+    finished_count_.store(0, std::memory_order_release);
+    runtime_init_ready_.store(false, std::memory_order_release);
 
-    for (int i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        dispatch_timestamps_[i] = 0;
-        pending_task_ids_[i] = AICPU_TASK_INVALID;
-        running_task_ids_[i] = AICPU_TASK_INVALID;
-        core_first_dispatch_[i] = true;
-    }
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
 
-    for (int t = 0; t < MAX_AICPU_THREADS; t++) {
-        cur_ready_queue_aic_head_[t] = 0;
-        cur_ready_queue_aic_tail_[t] = 0;
-        cur_ready_queue_aiv_head_[t] = 0;
-        cur_ready_queue_aiv_tail_[t] = 0;
-    }
+    // Clear file-scope PTO2Runtime pointer (already destroyed by the last thread to finish)
+    rt = nullptr;
 
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_.store(0, std::memory_order_release);
-    finished_count_.store(0, std::memory_order_release);
+    // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
+    dep_gen_aicpu_finalize();
 
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    aicpu_thread_num_ = 0;
-    aic_per_thread_ = 0;
-    aiv_per_thread_ = 0;
-    memset(core_assignments_, 0, sizeof(core_assignments_));
-    memset(thread_cores_num_, 0, sizeof(thread_cores_num_));
-    regs_ = 0;
+    LOG_INFO_V0("DeInit: Runtime execution state reset");
 
     initialized_.store(false, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
@@ -1219,112 +306,18 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
 }
 
-void AicpuExecutor::emergency_shutdown(Runtime *runtime) {
-    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    for (int i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-        if (core_id_to_reg_addr_[i] != 0) {
-            platform_deinit_aicore_regs(core_id_to_reg_addr_[i]);
-        }
-    }
-
-    LOG_WARN("Emergency shutdown complete");
-}
-
-void AicpuExecutor::diagnose_stuck_state(
-    Runtime &runtime, int thread_idx, const int *cur_thread_cores, int core_num, Handshake *hank
-) {
-    LOG_ERROR("========== DIAGNOSTIC REPORT: Thread %d ==========", thread_idx);
-
-    int completed = completed_tasks_.load(std::memory_order_acquire);
-    int total = total_tasks_.load(std::memory_order_acquire);
-    LOG_ERROR("Progress: %d/%d tasks (%.1f%%)", completed, total, total > 0 ? completed * 100.0 / total : 0.0);
-
-    int aic_ready = ready_count_aic_.load(std::memory_order_acquire);
-    int aiv_ready = ready_count_aiv_.load(std::memory_order_acquire);
-    LOG_ERROR("Ready Queues: AIC=%d, AIV=%d", aic_ready, aiv_ready);
-
-    int busy_cores = 0;
-    int idle_cores = 0;
-
-    LOG_ERROR("Core Status:");
-    for (int i = 0; i < core_num; i++) {
-        int core_id = cur_thread_cores[i];
-        Handshake *h = &hank[core_id];
-
-        const char *core_type_str = core_type_to_string(h->core_type);
-
-        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
-        uint64_t reg_val = read_reg(reg_addr, RegId::COND);
-        int reg_task_id = EXTRACT_TASK_ID(reg_val);
-        int reg_state = EXTRACT_TASK_STATE(reg_val);
-
-        int pending_id = pending_task_ids_[core_id];
-        int running_id = running_task_ids_[core_id];
-
-        if (pending_id != AICPU_TASK_INVALID || running_id != AICPU_TASK_INVALID) {
-            busy_cores++;
-
-            if (pending_id != AICPU_TASK_INVALID) {
-                Task *task = runtime.get_task(pending_id);
-                LOG_ERROR(
-                    "  Core %d [%s, PENDING]: COND=0x%lx (reg_task_id=%d, reg_state=%d), pending_id=%d, func_id=%d, "
-                    "fanin=%d, fanout=%d",
-                    core_id, core_type_str, reg_val, reg_task_id, reg_state, task->task_id, task->func_id,
-                    task->fanin.load(std::memory_order_acquire), task->fanout_count
-                );
-            }
-            if (running_id != AICPU_TASK_INVALID) {
-                Task *task = runtime.get_task(running_id);
-                LOG_ERROR(
-                    "  Core %d [%s, RUNNING]: COND=0x%lx (reg_task_id=%d, reg_state=%d), running_id=%d, func_id=%d, "
-                    "fanin=%d, fanout=%d",
-                    core_id, core_type_str, reg_val, reg_task_id, reg_state, task->task_id, task->func_id,
-                    task->fanin.load(std::memory_order_acquire), task->fanout_count
-                );
-            }
-        } else {
-            idle_cores++;
-        }
-    }
-
-    LOG_ERROR("Summary: %d busy, %d idle", busy_cores, idle_cores);
-
-    // Diagnose deadlock vs livelock
-    if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) {
-        LOG_ERROR("*** DEADLOCK DETECTED ***");
-        LOG_ERROR("All cores idle, no ready tasks, but %d tasks incomplete", total - completed);
-
-        LOG_ERROR("Tasks with fanin > 0:");
-        int stuck_count = 0;
-        for (int tid = 0; tid < total && stuck_count < 10; tid++) {
-            Task *t = runtime.get_task(tid);
-            int fanin = t->fanin.load(std::memory_order_acquire);
-            if (fanin > 0) {
-                LOG_ERROR("  Task %d: fanin=%d (waiting for dependencies)", tid, fanin);
-                stuck_count++;
-            }
-        }
-        if (stuck_count == 0) {
-            LOG_ERROR("  No tasks waiting! Possible counter corruption.");
-        }
-    } else if (busy_cores > 0) {
-        LOG_ERROR("*** LIVELOCK / HUNG TASK ***");
-        LOG_ERROR("%d cores executing but no progress", busy_cores);
-    }
-
-    LOG_ERROR("========== END DIAGNOSTIC ==========");
-}
-
 // ===== Public Entry Point =====
 
-// host_build_graph resolves orchestration on the host during prepare, so it has
-// no device-side registration: it deliberately does NOT export
-// simpler_aicpu_register_callable (only the TMARB runtime does). The host's
-// register launch is gated on the device-orch path and never targets hbg.
+extern "C" int32_t aicpu_prewarm_callable(Runtime *runtime) {
+    // host_build_graph host-orch: the orchestration .so is dlopen'd on the HOST
+    // during prepare_callable_impl and the whole task graph is built host-side,
+    // so there is no device-side orchestrator .so to pre-load — prewarm is a
+    // no-op. The symbol is retained because the platform onboard kernel
+    // (src/a2a3/platform/onboard/aicpu/kernel.cpp) links it strongly via
+    // simpler_aicpu_prewarm_callable; removing it would break the onboard link.
+    (void)runtime;  // intentionally unused; silences the unused-parameter warning
+    return 0;       // always reports success, there is nothing that can fail here
+}
 
 /**
  * aicpu_execute - Main AICPU kernel execution entry point
@@ -1339,13 +332,7 @@ void AicpuExecutor::diagnose_stuck_state(
  * @param runtime Pointer to Runtime structure
  * @return 0 on success, non-zero on error
  */
-extern "C" int aicpu_execute(Runtime *runtime) {
-    // Initialize log switches (only once, thread-safe)
-    static std::once_flag log_init_flag;
-    std::call_once(log_init_flag, []() {
-        init_log_switch();
-    });
-
+extern "C" int32_t aicpu_execute(Runtime *runtime) {
     if (runtime == nullptr) {
         LOG_ERROR("%s", "Invalid argument: null Runtime pointer");
         return -1;
@@ -1353,9 +340,6 @@ extern "C" int aicpu_execute(Runtime *runtime) {
 
     LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
 
-    // Get platform register addresses from platform-level global
-    g_aicpu_executor.regs_ = get_platform_regs();
-
     g_aicpu_executor.init(runtime);
 
     while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
@@ -1365,18 +349,28 @@ extern "C" int aicpu_execute(Runtime *runtime) {
         }
     }
 
-    int rc = g_aicpu_executor.run(runtime);
+    int32_t rc = g_aicpu_executor.run(runtime);
     if (rc != 0) {
         LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
-        return rc;
     }
 
+    int32_t runtime_rc = read_pto2_runtime_status(runtime);
+
     // Last thread cleans up
     if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
         LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
         g_aicpu_executor.deinit(runtime);
     }
 
+    if (runtime_rc != 0) {
+        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
+        return runtime_rc;
+    }
+
+    if (rc != 0) {
+        return rc;
+    }
+
     LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
     return 0;
 }
diff --git a/src/a2a3/runtime/host_build_graph/build_config.py b/src/a2a3/runtime/host_build_graph/build_config.py
index a1c96c8fa..871332f6b 100644
--- a/src/a2a3/runtime/host_build_graph/build_config.py
+++ b/src/a2a3/runtime/host_build_graph/build_config.py
@@ -6,12 +6,28 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-# Runtime build configuration
-# All paths are relative to this file's directory (src/runtime/)
+# host_build_graph Runtime build configuration
+# All paths are relative to this file's directory (src/a2a3/runtime/host_build_graph/)
+#
+# This is the host-orchestration variant of tensormap_and_ringbuffer: it shares
+# the same scheduler, ring buffers, and shared-memory layout, differing only in
+# WHEN the orchestrator runs.
+# - Host runs the orchestrator to completion, populating SM + arena, then H2Ds
+#   the image to device (vs tensormap, where AICPU thread N-1 orchestrates on
+#   device concurrently with the scheduler threads)
+# - AICPU threads 0..N-1 all run schedulers (no on-device orchestrator thread)
+# - AICore executes tasks via an aligned PTO2DispatchPayload + pre-built dispatch_args
+#
+# The "orchestration" directory contains source files compiled into every
+# runtime target below (aicore, aicpu, host) AND the orchestration .so (e.g.,
+# tensor methods needed by the Tensor constructor's validation logic).
 
 BUILD_CONFIG = {
-    "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]},
-    "aicpu": {"include_dirs": ["runtime", ".."], "source_dirs": ["aicpu", "runtime"]},
-    "host": {"include_dirs": ["runtime", "orchestration", ".."], "source_dirs": ["host", "runtime"]},
-    "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []},
+    "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]},
+    "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]},
+    "host": {
+        "include_dirs": ["runtime", "common", ".."],
+        "source_dirs": ["host", "runtime/orchestrator_core", "runtime/shared", "orchestration"],
+    },
+    "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]},
 }
diff --git a/src/a2a3/runtime/host_build_graph/common/intrinsic.h b/src/a2a3/runtime/host_build_graph/common/intrinsic.h
new file mode 100644
index 000000000..768e6a612
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/common/intrinsic.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file intrinsic.h
+ * @brief SPMD execution context for AICore user kernels
+ *
+ * Topology data exposed to user kernels has two distinct lifetimes:
+ *
+ *   1. Global topology (per-core, fixed after runtime init):
+ *      - sub_block_id : identifies the AIV lane within a cluster
+ *        (0 = AIV0/left, 1 = AIV1/right).  Initialized once at runtime
+ *        startup based on each core's cluster position; never changes.
+ *        Only meaningful for AIV kernels in MIX tasks.
+ *
+ *   2. Local per-dispatch context (changes each dispatch):
+ *      - block_idx : which logical block the current worker is executing
+ *      - block_num : total number of blocks in this task (= block_dim)
+ *      Written by build_payload() before each dispatch.
+ *
+ * Both categories are injected via two pointer slots appended at the tail
+ * of the kernel args[] array:
+ *
+ *   args layout:
+ *     [0 .. tensor_count-1]                 = tensor GM pointers
+ *     [tensor_count .. +scalar_count-1]     = scalar values
+ *     ...
+ *     [SPMD_LOCAL_CONTEXT_INDEX]            = (uint64_t)&LocalContext   (per-dispatch)
+ *     [SPMD_GLOBAL_CONTEXT_INDEX]           = (uint64_t)&GlobalContext  (per-core)
+ *
+ * The suffix positions are compile-time constants and do not depend on the
+ * runtime tensor_count or scalar_count.
+ *
+ * Include this header in AICore kernel source files to use the get_* accessors.
+ * Do NOT depend on the raw index constants; always use the accessor functions.
+ *
+ * On CCEC (real hardware), __gm__ and __aicore__ must be defined before
+ * including this header (e.g. via <pto/pto-inst.hpp> or manual #define).
+ * The #ifndef guards below provide fallbacks for non-kernel builds
+ * (AICPU, HOST) where these qualifiers are not needed.
+ *
+ * IMPORTANT — do NOT mix these with the CCE built-in topology intrinsics
+ * (`get_subblockid()`, `get_block_idx()`, `get_block_num()` declared in
+ * `kernel_operator.h` / tikcfw). Those intrinsics read AICore hardware
+ * registers that simpler's tensormap_and_ringbuffer runtime does NOT
+ * program. Specifically:
+ *
+ *   - CCE `get_subblockid()` returns whatever stale value the AICore
+ *     sub-block register holds — under simpler's MIX dispatch it is 0
+ *     for BOTH AIV0 and AIV1 of every cluster, so a kernel that uses
+ *     it to partition heads will silently have AIV1 redo AIV0's work
+ *     and the AIV1 share of the output is never written. This is the
+ *     exact failure mode that produced the partial-zero output in
+ *     issue #900 (PR #899 spmd_paged_attention_highperf); the kernel
+ *     compiled, ran without error, and produced wrong output. Use
+ *     `get_sub_block_id(args)` instead, which reads from the runtime's
+ *     `GlobalContext.sub_block_id` that the scheduler initializes per
+ *     AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`.
+ *
+ *   - `get_block_idx()` and `get_block_num()` are not redirected to
+ *     simpler's LocalContext either — use the `(args)` variants below
+ *     so the values reflect simpler's logical block_dim (which can
+ *     differ from `RUNTIME_CONFIG.block_dim`, the physical core count).
+ *
+ * If you are porting a kernel originally written for native CANN dispatch
+ * (AscendC, ascend-transformer-boost, etc.), every reference to those
+ * three CCE intrinsics needs to be rewritten against this header. See
+ * `docs/aicore-kernel-programming.md` for the full author contract,
+ * porting checklist, and the worked example from PR #899 / issue #900.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_task_id.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+
+/** Number of extra pointer slots appended to the args[] tail (LocalContext + GlobalContext). */
+static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2;
+
+/**
+ * Args[] suffix indices for context pointers.
+ * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16).
+ * Users should not depend on these values; use the get_* functions below.
+ */
+static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48;
+static constexpr int32_t SPMD_GLOBAL_CONTEXT_INDEX = 49;
+static constexpr int32_t PAYLOAD_LOCAL_CONTEXT_INDEX = SPMD_LOCAL_CONTEXT_INDEX;
+static constexpr int32_t PAYLOAD_GLOBAL_CONTEXT_INDEX = SPMD_GLOBAL_CONTEXT_INDEX;
+
+/**
+ * Per-core global context, stored in PTO2DispatchPayload.
+ * Initialized once at runtime startup (init_global_context) based on each
+ * core's cluster position.  Never modified after initialization.
+ */
+struct GlobalContext {
+    // AIV lane within cluster: 0=AIV0(left), 1=AIV1(right).
+    // Used by AIV to select the correct intra-cluster hw instruction.
+    // Not meaningful for AIC kernels or single-AIV tasks.
+    int32_t sub_block_id;  // Returned to kernels by get_sub_block_id(args).
+};
+
+struct AsyncCtx {
+    volatile __gm__ uint32_t *completion_count;                   // make(): &buffer->count (null if no buffer)
+    volatile __gm__ int32_t *completion_error_code;               // make(): &buffer->error_code (null if no buffer)
+    volatile __gm__ DeferredCompletionEntry *completion_entries;  // make(): &buffer->entries[0] (null if no buffer)
+    uint32_t completion_capacity;                                 // make(): MAX_COMPLETIONS_PER_TASK (0 if no buffer)
+    PTO2TaskId task_token;                                        // make(): caller's token (invalid() if no buffer)
+
+    static inline AsyncCtx make(PTO2TaskId task_token, volatile __gm__ DeferredCompletionSlab *buffer) {
+        AsyncCtx ctx{};  // value-initialized, so the pointers start null and the capacity starts at 0
+        ctx.task_token = task_token;
+        if (buffer == nullptr) {  // no completion slab: return a context whose token is marked invalid
+            ctx.task_token = PTO2TaskId::invalid();
+            return ctx;
+        }
+        ctx.completion_count = &buffer->count;
+        ctx.completion_error_code = &buffer->error_code;
+        ctx.completion_entries = &buffer->entries[0];
+        ctx.completion_capacity = MAX_COMPLETIONS_PER_TASK;
+        return ctx;
+    }
+};
+
+/**
+ * Per-dispatch local context, stored in PTO2DispatchPayload.
+ * Written by build_payload() before each dispatch. Different blocks of the
+ * same task receive different block_idx values but the same block_num.
+ * async_ctx holds the completion-slab pointers and task token (see AsyncCtx).
+ */
+struct LocalContext {
+    int32_t block_idx;  // Logical block index within the task [0, block_num)
+    int32_t block_num;  // How many logical blocks this task requires.
+                        // Currently fixed to 1 (block_dim > 1 not yet implemented).
+                        // NOT the same as RUNTIME_CONFIG.block_dim in kernel_config.py,
+                        // which controls how many physical cores the runtime launches.
+    AsyncCtx async_ctx;
+};
+
+/**
+ * Return the AIV lane index within the cluster.
+ * In a MIX 1C2V task: AIV0(left)=0, AIV1(right)=1.
+ *
+ * This value is only meaningful for AIV kernels in MIX tasks.  It tells
+ * the AIV whether it is the left lane or the right lane within the cluster,
+ * which determines the correct hardware instruction for intra-cluster
+ * communication.
+ *
+ * AIC kernels should NOT call this function.
+ * Single-AIV tasks have no intra-cluster communication, so sub_block_id
+ * has no meaning and should not be used.
+ */
+static __aicore__ inline int32_t get_sub_block_id(__gm__ int64_t *args) {
+    __gm__ GlobalContext *ctx =  // args[] tail slot stores &GlobalContext as a uint64_t
+        reinterpret_cast<__gm__ GlobalContext *>(static_cast<uint64_t>(args[SPMD_GLOBAL_CONTEXT_INDEX]));
+    return ctx->sub_block_id;
+}
+
+/**
+ * Return the logical block index assigned to the current worker.
+ * Range: [0, get_block_num(args)).
+ * Within the same task, different blocks receive different indices.
+ */
+static __aicore__ inline int32_t get_block_idx(__gm__ int64_t *args) {
+    __gm__ LocalContext *ctx =  // args[] tail slot stores &LocalContext as a uint64_t
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uint64_t>(args[SPMD_LOCAL_CONTEXT_INDEX]));
+    return ctx->block_idx;
+}
+
+/**
+ * Return how many logical blocks the current task requires.
+ * All blocks of the same task see the same value.
+ * Currently always returns 1 (block_dim>1 not yet implemented).
+ *
+ * Note: this is NOT the same as RUNTIME_CONFIG.block_dim in
+ * kernel_config.py, which controls how many physical cores are launched.
+ */
+static __aicore__ inline int32_t get_block_num(__gm__ int64_t *args) {
+    __gm__ LocalContext *ctx =  // args[] tail slot stores &LocalContext as a uint64_t
+        reinterpret_cast<__gm__ LocalContext *>(static_cast<uint64_t>(args[SPMD_LOCAL_CONTEXT_INDEX]));
+    return ctx->block_num;
+}
diff --git a/src/a2a3/runtime/host_build_graph/common/pto_runtime_status.h b/src/a2a3/runtime/host_build_graph/common/pto_runtime_status.h
new file mode 100644
index 000000000..c81a121d8
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/common/pto_runtime_status.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime Status Helpers
+ *
+ * Shared error-code contract used inside the host_build_graph runtime (the host-orchestration variant of tensormap_and_ringbuffer).
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
+
+#include <stdint.h>
+
+// Orchestrator errors (1-99): detected in orchestrator thread
+#define PTO2_ERROR_NONE 0  // Explicitly means "no error"; it is not an "unknown/unspecified" error code.
+#define PTO2_ERROR_SCOPE_DEADLOCK 1
+#define PTO2_ERROR_HEAP_RING_DEADLOCK 2
+#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3
+#define PTO2_ERROR_DEP_POOL_OVERFLOW 4
+#define PTO2_ERROR_INVALID_ARGS 5         // Arg construction error (invalid args)
+#define PTO2_ERROR_DEPENDENCY_OVERFLOW 6  // Too many unique fanin dependencies for one task
+#define PTO2_ERROR_REQUIRE_SYNC_START_INVALID 7
+#define PTO2_ERROR_TENSOR_WAIT_TIMEOUT 8
+#define PTO2_ERROR_EXPLICIT_ORCH_FATAL 9
+#define PTO2_ERROR_SCOPE_TASKS_OVERFLOW 10  // scope_tasks buffer saturated (all rings full)
+#define PTO2_ERROR_TENSORMAP_OVERFLOW 11    // tensormap entry pool wedged (last_task_alive not advancing)
+
+// Scheduler errors (100+): detected in scheduler threads
+#define PTO2_ERROR_SCHEDULER_TIMEOUT 100
+#define PTO2_ERROR_ASYNC_COMPLETION_INVALID 101
+#define PTO2_ERROR_ASYNC_WAIT_OVERFLOW 102
+#define PTO2_ERROR_ASYNC_REGISTRATION_FAILED 103
+
+static inline int32_t runtime_status_from_error_codes(int32_t orch_error_code, int32_t sched_error_code) {
+    if (orch_error_code != PTO2_ERROR_NONE) {  // an orchestrator error takes precedence over a scheduler error
+        return orch_error_code < 0 ? orch_error_code : -orch_error_code;
+    }
+    if (sched_error_code != PTO2_ERROR_NONE) {  // reported only when the orchestrator code is PTO2_ERROR_NONE
+        return sched_error_code < 0 ? sched_error_code : -sched_error_code;
+    }
+    return 0;  // success; every failure path above yields a negative value regardless of the input's sign
+}
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_
diff --git a/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md
index c6f72a70d..5c7e53af4 100644
--- a/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md
@@ -1,38 +1,810 @@
-# Runtime Logic: host_build_graph
+# PTO2 Runtime System Design (host_build_graph)
 
 ## Overview
 
-The host_build_graph runtime builds a static task graph on the host, copies the Runtime object to device memory, and lets AICPU scheduler threads dispatch tasks to AICore via a per-core handshake. Dependencies are explicit edges created by orchestration code, so scheduling is a standard fanin/fanout ready-queue model.
+host_build_graph is the **host-orchestration** variant of the PTO2 runtime: it
+shares tensormap_and_ringbuffer's scheduler, ring buffers, and shared-memory
+layout, and differs only in **when** the orchestrator runs. The host runs the
+orchestrator to completion — building the whole task graph, populating shared
+memory and the prebuilt arena — then H2Ds that image to the device, where the
+AICPU boots scheduler-only (no on-device orchestrator thread). It coordinates
+four layers of execution:
 
-## Core Data Structures
+- **Host** (x86/ARM CPU): compiles kernels, allocates device memory, **runs the orchestrator to build the task graph**, H2Ds the populated SM + arena, and launches AICPU/AICore threads.
+- **AICPU** (device ARM cores): runs scheduler threads only — it attaches the host-populated shared memory (already device-addressed by the host) and dispatches the already-built graph.
+- **AICore** (AI compute cores): executes kernel functions dispatched by the scheduler.
+- **Shared Memory** (Global Memory): ring buffers, task descriptors, heap, and TensorMap — built on host, attached read-mostly by the schedulers.
 
-- `Runtime` owns the task table, handshake buffers, and host-side device APIs. See `src/a2a3/runtime/host_build_graph/runtime/runtime.h`.
-- `Task` is a fixed-size record that stores `func_id`, argument array, `fanin`, `fanout`, `core_type`, and `function_bin_addr`.
-- `Handshake` is the shared per-core control block used by AICPU and AICore for dispatch and completion.
-- `HostApi` provides device memory ops used by host orchestration (`device_malloc`, `copy_to_device`, `upload_chip_callable_buffer`, etc.).
+```text
+┌───────────────────────────────────────────────────────────────────────┐
+│                            Host (CPU)                                 │
+│  test_*.py (SceneTestCase) → compile kernels → init Runtime           │
+│  → run orchestrator (build graph) → H2D populated SM + arena          │
+│  → upload binaries → launch AICPU/AICore → collect results            │
+└───────────────────────────┬───────────────────────────────────────────┘
+                            │ device memory / GM (populated graph image)
+┌───────────────────────────▼───────────────────────────────────────────┐
+│                     AICPU (N threads)                                  │
+│  All threads: Schedulers (attach + dispatch to AICore)                │
+│  (no on-device orchestrator thread — host already built the graph)    │
+│                                                                       │
+│  ┌─────────────────────────────────────────────────────────────────┐  │
+│  │                   Shared Memory (GM)                             │  │
+│  │  SharedMemoryHeader │ TaskDescriptors[] │ DepListPool           │  │
+│  │  GM Heap (output buffers)                                       │  │
+│  └─────────────────────────────────────────────────────────────────┘  │
+│                                                                       │
+│  Scheduler ──Handshake/Registers──► AICore workers (AIC + AIV)        │
+└───────────────────────────────────────────────────────────────────────┘
+```
 
-## Build And Init Flow
+> **Where the orchestrator runs (host_build_graph vs tensormap).** The data
+> structures and scheduler mechanics described in the sections below — the
+> orchestrator state, ring buffers, TensorMap dependency tracking, and the
+> dispatch handshake — are **shared** with `tensormap_and_ringbuffer`. Two
+> things differ, both following from **when and where the orchestrator runs**:
+>
+> - **host_build_graph (this runtime):** the **host** dlopens the orchestration
+>   `.so` and runs it to completion, populating shared memory + the prebuilt
+>   arena. Because the orchestrator runs to completion before any scheduler
+>   exists, it **wires the fanout adjacency inline during submit** (lock
+>   producers, allocate `dep_pool` entries, seed the ready queue) instead of
+>   deferring it to a device-side wiring queue. The host then relocates every
+>   cross-task pointer to its final device address (`relocate_host_orch_image`)
+>   **before** the H2D copy — pointers into the SM and pointers into the arena
+>   shift by independent deltas — and ships the image. The device boots
+>   **scheduler-only**: no on-device orchestrator thread, no on-device pointer
+>   fixup, no wiring drain (the ready queue is already seeded); it attaches the
+>   already-device-addressed SM/arena and dispatches.
+> - **tensormap_and_ringbuffer:** the orchestrator runs **on-device** on AICPU
+>   thread N-1, concurrently with the scheduler threads, and defers fanout
+>   wiring to the scheduler's global wiring queue (drained by thread 0).
+>
+> Where a section below says "the orchestrator runs on AICPU Thread 3" or
+> "Thread 3 dlopens the SO", read that as the **tensormap (device-orch)
+> mechanics this runtime inherits** — under host_build_graph that same work
+> happens on the **host** before launch.
 
-1. Python tooling compiles kernels and orchestration into shared objects.
-2. `register_callable_impl` uploads the entire ChipCallable buffer (orch SO + all child kernel binaries) in one shot via `host_api.upload_chip_callable_buffer`, then dlopens the orchestration SO and resolves the entry symbol. For each child, host computes `chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)` and records it in the callable's `kernel_addrs` table; `bind_callable_to_runtime` later replays those into `Runtime::func_id_to_addr_[child_func_id(i)]` via `Runtime::replay_function_bin_addr` before each run. See `src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp`.
-3. The orchestration function runs on the host and builds the graph. Because it runs on host, it can (and sometimes must) dereference entry-tensor host pointers — e.g. to read a control tensor that drives per-block dispatch. So the orch owns its own H2D: it allocates device buffers, copies inputs to device, and registers outputs for copy-back via `record_tensor_pair(runtime, ...)`. It adds tasks via `add_task(runtime, ...)` and adds dependency edges via `add_successor(runtime, ...)`. (Contrast with `tensormap_and_ringbuffer`, where the orch runs on device AICPU and `runtime_maker.cpp` centralizes H2D using the chip-level `ArgDirection` signature.)
-4. The populated `Runtime` is copied to device memory by the platform layer. AICPU then runs the executor with this Runtime snapshot.
+---
 
-## Execution Flow (Device)
+## 1. Runtime Variants
 
-1. `aicpu_executor.cpp` performs core discovery, handshake initialization, and ready-queue seeding using `Runtime::get_initial_ready_tasks`.
-2. Scheduler threads maintain per-core and global ready queues. When a task is ready, the scheduler publishes the task pointer and signals the core via `DATA_MAIN_BASE`.
-3. AICore reads the task_id from `DATA_MAIN_BASE`, executes the kernel at `Task::function_bin_addr`, and writes FIN to `COND` on completion.
-4. AICPU observes completion, resolves dependencies by decrementing fanin, and enqueues newly-ready tasks.
-5. The executor shuts down cores by setting `Handshake::control=1` after all tasks complete.
+Two runtime backends exist under `src/a2a3/runtime/`, each representing a different orchestration and scheduling strategy.
 
-## Finalize And Cleanup
+### 1.1 host_build_graph
 
-`validate_runtime_impl` copies all recorded output tensors back to the host and frees device allocations recorded in tensor pairs. See `src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp`.
+The host-orchestration variant of `tensormap_and_ringbuffer`: it shares the same
+ring-buffer task storage, GM heap, and TensorMap dependency tracking, and runs
+the orchestration SO **on the host CPU** to build the complete task graph before
+launching device execution. The device then boots scheduler-only.
 
-## Key Files
+- **Task storage**: `PTO2TaskDescriptor[]` in shared memory ring buffer (same as 1.2)
+- **Dependencies**: automatically derived from tensor read/write patterns via TensorMap (same as 1.2)
+- **Scheduling**: AICPU attaches the host-populated, already-device-addressed SM and dispatches the pre-built graph
+- **Use case**: host-side graph construction; device runs no orchestrator thread
 
-- `src/a2a3/runtime/host_build_graph/runtime/runtime.h`
-- `src/a2a3/runtime/host_build_graph/runtime/runtime.cpp`
-- `src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp`
-- `src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp`
+### 1.2 tensormap_and_ringbuffer (PTO2)
+
+The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking.
+
+- **Task storage**: `PTO2TaskDescriptor[]` in shared memory ring buffer
+- **Memory**: GM Heap ring for output buffer allocation
+- **Dependencies**: automatically derived from tensor read/write patterns via TensorMap
+- **Thread model**: 3 scheduler threads + 1 orchestrator thread on AICPU
+- **Ring depth** (where host_build_graph diverges): tensormap splits its rings
+  per scope depth; host_build_graph builds the whole graph on the host with no
+  execution-time reclaim, so its HeapRing, TaskRing, and DepPool are single
+  whole-graph-resident instances (`PTO2_MAX_RING_DEPTH == 1`, all scope depths map to ring 0).
+- **Use case**: production workloads; supports streaming, flow control, and large batch sizes
+
+---
+
+## 2. Platform Abstraction
+
+Two platform implementations exist under `src/platform/`, sharing a common interface.
+
+### 2.1 a2a3 (Real Ascend Hardware)
+
+| Component | Description |
+| --------- | ----------- |
+| `device_runner.cpp` | Uses CANN APIs: `rtMalloc`, `rtMemcpy`, `rtLaunchKernel` |
+| `memory_allocator.cpp` | Wraps `rtMalloc`/`rtFree` with allocation tracking |
+| `aicore/kernel.cpp` | `KERNEL_ENTRY(aicore_kernel)` → `aicore_execute` |
+| `aicpu/kernel.cpp` | `DynTileFwkBackendKernelServer` entry → `aicpu_execute` |
+| `spin_hint.h` | ARM `wfe`/`yield` instructions for efficient spinning |
+
+### 2.2 a2a3sim (Thread Simulation)
+
+| Component | Description |
+| --------- | ----------- |
+| `device_runner.cpp` | Uses `std::thread` to simulate AICPU/AICore |
+| `memory_allocator.cpp` | Wraps `malloc`/`free` |
+| `aicore/kernel.cpp` | `aicore_execute_wrapper` sets `g_sim_reg_base` per core |
+| `upload_chip_callable_buffer` | Copy ChipCallable bytes to a host scratch, `dlopen` each child SO, `dlsym` "kernel_entry", patch the scratch's `resolved_addr_` with the function pointer |
+
+### 2.3 Platform Constants (`platform_config.h`)
+
+| Constant | Value | Description |
+| -------- | ----- | ----------- |
+| `PLATFORM_MAX_BLOCKDIM` | 24 | Maximum blocks (each = 1 AIC + 2 AIV) |
+| `PLATFORM_MAX_AICPU_THREADS` | 4 | AICPU thread count (3 schedulers + 1 orchestrator) |
+| `PLATFORM_MAX_AIC_PER_THREAD` | 24 | Max AIC cores per scheduler thread |
+| `PLATFORM_MAX_AIV_PER_THREAD` | 48 | Max AIV cores per scheduler thread |
+| `PLATFORM_PROF_SYS_CNT_FREQ` | 50 MHz | System counter frequency for profiling |
+
+---
+
+## 3. Shared Memory Layout
+
+The orchestrator and schedulers communicate through a contiguous shared memory region in Global Memory (GM). The single ring's TaskDescriptor and DepListPool sections are laid out by `pto2_sm_layout::ring_segment_offsets`.
+
+```text
+┌─────────────────────────────┐  offset 0
+│  PTO2SharedMemoryHeader     │  (per-ring flow control + layout, global flags)
+├─────────────────────────────┤  aligned
+│  Single-ring region:        │
+│    PTO2TaskDescriptor[N]    │  N = task_window_size per ring
+│    PTO2TaskPayload[N]       │
+│    PTO2TaskSlotState[N]     │
+└─────────────────────────────┘
+```
+
+### 3.1 SharedMemoryHeader Fields
+
+| Field | Writer | Reader | Purpose |
+| ----- | ------ | ------ | ------- |
+| `current_task_index` | Orchestrator | Scheduler | Next task ID to allocate (task ring head) |
+| `last_task_alive` | Scheduler | Orchestrator | Oldest still-active task (task ring tail) |
+| `heap_top` | Orchestrator | Scheduler | Heap ring allocation pointer |
+| `heap_tail` | Scheduler | Orchestrator | Heap ring reclamation pointer |
+| `orchestrator_done` | Orchestrator | Scheduler | Signals orchestration completion |
+| `task_window_size` | Init | Both | Number of task slots (per-ring, in `PTO2SharedMemoryRingHeader`) |
+| `heap_size` | Init | Both | Heap total size (per-ring, in `PTO2SharedMemoryRingHeader`) |
+| `task_descriptors_offset` | Init | Both | Offset to TaskDescriptor array in SM (per-ring) |
+| `total_size` | Init | Both | Total shared memory size |
+| `graph_output_ptr` | Orchestrator | Host | Address of final output (packed buffer) |
+| `graph_output_size` | Orchestrator | Host | Size of final output in bytes |
+
+### 3.2 Size Calculation
+
+```text
+total = ALIGN(Header)
+      + Σ_ring [ ALIGN(window_size * sizeof(TaskDescriptor))
+               + ALIGN(window_size * sizeof(TaskPayload))
+               + ALIGN(window_size * sizeof(TaskSlotState)) ]
+```
+
+Alignment is 64 bytes (`PTO2_ALIGN_SIZE`).
+
+---
+
+## 4. Ring Buffer Mechanisms
+
+> **Single ring**: TaskRing, HeapRing, and DepPool are single whole-graph
+> instances (`PTO2_MAX_RING_DEPTH == 1`). The host builds the entire graph
+> before the device boots and there is no execution-time reclaim, so a
+> per-scope-depth ring split would buy nothing — every scope depth maps to
+> ring 0.
+
+### 4.1 Task Ring
+
+The task ring manages task slot allocation with back-pressure flow control.
+
+**Structure** (`PTO2TaskRing`):
+
+- `descriptors`: pointer to `TaskDescriptor[]` in shared memory
+- `window_size`: number of slots (power of 2)
+- `current_index`: next task ID to allocate (monotonically increasing)
+- `last_alive_ptr`: pointer to `header->last_task_alive`
+
+**Slot mapping**: `slot = task_id & (window_size - 1)`
+
+**Allocation** (`PTO2TaskAllocator::alloc`):
+
+```text
+active_count = current_index - *last_alive_ptr
+if active_count < window_size - 1:
+    allocate slot, advance current_index
+else:
+    spin-wait (back-pressure from scheduler)
+```
+
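+**Reclamation**: Scheduler threads advance `last_task_alive` via lock-free CAS when the oldest task reaches state CONSUMED (2). This frees slots for reuse. (Inherited tensormap mechanics: host_build_graph keeps the whole graph resident and performs no execution-time reclaim.)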
+
+**Flow control**: When the ring is full, the orchestrator blocks until the scheduler advances `last_task_alive`. With `PTO2_RING_TASK_WINDOW=16` and 208 tasks, slots are recycled ~13 times each.
+
+### 4.2 Heap Ring
+
+The heap ring manages output buffer allocation from a circular GM heap.
+
+**Structure** (`PTO2HeapRing`):
+
+- `base`: GM heap base address
+- `size`: total heap size (default 1 GB)
+- `top`: allocation pointer (local to orchestrator)
+- `tail_ptr`: pointer to `header->heap_tail` (updated by scheduler)
+
+**Allocation**: Buffers are allocated contiguously from `top`. When reaching the end, allocation wraps to the beginning if `tail` has advanced far enough. Buffers never straddle the wrap-around boundary.
+
+**Reclamation**: When `last_task_alive` advances past a task, its `packed_buffer_end` is used to advance `heap_tail`, freeing the memory region.
+
+### 4.3 Dependency List Pool
+
+A simple bump allocator for `PTO2DepListEntry` nodes used in fanin/fanout linked lists.
+
+- **Entry 0**: NULL sentinel (`task_id=-1, next_offset=0`)
+- **Allocation**: `pool->top++`, wraps around when full
+- **Reclamation**: implicit — old entries become unreachable as `last_task_alive` advances
+
+### 4.4 Flow Control and Back-Pressure
+
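+The ring buffer mechanism provides **flow control** between the orchestrator (producer) and the scheduler (consumer). When a ring is exhausted, the orchestrator **blocks** — it cannot submit new tasks or allocate more output memory until the scheduler reclaims slots/space by advancing the watermarks. These are the tensormap (device-orch) mechanics this runtime inherits: under host_build_graph the orchestrator runs to completion before any scheduler exists and nothing is reclaimed during host orchestration, so the task window, GM heap, and `dep_pool` must be sized for the whole graph.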
+
+**Task Ring back-pressure**: When `active_count = current_index - last_task_alive >= window_size - 1`, `PTO2TaskAllocator::alloc` spin-waits until the scheduler completes tasks and advances `last_task_alive`.
+
+**Heap Ring back-pressure**: When the heap has insufficient contiguous space, `PTO2TaskAllocator::alloc` spin-waits until the scheduler advances `heap_tail` past completed tasks' output buffers.
+
+**TensorMap pool back-pressure**: Before STEP 4 registers a task's outputs, the orchestrator's `ensure_tensormap_capacity` reserves pool space for the inserts. When the shared entry pool is exhausted, it reclaims retired entries across all rings and spin-waits until reclaim actually frees entries, with a 500 ms wall-clock deadlock backstop (see Section 5.4).
+
+This back-pressure is essential for correctness with small ring sizes — for example, with `PTO2_RING_TASK_WINDOW=16` and 208 tasks, the orchestrator blocks ~192 times, each time waiting for the scheduler to drain completed tasks before continuing.
+
+### 4.5 Deadlock Detection
+
+A ring that is **too small** can cause a **deadlock**. The root cause is the scope mechanism: each task's `fanout_count` includes a reference from its owning scope. The scope reference is only released when `scope_end()` runs — but `scope_end()` is called by the orchestrator, which is blocked waiting for ring space. This creates a circular dependency:
+
+```text
+Orchestrator blocked on task_ring_alloc (ring full)
+    → needs scheduler to advance last_task_alive
+    → needs tasks to reach CONSUMED state (fanout_count == 0)
+    → needs scope_end() to release scope reference
+    → needs orchestrator to continue
+    → DEADLOCK
+```
+
+The runtime detects this automatically by counting spin iterations in the allocation functions:
+
+**Periodic BLOCKED warnings** (every 10,000 spins):
+
+```text
+[TaskRing] BLOCKED (Flow Control): current=208, last_alive=192, active=16/16 (100.0%), spins=10000
+[HeapRing] BLOCKED: requesting 4096 bytes, available=0, top=65536, tail=0, spins=10000
+```
+
+**Deadlock detection** (after 100,000 spins with no progress):
+
+```text
+FATAL: Flow Control Deadlock Detected!
+Task Ring is FULL and no progress after 100000 spins.
+  - Active tasks:  16
+  - Window size:   16
+Root Cause:
+  Tasks cannot transition to CONSUMED state because fanout_count
+  includes 1 for the owning scope, and scope_end() requires the
+  orchestrator to continue — creating a circular dependency.
+Solution:
+  Recommended: 32 (at least 2x current active tasks)
+```
+
+The FATAL message is logged to the device log and the process exits. The solution is to increase the ring size so that it can hold at least all tasks within the largest parallel scope. For example, if a scope submits 13 tasks, `task_window >= 14` is required (13 + 1 to distinguish full from empty) — in practice 16, because `window_size` must be a power of 2 for the `task_id & (window_size - 1)` slot mapping.
+
+**Sizing guideline**: `task_window_size` must be larger than the maximum number of tasks in any single `PTO2_SCOPE`. A safe choice is `2 × max_tasks_per_scope` or simply the default 65536 for production.
+
+---
+
+## 5. TensorMap and Automatic Dependency Tracking
+
+### 5.1 Purpose
+
+TensorMap maintains a mapping from tensor memory regions to their producer task IDs. When a new task reads a tensor (INPUT/INOUT), TensorMap automatically discovers the producer and establishes a dependency edge.
+
+### 5.2 Hash Table Design
+
+- **Key**: tensor base address (`buffer.addr`)
+- **Value**: producer task ID, with overlap detection for sub-regions
+- **Overlap**: `COVERED` (new region fully contains old) or `OTHER` (partial overlap)
+- Sub-tensors of the same base tensor hash to the same bucket, enabling overlap detection
+
+### 5.3 Entry Pool Management
+
+Unlike the Task Ring and Heap Ring, TensorMap entries are **not** managed by a ring buffer. Instead, a **fixed-size pool + free list** is used:
+
+1. **Free list first**: `free_entry_list[]` stores pointers to released entries. Allocation pops from here (O(1)).
+2. **Bump allocation**: if free list is empty, `entry_pool[next_entry_idx++]` allocates from the end of the pool.
+3. **Blocking reclaim**: if the pool is short of the inserts a task needs, the orchestrator's `ensure_tensormap_capacity` reads the latest `last_task_alive` for every ring and calls `reclaim_retired_all` (`cleanup_retired` per ring) to batch-free entries belonging to retired tasks, returning them to the free list, before the inserts proceed.
+
+This design avoids the complexity of ring-based wrapping while still being bounded by `PTO2_TENSORMAP_POOL_SIZE` (default 65536 entries).
+
+### 5.4 Stale Entry Cleanup: Three-Layer Defense
+
+TensorMap must ensure entries for retired tasks (`producer_task_id < last_task_alive`) are removed, so that:
+
+- The pool does not grow unboundedly (capacity is finite)
+- Lookup performance does not degrade as stale entries accumulate in bucket chains
+
+Three complementary mechanisms achieve this:
+
+**Layer 1 — Chain Truncation during Lookup** (lazy, per-bucket):
+
+Since `insert` always prepends to the bucket head, entries in each bucket chain are in **descending task_id order**. When `PTO2TensorMap::lookup` encounters the first stale entry (`producer_task_id < last_task_alive`), all subsequent entries in the chain are guaranteed stale too. The entire tail is truncated in one operation using `prev_in_bucket` pointers for O(1) unlinking.
+
+This guarantees lookup only traverses valid entries — O(valid_entries_in_bucket), not O(total_entries).
+
+**Layer 2 — Periodic Batch Cleanup** (`cleanup_retired`, per-task):
+
+Every time the orchestrator submits a task (Step 0 of `PTO2OrchestratorState::submit_task`), it calls `PTO2TensorMap::sync_tensormap`. When `last_task_alive` has advanced by more than `PTO2_TENSORMAP_CLEANUP_INTERVAL` (default 64) tasks since the last cleanup, `PTO2TensorMap::cleanup_retired` runs.
+
+This uses the **per-task entry chain** (`task_entry_head[task_slot]`) — each task's entries are doubly-linked together at insert time via `next_in_task`/`prev_in_task`, allowing O(entries_per_task) cleanup without scanning the entire pool or all buckets. Freed entries are returned to `free_entry_list` for immediate reuse.
+
+**Layer 3 — Back-Pressure on Pool Exhaustion** (blocking):
+
+Before STEP 4 inserts a task's outputs, `ensure_tensormap_capacity` checks the free list + bump region against the task's needed entry count. If short, it reclaims retired entries across all rings and blocks until reclaim frees enough entries. Progress is measured by entries actually freed, not by watermark movement — a ring can retire zero-output tasks, advancing `last_task_alive` without freeing any entry. A pool that frees nothing for a 500 ms wall-clock timeout is a genuine deadlock: it latches `PTO2_ERROR_TENSORMAP_OVERFLOW` and unwinds, matching the task allocator and fanin spill pool.
+
+This forms a back-pressure mechanism analogous to the Task Ring's flow control.
+
+**Summary**:
+
+| Layer | Trigger | Method | Guarantees |
+| ----- | ------- | ------ | ---------- |
+| Chain Truncation | Every lookup | Truncate stale tail of bucket chain | Lookup only visits valid entries |
+| Periodic Cleanup | Every 64 retired tasks | Walk per-task chains, free entries | Pool capacity reclaimed in bounded time |
+| Pool Back-Pressure | Pool exhausted | Block until reclaim frees entries (500 ms deadlock backstop) | Hard capacity bound, no OOM |
+
+In steady state, the number of valid TensorMap entries ≈ `active_tasks × avg_outputs_per_task`. With the default `task_window=65536` and `pool_size=65536`, this is well within bounds. With small windows (e.g., `task_window=16`), active entries are even fewer (~16 × a few), and cleanup runs frequently.
+
+### 5.5 Dependency Discovery Flow
+
+When `PTO2OrchestratorState::submit_task` processes parameters:
+
+1. **INPUT/INOUT**: `PTO2TensorMap::lookup` searches for overlapping producers (with chain truncation)
+2. For each producer found: `append_fanin_or_fail` adds the dependency
+3. **OUTPUT/INOUT**: `PTO2TensorMap::insert` registers the current task as the new producer at bucket head
+4. Stale entries are pruned lazily during lookup (Layer 1) and periodically by cleanup (Layer 2)
+
+---
+
+## 6. Task Descriptor and States
+
+### 6.1 PTO2TaskDescriptor (Hot Path)
+
+| Field | Description |
+| ----- | ----------- |
+| `task_id` | Canonical mixed-task ID (64-bit: `ring_id << 32 \| local_id`; `ring_id` is always 0 in this single-ring runtime). |
+| `kernel_id[3]` | Per-slot kernel IDs: `[AIC, AIV0, AIV1]`; `INVALID_KERNEL_ID` = inactive |
+| `active_mask` | Bitmask of active subtask slots: `bit0=AIC`, `bit1=AIV0`, `bit2=AIV1` |
+| `completed_subtasks` | Atomic counter; each subtask increments on completion. Trigger condition: `completed_subtasks == total_required_subtasks` |
+| `fanin_count` | Number of producer dependencies (set by scheduler during wiring) |
+| `fanout_lock` | Per-task spinlock for concurrent fanout modification (used by scheduler wiring + completion) |
+| `fanout_head` | Head of fanout consumer list (pointer, protected by `fanout_lock`) |
+| `fanout_count` | 1 (scope ref) + number of consumers |
+| `packed_buffer_base` | Start of packed buffer in GM Heap |
+| `packed_buffer_end` | End of packed buffer (for heap reclamation) |
+
+### 6.1b PTO2TaskPayload (Cold Path)
+
+| Field | Description |
+| ----- | ----------- |
+| `tensors[16]` | Tensor descriptors for parameters |
+| `scalar_value[16]` | Scalar parameter values |
+| `is_tensor[16]` | Whether each parameter is tensor or scalar |
+| `param_count` | Number of valid parameters |
+| `fanin_slot_states[]` | Producer slot state pointers (used by `on_task_release`) |
+| `fanin_actual_count` | Actual fanin count |
+
+### 6.2 Task State Machine
+
+```text
+  [0] PENDING ──worker(s) done──► [1] COMPLETED ──fanout done──► [2] CONSUMED
+      ▲                                                                │
+      │                                                                ▼
+      └──────────────────── slot recycled ◄───────────────────────────┘
+```
+
+In the scheduler's `task_state[]` array (`std::atomic<PTO2TaskState>`):
+
+- **0 (PENDING)**: slot is allocated and remains PENDING through "waiting on
+  producers", "queued in ready queue", and "dispatched to a worker"; ready vs
+  running is derived from `fanin_refcount` and per-core `running_slot_state`
+- **1 (COMPLETED)**: hardware execution complete, output may still be in use
+- **2 (CONSUMED)**: output fully consumed, buffers can be released
+
+---
+
+## 7. Orchestrator
+
+### 7.1 PTO2OrchestratorState
+
+The orchestrator builds the task graph by calling the user-provided
+orchestration function. In host_build_graph it runs **on the host** (see the
+divergence note in the Overview); the `PTO2OrchestratorState` below is the same
+structure tensormap drives on AICPU thread N-1.
+
+Key members:
+
+- `ring`: the single `PTO2RingSet` (HeapRing + TaskRing + FaninPool).
+- `tensor_map`, `tensor_pool`: dependency tracking
+- `scope_tasks[]`, `scope_begins[]`, `scope_stack_top`: scope nesting stack (flat buffer partitioned by level)
+- `scheduler`: pointer to scheduler state (for inline fanout wiring and ready queue access)
+- `gm_heap_base`, `gm_heap_size`: GM heap for output buffers
+
+### 7.2 Task Submission Flow (`PTO2OrchestratorState::submit_task`)
+
+| Step | Operation |
+| ---- | --------- |
+| 0 | `PTO2TensorMap::sync_tensormap` — prune stale TensorMap entries |
+| 1 | `PTO2TaskAllocator::alloc` — allocate task slot (may block on flow control) |
+| 2 | Initialize task descriptor + slot state, copy parameters |
+| 3 | **Lookup**: for each INPUT/INOUT param, search TensorMap for producers; collect producer pointers in `PTO2FaninBuilder` |
+| 4 | **Insert**: register OUTPUT/INOUT args in TensorMap |
+| 5 | **Record fanin metadata**: store producer pointers in `payload->fanin_inline_slot_states[]` (+ spill pool if >64); increment each producer's `fanout_count` (no lock needed — single writer). This step runs **before** `payload.init()`. |
+| 6 | **Wire fanout inline** (`PTO2SchedulerState::wire_task`): lock each producer, allocate `dep_pool` entries, prepend the consumer to each producer's `fanout_head`, and seed the ready queue when all deps are already satisfied. |
+
+> **Note**: Under host_build_graph the orchestrator runs to completion on the
+> host before any scheduler exists, so fanout wiring is done **inline in
+> submit** (Step 6) rather than deferred to a device-side wiring queue. The
+> `dep_pool` is sized for the whole graph — there is no reclaim during host
+> orchestration, exactly like the task window and GM heap — so an exhausted
+> pool latches `PTO2_ERROR_DEP_POOL_OVERFLOW` and aborts the run. The
+> `fanout_head` / dep-entry / ready-queue pointers this produces are host-DDR
+> addresses; `relocate_host_orch_image` shifts them to device addresses before
+> H2D (SM pointers and arena pointers by independent deltas).
+>
+> The inherited tensormap (device-orch) variant instead defers this to the
+> scheduler's global `wiring_queue` (SPSC, drained by thread 0) to keep the
+> on-device orchestrator's submit path off `fanout_lock` / `dep_pool`. That
+> path is described below for the shared mechanics; host_build_graph performs
+> the same steps synchronously in Step 6.
+
+### 7.3 Fanout Wiring (`wire_task`)
+
+Whether wired inline on the host (host_build_graph) or drained from the
+device-side wiring queue (tensormap), each task is wired by the same
+`wire_task` logic:
+
+1. Sets `fanin_count = N + 1` (the extra +1 is a guard reference that prevents premature readiness)
+2. For each producer in `payload->fanin_slot_states[]`:
+   - **Acquires** the producer's `fanout_lock`
+   - Checks `task_state >= COMPLETED` (early-finished optimization)
+   - If not completed: prepends consumer to producer's `fanout_head` via `dep_pool.prepend`
+   - **Releases** `fanout_lock`
+3. Atomically releases the +1 guard reference plus the early_finished count via `fanin_refcount.fetch_add`
+4. If all deps satisfied: pushes task to ready queue
+
+The scheduler's completion handler mirrors this:
+
+1. **Acquire** `fanout_lock`, mark `task_state = COMPLETED`, read `fanout_head`, **release** lock
+2. Traverse fanout list, incrementing each consumer's `fanin_refcount`
+3. Mark `task_state = CONSUMED` when `fanout_refcount` reaches `fanout_count`
+
+This protocol guarantees every consumer is accounted for exactly once.
+
+### 7.4 Scope Mechanism (`PTO2_SCOPE`)
+
+Scopes control the lifetime of intermediate buffers. Each scope:
+
+- Tracks tasks submitted within it via a flat `scope_tasks[]` buffer partitioned by `scope_begins[]`
+- On `scope_end`: increments `fanout_refcount` for scope tasks; when it reaches `fanout_count`, the task's packed buffer can be reclaimed
+
+```cpp
+PTO2_SCOPE(rt) {
+    // Tasks submitted here belong to this scope
+    rt_submit_aic_task(FUNC_QK, args);
+    rt_submit_aiv_task(FUNC_SF, args);
+}
+// scope_end: scope reference released from all tasks above
+```
+
+**Output tensor lifetime — single-scope only.** `submit_task` returns a
+`TaskOutputTensors`, and `get_ref(i)` hands back a `const Tensor&`. Both are
+backed by pointers into the submitting task's `PTO2TaskPayload::tensors[]`,
+which lives in a ring-buffer slot. host_build_graph does not recycle slots
+within a run (whole-graph-resident, no execution-time reclaim), so the storage
+is not overwritten mid-run; the constraint is instead structural — the
+`TaskOutputTensors` and the refs it returns are scoped to the orchestration
+function that built the graph and must not be retained past it.
+
+Therefore the `TaskOutputTensors` instance, the references it returns, and
+any pointer derived from them MUST NOT outlive the `PTO2_SCOPE` in which
+submit was called. The typical safe pattern is:
+
+```cpp
+PTO2_SCOPE() {
+    TaskOutputTensors outs = rt_submit_aic_task(FUNC_QK, args);
+    const Tensor &y = outs.get_ref(0);
+    // Use y here and in subsequent submits within the same scope.
+}   // outs and y both go out of scope; no dangling references can escape.
+```
+
+Anti-patterns that compile but silently break:
+
+```cpp
+const Tensor *kept = nullptr;
+PTO2_SCOPE() {
+    TaskOutputTensors outs = rt_submit_aic_task(FUNC_QK, args);
+    kept = &outs.get_ref(0);          // escapes the scope
+}
+// `kept` still points at a payload slot. In a slot-recycling runtime (e.g.
+// tensormap_and_ringbuffer) later submits reuse the slot and `*kept` aliases
+// an unrelated task's tensor, with no runtime diagnostic; same contract here.
+
+TaskOutputTensors outs;               // declared in outer scope
+PTO2_SCOPE() {
+    outs = rt_submit_aic_task(FUNC_QK, args);
+}
+const Tensor &t = outs.get_ref(0);    // same hazard: outs survives scope
+```
+
+This invariant is intentionally not runtime-checked. A reused slot carries
+a different but valid `owner_task_id`, so an assertion based on
+`owner_task_id` cannot distinguish "still the original task" from
+"silently aliased to a newer task". Treat the rule as a static contract,
+verified by review.
+
+---
+
+## 8. Scheduler
+
+### 8.1 Thread Model
+
+With `aicpu_thread_num=4`, the AICPU runs 4 threads:
+
+| Thread | Role | Cores |
+| ------ | ---- | ----- |
+| 0 | Scheduler | 6 AIC + ~13 AIV |
+| 1 | Scheduler | 6 AIC + ~13 AIV |
+| 2 | Scheduler | 6 AIC + ~13 AIV |
+| 3 | Orchestrator | none |
+
+Core assignment: AICs and AIVs are divided equally among the 3 scheduler threads.
+
+### 8.2 Scheduler Main Loop
+
+Each scheduler thread runs a tight loop with two main phases:
+
+**Phase 1 — Completion Handling**:
+
+- Poll register `COND` on each managed core
+- When `TASK_FIN_STATE` detected: record completion timestamps, call `on_subtask_complete(task_id, subslot)` to increment the completion counter; when `completed_subtasks == total_required_subtasks`, trigger `on_task_complete(task_id)` which marks `task_state[slot] = COMPLETED`, acquires fanout lock, and traverses fanout list (incrementing consumers' `fanin_refcount`). Under host_build_graph there is no `COMPLETED → CONSUMED` flip and the `last_task_alive` watermark is not advanced (see §8.4)
+
+**Phase 2 — Dispatch**:
+
+- For each idle core: pop a task from the matching shape-based ready queue (lock-free MPMC Vyukov queue, one per resource shape)
+- Build `PTO2DispatchPayload` from `TaskDescriptor` with `task_id`, `subslot`, `kernel_id`, and `core_type`
+- Write task pointer to `Handshake.task`, signal AICore via register `DATA_MAIN_BASE`
+
+After these phases, the scheduler updates profiling headers and checks for termination (all tasks completed and orchestrator done).
+
+### 8.3 Ready Queue Design
+
+Ready queues use a lock-free bounded MPMC (Vyukov) design:
+
+- One `PTO2ReadyQueue` per resource shape (5 shapes: `AIC_ONLY`, `AIV_X1`, `AIV_X2`, `AIC_AIV_X1`, `AIC_AIV_X2`)
+- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks to the queue matching `task->active_mask.to_shape()`
+- **Pop**: scheduler threads pop from the queue matching the idle core's resource shape
+- Per-slot sequence counters prevent ABA problems
+- `enqueue_pos` and `dequeue_pos` are on separate cache lines to avoid false sharing
+
+### 8.4 No Runtime Watermark Advancement (host-orch)
+
+host_build_graph is whole-graph-resident: the host builds the entire task
+graph into a single ring and H2Ds it once, and the device runs it with **no
+execution-time reclaim**. `last_task_alive` is initialized to 0 and is **not
+advanced at runtime** — slots are never recycled within a run, so there is no
+`advance_ring_pointers` / per-ring `advance_lock` step (both removed; they
+existed only for the device-orch reclaim path in `tensormap_and_ringbuffer`).
+Completion is tracked by `completed_tasks_`; consumer waits key on
+`fanout_refcount` rather than on a watermark.
+
+`reset_for_reuse()` survives but runs **once at init**
+(`pto_shared_memory.cpp`) to zero each slot's dynamic scheduling fields before
+the host orchestrator populates them — it is not a runtime recycle hook.
+
+### 8.5 SchedulerContext
+
+All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and delegates everything else to `SchedulerContext`.
+
+Public surface (called from `AicpuExecutor::init/run/deinit`):
+
+| Method | Phase | Purpose |
+| ------ | ----- | ------- |
+| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
+| `bind_runtime(rt)` | boot thread | Wire `sched_` to `rt->scheduler` once the boot thread attaches the host-built `rt` |
+| `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop |
+| `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores; PMU finalize when enabled |
+| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) |
+| `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
+| Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` |
+
+Private internals are split across three .cpp files by responsibility:
+
+- `scheduler_completion.cpp` — completion polling, drain protocol
+- `scheduler_dispatch.cpp` — task dispatch loop and helpers
+- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
+
+`AicpuExecutor` does not call `handshake_*`, `assign_*`, `reassign_*`, or `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
+
+---
+
+## 9. AICore Worker Interaction
+
+### 9.1 Handshake Protocol
+
+Each AICore worker has a `Handshake` struct in shared memory:
+
+| Field | Direction | Purpose |
+| ----- | --------- | ------- |
+| `task` | AICPU→AICore | Pointer to `PTO2DispatchPayload` |
+| `control` | AICPU→AICore | 0=normal, 1=shutdown |
+| `perf_records_addr` | AICPU→AICore | Performance buffer address |
+
+### 9.2 Register-Based Dispatch
+
+Instead of polling a shared-memory status flag, the production protocol uses hardware registers.
+
+> **Note**: `task_id` is 64-bit but registers are 32-bit. A per-core monotonic dispatch counter (`s_dispatch_seq`) replaces `task_id` in register writes to prevent collisions.
+
+| Register | Direction | Usage |
+| -------- | --------- | ----- |
+| `DATA_MAIN_BASE` | AICPU→AICore | Write the dispatch token (per-core `s_dispatch_seq`, standing in for `task_id`) to dispatch (idle=0x7FFFFFFD); `EXIT_SIGNAL` to shut down |
+| `COND` | AICore→AICPU | `[bit31=state, bits30:0=dispatch token]`: ACK (state=0) or FIN (state=1) |
+
+**AICore execution loop**:
+
+1. Poll `DATA_MAIN_BASE` for value != AICPU_IDLE_TASK_ID
+2. Read payload from `Handshake.task`
+3. Write ACK to `COND`
+4. Execute kernel function via `func_id_to_addr` lookup
+5. Write FIN to `COND`
+
+### 9.3 PTO2DispatchPayload
+
+Built by the scheduler from `PTO2TaskDescriptor`:
+
+| Field | Description |
+| ----- | ----------- |
+| `task_id` | Mixed-task identifier (for completion aggregation) |
+| `subslot` | Which subtask slot this dispatch represents (`AIC`, `AIV0`, or `AIV1`) |
+| `kernel_id` | Function ID for this subtask slot |
+| `core_type` | AIC or AIV |
+| `function_bin_addr` | GM address of compiled kernel binary |
+| `num_args` | Number of arguments |
+| `args[]` | Tensor addresses and scalar values |
+
+---
+
+## 10. Kernel and Orchestration Loading
+
+### 10.1 Kernel Binary Loading
+
+1. **Host** compiles each kernel source (`.cpp`) into a binary (`.o` or `.so`)
+   and packs all children into a single `ChipCallable` buffer alongside the
+   orchestration SO.
+2. `host_api.upload_chip_callable_buffer(callable)` H2Ds the whole buffer
+   once and returns the device address of the ChipCallable header.
+3. For each child, host computes
+   `chip_dev + offsetof(ChipCallable, storage_) + callable->child_offset(i)`
+   and stores it in `Runtime.func_id_to_addr_[child_func_id(i)]`.
+4. When dispatching, the scheduler reads `func_id_to_addr_[fid]`, casts to
+   `const CoreCallable*`, reads `resolved_addr_`, and copies that into
+   `PTO2DispatchPayload.function_bin_addr`.
+
+### 10.2 Orchestration SO Loading
+
+1. **Host** compiles the orchestration source into a shared library (`.so`)
+2. The SO binary is embedded into `Runtime.device_orch_so_storage_[]` and copied to device
+3. **AICPU Thread 3** writes the SO to a temp file, calls `dlopen`
+4. `dlsym("aicpu_orchestration_config")` returns configuration (expected arg count)
+5. `dlsym("aicpu_orchestration_entry")` returns the orchestration function pointer
+6. Thread 3 creates a `PTO2Runtime`, calls the orchestration function within a `PTO2_SCOPE`
+7. After orchestration completes: `dlclose`, delete temp file
+
+### 10.3 Thread Startup Synchronization
+
+| Flag | Set by | Waited by | Purpose |
+| ---- | ------ | --------- | ------- |
+| `runtime_init_ready_` | Thread 3 | Threads 0-2 | Runtime and SM handle initialized |
+
+Profiling-subsystem init (`dump_args` / `pmu` / `dep_gen` / `l2_swimlane`) runs
+once in `SchedulerContext::init()` on the single-threaded cold path, before any
+scheduler/orchestrator thread starts — so it needs no cross-thread init
+handshake.
+
+Startup sequence:
+
+1. Thread 3: create SM handle + runtime → set `runtime_init_ready_`
+2. Scheduler threads: wait for `runtime_init_ready_` → enter main loop
+3. Thread 3: configure orchestrator-scheduler pointers → call orchestration function → set `orchestrator_done_`
+
+---
+
+## 11. PTO2 Orchestration API
+
+The orchestration API is defined in `pto_orchestration_api.h`. Orchestration code depends only on this header.
+
+### 11.1 Core API
+
+| Function/Macro | Purpose |
+| -------------- | ------- |
+| `rt_submit_task(mixed_kernels, args)` | Submit a mixed task with `MixedKernels` struct |
+| `rt_submit_aic_task(kernel_id, args)` | Convenience: submit AIC-only task |
+| `rt_submit_aiv_task(kernel_id, args)` | Convenience: submit AIV-only task |
+| `PTO2_SCOPE() { ... }` | RAII scope for buffer lifetime |
+| `rt_orchestration_done()` | Signal orchestration complete |
+
+### 11.2 Parameter Construction
+
+| Function | Description |
+| -------- | ----------- |
+| `make_tensor_external(ptr, shapes, ndim, dtype)` | Wrap an existing device pointer as a tensor |
+| `TensorCreateInfo(shapes, ndim, dtype)` | Describe a runtime-created output buffer |
+| `Arg::add_input(tensor)` | INPUT parameter — read by the task |
+| `Arg::add_output(create_info)` | OUTPUT parameter — runtime allocates and returns a Tensor |
+| `Arg::add_inout(tensor)` | INOUT parameter — existing tensor read then written |
+| `Arg::add_scalar(value)` | 64-bit scalar parameter |
+
+### 11.3 Resource Shapes
+
+Tasks are queued by resource shape, which is derived from the `active_mask` in the `MixedKernels` struct:
+
+| Shape | Active Mask | Description |
+| ----- | ----------- | ----------- |
+| `AIC_ONLY` | AIC only | AIC cores (matrix multiplication) |
+| `AIV_X1` | AIV0 or AIV1 only | Single AIV core (vector operations) |
+| `AIV_X2` | AIV0 + AIV1 | Two AIV cores |
+| `AIC_AIV_X1` | AIC + one AIV | AIC + single AIV core |
+| `AIC_AIV_X2` | AIC + AIV0 + AIV1 | Full cluster (AIC + two AIV cores) |
+
+### 11.4 Orchestration Export Interface
+
+Each orchestration `.so` must export:
+
+```cpp
+extern "C" PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count);
+extern "C" void aicpu_orchestration_entry(uint64_t* args, int arg_count);
+```
+
+---
+
+## 12. Example: Batch Paged Attention
+
+### 12.1 Kernel Configuration (`kernel_config.py`)
+
+```python
+KERNELS = [
+    {"func_id": 0, "name": "QK",      "source": "aic/aic_qk_matmul.cpp",       "core_type": "aic"},
+    {"func_id": 1, "name": "SF",      "source": "aiv/aiv_softmax_prepare.cpp", "core_type": "aiv"},
+    {"func_id": 2, "name": "PV",      "source": "aic/aic_pv_matmul.cpp",       "core_type": "aic"},
+    {"func_id": 3, "name": "UP",      "source": "aiv/aiv_online_update.cpp",   "core_type": "aiv"},
+    {"func_id": 5, "name": "AIV_HUB", "source": "aiv/aiv_hub.cpp",            "core_type": "aiv"},
+]
+
+ORCHESTRATION = {
+    "source": "orchestration/paged_attention_orch.cpp",
+    "function_name": "aicpu_orchestration_entry",
+}
+
+RUNTIME_CONFIG = {
+    "runtime": "tensormap_and_ringbuffer",
+    "aicpu_thread_num": 4,
+    "block_dim": 24,
+}
+```
+
+### 12.2 Orchestration Structure
+
+```cpp
+void aicpu_orchestration_entry(uint64_t* args, int arg_count) {
+    // Unpack args: query, key_cache, value_cache, block_table, context_lens, out, config
+    for (q_idx = 0; q_idx < q_loop; q_idx++) {
+        for (batch_start = 0; batch_start < batch; batch_start += IN_CORE_BATCH) {
+            PTO2_SCOPE() {
+                // Describe accumulator tensors (oi, li, mi) with TensorCreateInfo
+                // Submit AIV_HUB to initialize accumulators
+                for (bn = 0; bn < max_bn; bn++) {
+                    // Allocate intermediate tensors (sij, pij, mij, lij, oi_new)
+                    // Submit QK (CUBE) → SF (VECTOR) → PV (CUBE) → UP (VECTOR)
+                }
+            }
+        }
+    }
+}
+```
diff --git a/src/a2a3/runtime/host_build_graph/docs/SCALAR_DATA_ACCESS.md b/src/a2a3/runtime/host_build_graph/docs/SCALAR_DATA_ACCESS.md
new file mode 100644
index 000000000..ef1de83b4
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/docs/SCALAR_DATA_ACCESS.md
@@ -0,0 +1,137 @@
+# Scalar Data Access — get/set_tensor_data Design
+
+## 1. Overview
+
+During task graph construction, orchestration sometimes needs to read InCore kernel results (for control-flow decisions) or write initial values into tensors. `get_tensor_data` / `set_tensor_data` provide **blocking** cross-layer data access, allowing orchestration to safely read and write tensor data.
+
+**Core design principle**: Reuse the existing TensorMap dependency tracking mechanism — no new synchronization infrastructure.
+
+## 2. API
+
+```cpp
+// Blocking read: returns value at the given indices (default: raw uint64_t bits)
+// Specify T for typed read: float val = get_tensor_data<float>(tensor, 1, idx);
+template<typename T = uint64_t>
+T get_tensor_data(const Tensor& tensor, uint32_t ndims, const uint32_t indices[]);
+
+// Blocking write: stores value at the given indices (type deduced from argument)
+// Typed write: set_tensor_data(tensor, 1, idx, 42.0f);
+template<typename T = uint64_t>
+void set_tensor_data(Tensor& tensor, uint32_t ndims, const uint32_t indices[], T value);
+```
+
+Both call into the runtime through the ops table — orchestration .so needs no runtime symbol linkage.
+
+## 3. Blocking Interface Design
+
+### 3.1 get_tensor_data Flow
+
+```text
+addr null-check → TensorMap lookup → spin-wait producer COMPLETED → compute flat offset → memcpy read
+```
+
+- **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0
+- **TensorMap lookup**: find producer task by `buffer.addr`
+- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED`
+- **No producer** (lookup callback never fires): skip waiting, read immediately
+
+### 3.2 set_tensor_data Flow
+
+```text
+addr null-check → TensorMap lookup → spin-wait producer COMPLETED → spin-wait consumers done → memcpy write
+```
+
+One extra step versus get_tensor_data: wait for all consumers to finish (`fanout_refcount >= fanout_count - 1`, excluding the scope reference).
+
+### 3.3 Timeout
+
+- Uses cycle counter (`get_sys_cnt_aicpu()`), checked every 1024 spins
+- Threshold: `PTO2_TENSOR_DATA_TIMEOUT_CYCLES` (~10 s at 1.5 GHz)
+- On timeout: sets `orch.fatal = true`, preventing further task submission
+
+## 4. add_output with Initial Value
+
+```cpp
+TensorCreateInfo ci(shapes, ndims, dtype);
+ci.set_initial_value(initial_value);
+args.add_output(ci);
+```
+
+**Mechanism**:
+
+1. `ci.set_initial_value(value)` marks the create-info with an initial value before submission
+2. `add_output(ci)` stores a pointer to `ci` in `L0TaskArgs` (the original must remain valid until submit)
+3. During payload init, the output tensor is materialized via `init_from_create_info()` which triggers the fill
+4. Fill strategy:
+   - Small buffer (< 64 B): element-by-element memcpy directly into dst
+   - Large buffer (≥ 64 B): fill the first 64 bytes as a template block, then bulk-memcpy in 64 B chunks; partial tail copy for remainder
+
+**Constraint**: existing tensors are write targets only through `add_inout()`.
+
+## 5. Scalar Dependencies via 1-Element Tensors
+
+Traditional scalars (`L0TaskArgs::add_scalar`) are one-way inputs with no TensorMap tracking. For cross-task scalar values, use a 1-element tensor as the carrier:
+
+```cpp
+uint32_t shapes[1] = {1};
+TensorCreateInfo scalar_ci(shapes, 1, DataType::FLOAT32);
+
+// Submit with initial value and keep the returned tensor
+scalar_ci.set_initial_value(float_to_u64(77.0f));
+L0TaskArgs args;
+args.add_output(scalar_ci);
+TaskOutputTensors outs = rt_submit_aiv_task(FUNC_NOOP, args);
+const Tensor& scalar_tensor = outs.get_ref(0);
+
+// Orchestration-side blocking read (waits for kernel completion)
+uint32_t idx[1] = {0};
+float val = get_tensor_data<float>(scalar_tensor, 1, idx);
+```
+
+**Advantage**: Fully reuses existing TensorMap (producer tracking, fanin/fanout dependencies) — no new infrastructure needed.
+
+## 6. Data Hazard Analysis
+
+Three actors:
+
+- **Kernel**: InCore task submitted via add_input/add_output/add_inout (asynchronous execution)
+- **Orch Read**: orchestration calls `get_tensor_data` (blocking read)
+- **Orch Write**: orchestration calls `set_tensor_data` (blocking write)
+
+### Hazard Matrix (earlier operation → later operation)
+
+| # | Earlier Op | Later Op | Hazard | Guarantee | Safe? |
+| - | ---------- | -------- | ------ | --------- | ----- |
+| 1 | Kernel write (OUTPUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes |
+| 2 | Kernel write (OUTPUT) | Orch Write | WAW | spin-wait producer COMPLETED | Yes |
+| 3 | Kernel read (INPUT) | Orch Write | WAR | spin-wait fanout_refcount | **Needs INOUT** |
+| 4 | Kernel read-write (INOUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes |
+| 5 | Kernel read-write (INOUT) | Orch Write | WAW+WAR | spin-wait producer + consumers | Yes |
+| 6 | Orch Write | Kernel read (INPUT) | RAW | blocking completes before next submit | Yes |
+| 7 | Orch Write | Kernel write (OUTPUT) | WAW | same — serial guarantee | Yes |
+| 8 | Orch Read | Kernel write (OUTPUT) | WAR | same — serial guarantee | Yes |
+| 9–12 | Orch ↔ Orch | — | — | same-thread serial execution | Yes |
+
+### Key Design Points
+
+**Scenario #3 is the only case requiring special attention**:
+
+TensorMap tracks only producers (OUTPUT/INOUT), not pure INPUT consumers. If a tensor is only registered via `add_input()`, TensorMap has no producer entry for it. `set_tensor_data`'s `wait_for_tensor_ready()` finds no matching producer (the lookup callback never fires) and returns immediately — but the kernel may still be reading → **WAR data race**.
+
+**Solution**: For tensors that may later be written via `set_tensor_data`, use `add_inout()` instead of `add_input()`. INOUT registers a producer entry in TensorMap, enabling `set_tensor_data` to track all consumers through `fanout_refcount`.
+
+**Scenarios #6–8 serial guarantee**:
+
+get/set_tensor_data are blocking calls, and orchestration is single-threaded serial submission. After a blocking operation completes, subsequent code (including task submissions) executes strictly afterward.
+
+## 7. External Tensor Behavior
+
+`make_tensor_external()` creates tensors with a pre-set `buffer.addr` (pointing to host-allocated device memory).
+
+| Scenario | Behavior |
+| -------- | -------- |
+| External tensor never submitted as OUTPUT/INOUT | No TensorMap entry — get/set execute immediately |
+| External tensor previously submitted as OUTPUT/INOUT | TensorMap has producer entry — get/set spin-wait |
+| External tensor submitted as INPUT, then set_tensor_data | **WAR risk** — must use INOUT instead (same as scenario #3) |
+
+**Key rule**: If an external tensor will later be written via `set_tensor_data`, all prior kernel accesses must use `add_inout()`, not `add_input()`.
diff --git a/src/a2a3/runtime/host_build_graph/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/host_build_graph/docs/SUBMIT_BY_CLUSTER.md
new file mode 100644
index 000000000..71debb93e
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/docs/SUBMIT_BY_CLUSTER.md
@@ -0,0 +1,222 @@
+# Submit by Cluster - Requirements and Main-Branch-Aligned Design
+
+## 1. Goal
+
+Define a single, main-branch-aligned specification for PTO2 cluster submission that combines:
+
+1. Product requirements (what must be true).
+2. Runtime design (how it is implemented on current main baseline).
+
+The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular.
+
+## 2. Background and Motivation
+
+Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`).
+The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels.
+
+Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster.
+
+## 3. Scope
+
+### In Scope
+
+1. New orchestration-facing submit API for cluster-aware mixed submission.
+2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit.
+3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity.
+4. AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets).
+
+### Out of Scope
+
+1. User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs).
+2. New worker types beyond AIC/AIV.
+3. Cross-cluster user placement policies.
+4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster.
+
+## 4. Main-Branch Baseline Constraints
+
+Design must preserve the current main runtime architecture:
+
+1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
+2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).
+
+## 5. Terminology
+
+1. `cluster`: one physical unit with `1 AIC + 2 AIV`.
+2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots.
+3. `MixedTask`: one runtime graph node created by one submit call.
+4. `active_mask`: bitmask of active subtask slots.
+5. `resource shape`: normalized lane demand class of a mixed task.
+
+## 6. API Contract
+
+```cpp
+inline constexpr int32_t INVALID_KERNEL_ID = -1;
+
+struct MixedKernels {
+    int32_t aic_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
+};
+
+static inline void rt_submit_task(PTO2Runtime* rt,
+                                  const MixedKernels& mixed_kernels,
+                                  Arg* args,
+                                  int32_t num_args);
+
+static inline void rt_submit_aic_task(PTO2Runtime* rt,
+                                      int32_t kernel_id,
+                                      Arg* args,
+                                      int32_t num_args);
+
+static inline void rt_submit_aiv_task(PTO2Runtime* rt,
+                                      int32_t kernel_id,
+                                      Arg* args,
+                                      int32_t num_args);
+```
+
+Rules:
+
+1. One submit call creates one `MixedTask`.
+2. All active slots share the same `args` and `num_args`.
+3. At least one slot must be active.
+4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent.
+5. Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries.
+6. Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers.
+7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API.
+
+## 7. Data Model (Requirements + Design)
+
+`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state:
+
+1. `task_id`
+2. `active_mask`
+3. `completed_subtasks` (atomic counter, incremented per subtask completion)
+4. `kernel_id[3]` for `(AIC, AIV0, AIV1)`
+5. dependency heads/counters and packed-buffer metadata
+
+`PTO2TaskPayload` (cold path) carries:
+
+1. shared args/tensors/scalars copied once per mixed submit
+2. fanin mixed-task IDs
+3. other cold-path submit metadata
+
+Producer identity in TensorMap is mixed-task ID end-to-end.
+
+## 8. Scheduling Model
+
+### 8.1 Resource Shapes
+
+Runtime uses shape-based ready queues (not worker-type queues):
+
+1. `AIC_ONLY`
+2. `AIV_X1`
+3. `AIV_X2`
+4. `AIC_AIV_X1`
+5. `AIC_AIV_X2`
+
+Queueing key is normalized resource shape (not raw slot label).
+
+### 8.2 Atomic Cluster Dispatch
+
+1. Dispatch decision unit is one mixed task.
+2. For multi-slot mixed tasks, partial launch is forbidden.
+3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes.
+4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes.
+
+### 8.3 Dependency and Completion
+
+1. Fanin release/readiness remains dependency-correct and graph-level.
+2. Two-stage completion:
+   - `on_subtask_complete(task_id, subslot)`
+   - `on_task_complete(task_id)` only when `completed_subtasks == total_required_subtasks`
+3. Downstream release is triggered once per mixed task completion, not once per subslot.
+
+## 9. Executor Ownership and Numbering
+
+### 9.1 Canonical Flattened Numbering (Unchanged)
+
+Given `block_dim` clusters:
+
+1. AIC IDs: `[0, block_dim)`
+2. AIV IDs: `[block_dim, 3 * block_dim)`
+3. Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}`
+
+This project-defined flattened numbering is kept unchanged.
+
+### 9.2 Cluster Ownership
+
+1. One cluster must be owned by one scheduler domain/thread at a time.
+2. No split-cluster ownership in either:
+   - initial `assign_cores_to_threads()`
+   - post-orchestrator `reassign_cores_for_all_threads()`
+3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
+
+## 10. Functional Requirements
+
+### 10.1 Valid Mixed Shapes
+
+1. AIC only
+2. AIV only (1 or 2 AIV lanes)
+3. AIC + 1 AIV
+4. AIC + 2 AIV
+
+### 10.2 Runtime Behavior per Submit
+
+1. Validate submit arguments.
+2. Allocate mixed-task ID and initialize descriptor/payload/slot_state once.
+3. Lookup producers via TensorMap; collect fanin metadata and increment producers' `fanout_count`.
+4. Wire fanout edges and determine readiness. Under host_build_graph (host-orch) this is done **inline in submit** via `wire_task`; the inherited tensormap (device-orch) path instead pushes the task to the scheduler's wiring queue, which scheduler thread 0 drains asynchronously.
+5. Dispatch all active lanes atomically when resources allow.
+6. Aggregate completion and release downstream once.
+
+## 11. Non-Functional Requirements
+
+1. Correctness: no dependency violation, no partial mixed-task dispatch.
+2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent.
+3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required.
+4. Performance: no obvious regression for non-cluster workflows.
+5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete.
+
+## 12. Acceptance Criteria
+
+Feature is accepted when:
+
+1. Orchestration compiles and submits via `MixedKernels` API/wrappers.
+2. Scheduler dispatches each mixed task as one cluster scheduling decision.
+3. Dependencies gate mixed-task readiness correctly.
+4. AIV execution remains cluster-local and semantically equivalent across lanes.
+5. Existing non-cluster workflows continue to pass without behavior regression.
+6. Cluster ownership is never split across scheduler domains before/after transition.
+
+## 13. Verification Matrix
+
+Recommended validation coverage:
+
+1. Mapping correctness for cluster-to-core ID relation.
+2. Atomic dispatch for multi-slot shapes.
+3. Dependency gating and completion aggregation (`completed_subtasks == total_required_subtasks`).
+4. Lane-occupancy co-residency behavior for compatible shapes.
+5. Core-transition ownership stability.
+6. Invalid submit handling (`always_assert` path).
+7. Regression coverage for existing examples/tests.
+
+Milestone command (device):
+
+```bash
+python tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py \
+  -p a2a3 -d 9
+```
+
+Final validation:
+
+```bash
+pytest examples tests/st --platform a2a3
+```
+
+## 14. Resolved Decisions
+
+1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract.
+2. Invalid mixed submits fail with existing submit-time assert behavior.
+3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant.
+4. Submit-contract types live in one shared header-only surface.
+5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee.
diff --git a/src/a2a3/runtime/host_build_graph/docs/device_log_profiling.md b/src/a2a3/runtime/host_build_graph/docs/device_log_profiling.md
new file mode 100644
index 000000000..783132fd5
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/docs/device_log_profiling.md
@@ -0,0 +1,175 @@
+# PTO2 Device Log Profiling Guide
+
+## How to Find Device Logs
+
+AICPU logs (via `LOG_INFO_V9`) are written by CANN's **dlog** subsystem and do **not** appear in the `python test_*.py` / pytest terminal output. They are written to CANN's device log directory:
+
+```text
+$HOME/ascend/log/debug/device-<device_id>/device-<pid>_<timestamp>.log
+```
+
+Each run produces a new log file (or appends to an existing one). Find the most recent file by modification time:
+
+```bash
+ls -lt $HOME/ascend/log/debug/device-<device_id>/ | head -5
+```
+
+## Log Structure Overview
+
+A single run produces two profiling blocks in the device log:
+
+| Block | Emitted by | Function | Content |
+| ----- | ---------- | -------- | ------- |
+| **Orchestrator Profiling** | Thread 3 (orchestrator) | `aicpu_orchestration_entry` | Time breakdown of graph construction on device |
+| **PTO2 Scheduler Summary** | Threads 0/1/2 (schedulers) | `SchedulerContext::resolve_and_dispatch` | Per-thread scheduling statistics, phase timing, and lock contention |
+
+All timing values are in microseconds (us), converted from AICPU cycle counters.
+
+> **host_build_graph (host-orch) note.** The **Orchestrator Profiling** block
+> below appears in the **device** log only in the device-orchestration mode
+> that `tensormap_and_ringbuffer` runs. In host_build_graph the orchestrator
+> runs on the **host** and the device boots scheduler-only —
+> `aicpu_executor.cpp` carries no on-device orchestrator path — so the device
+> log for a host_build_graph run contains
+> **only Block 2 (the Scheduler Summary)**. Orchestrator graph-construction
+> timing for host_build_graph is a host-side measurement, not a device-log line.
+
+---
+
+## Block 1: Orchestrator Profiling
+
+Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_entry`, and prints a profiling summary after it returns.
+
+### Example (from a real run: batch=64, 16704 tasks)
+
+```text
+Thread 3: Calling aicpu_orchestration_entry from SO
+Thread 3: aicpu_orchestration_entry returned, cost 20943.940us
+Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us ===
+Thread 3:   sync_tensormap : 286.300us (2.0%)
+Thread 3:   task_ring_alloc: 380.400us (2.6%)
+Thread 3:   param_copy     : 2147.800us (14.7%)
+Thread 3:   lookup+dep     : 7290.300us (49.9%)
+Thread 3:   heap_alloc     : 701.500us (4.8%)
+Thread 3:   tensormap_ins  : 1890.380us (12.9%)
+Thread 3:   fanin+ready    : 1207.400us (8.3%)
+Thread 3:   finalize+SM    : 697.500us (4.8%)
+Thread 3:   scope_end      : 364.080us
+Thread 3:   avg/task       : 0.874us
+Thread 3: PTO2 total submitted tasks = 16704
+```
+
+### Field Reference
+
+| Field | Source (`pto_orchestrator.cpp`) | Description |
+| ----- | ------------------------------- | ----------- |
+| **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead |
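+| **total** | Sum of the percentage-bearing sub-steps (`sync_tensormap` through `finalize+SM`; `scope_end` is reported separately and is not included) | Accumulated time inside `submit_task` across all tasks |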
+| **sync_tensormap** | `g_orch_sync_cycle` | TensorMap validity sync and optional cleanup before each submission |
+| **task_ring_alloc** | `g_orch_alloc_cycle` | Allocating a task slot from the task ring buffer |
+| **param_copy** | `g_orch_args_cycle` | Copying param descriptors + tensor descriptor copies into task-owned storage |
+| **lookup+dep** | `g_orch_lookup_cycle` | TensorMap lookup for inputs/inouts + building fanin/fanout dependency edges |
+| **heap_alloc** | `g_orch_heap_cycle` | Allocating packed output buffers from the heap ring |
+| **tensormap_ins** | `g_orch_insert_cycle` | Inserting output/inout tensors into the TensorMap |
+| **fanin+ready** | `g_orch_fanin_cycle` | Building the fanin list + checking if task is already ready (Step 5/5b) |
+| **scope_end** | `g_orch_scope_end_cycle` | `end_scope` overhead (notifying scheduler of scope completion) |
+| **avg/task** | `total / submit_count` | Average orchestrator time per task submission |
+
+### Interpreting the Numbers
+
+- **cost > total**: The difference is overhead outside `submit_task` (the orchestration user code itself, scope_begin/end, TensorCreateInfo construction, etc.).
+- **lookup+dep** is typically the dominant cost (~50%) because it involves TensorMap hash lookups and building dependency edges with spinlock-protected fanout list insertions.
+- **param_copy** scales with the number of parameters per task.
+- **avg/task < 1us** indicates efficient graph construction.
+
+---
+
+## Block 2: PTO2 Scheduler Summary
+
+Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**.
+
+### Example (Thread 0, from a different run: batch=1, 1044 tasks)
+
+```text
+Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop)
+Thread 0: --- Phase Breakdown ---
+Thread 0:   complete:    1485.020us (42.7%)
+Thread 0:   scan:        14.400us (0.4%)
+Thread 0:   dispatch:    1973.060us (56.7%)
+Thread 0:   idle:        4.940us (0.1%)
+```
+
+### Summary Line
+
+```text
+Thread N: completed=X tasks in Yus (Z loops, W tasks/loop)
+```
+
+| Field | Description |
+| ----- | ----------- |
+| **completed** | Number of tasks this thread processed to completion |
+| **Y us** | Total scheduler loop time (sum of all phase cycles) |
+| **Z loops** | Number of scheduler loop iterations |
+| **W tasks/loop** | Average tasks completed per loop iteration; higher = better throughput |
+
+### Phase Breakdown
+
+The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations.
+
+| Phase | What it does | Inline stats |
+| ----- | ------------ | ------------ |
+| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(task_id, subslot)` to increment the completion counter; when `completed_subtasks == total_required_subtasks`, triggers `on_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release |
+| **scan** | Updates the perf profiling header with latest scheduler state | — |
+| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) |
+| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — |
+
+**Interpreting phase percentages:**
+
+- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`).
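+- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement). Note that host_build_graph performs no execution-time reclaim, so the ring-pointer-advancement step applies only to `tensormap_and_ringbuffer`.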
+- **scan** is small (<1%) — only updates the perf header.
+- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved.
+
+**Interpreting pop hit_rate:**
+
+- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient.
+- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself.
+
+### Per-Task Averages
+
+Divide each thread's phase times by its `completed` count to get per-task scheduling cost:
+
+| Metric | Formula | Typical value |
+| ------ | ------- | ------------- |
+| Scheduling overhead per task | total_time / completed | ~5-10 us/task |
+| Dispatch per task | dispatch_time / completed | ~3-6 us/task |
+| Complete per task | complete_time / completed | ~2-4 us/task |
+
+---
+
+## Cross-Referencing with Host Profiling
+
+When `--enable-l2-swimlane` is used, the host terminal prints a **Task Statistics by Function** table with `Total_Exec` (total AICore kernel execution time). Combined with device log data:
+
+| Metric | Source | Description |
+| ------ | ------ | ----------- |
+| Avg kernel exec time | `Total_Exec / total_tasks` (host) | Time AICore spends executing each kernel |
+| Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task |
+| Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution |
+
+A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow.
+
+---
+
+## Quick Reference: Extracting Profiling Data
+
+```bash
+# Find the latest device log for device 2
+ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1
+
+# Extract orchestrator profiling (Thread 3)
+grep "Thread 3:" <logfile>
+
+# Extract scheduler profiling (Threads 0/1/2)
+grep -E "Thread [012]:" <logfile>
+```
diff --git a/src/a2a3/runtime/host_build_graph/docs/profiling_levels.md b/src/a2a3/runtime/host_build_graph/docs/profiling_levels.md
new file mode 100644
index 000000000..dd825273a
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/docs/profiling_levels.md
@@ -0,0 +1,492 @@
+# PTO Runtime2 Profiling Levels
+
+This document describes the profiling macro hierarchy and logging control in the PTO Runtime2 system.
+
+## Overview
+
+PTO Runtime2 uses a hierarchical profiling system with compile-time macros to control profiling code compilation and log output. The `enable_l2_swimlane` runtime flag (integer perf_level 0–4) controls data collection granularity (performance buffers, shared memory writes) but does NOT control log output.
+
+> **host_build_graph (host-orch) note.** The profiling **macros** below
+> (`PTO2_PROFILING`, `PTO2_ORCH_PROFILING`, …) are shared with
+> `tensormap_and_ringbuffer`. But the orchestrator-timing **device-log lines**
+> (`orch_start` / `orch_end` / `orch_cost` / `orch_stage_end`) and the
+> device-log line-count formulas that include `N_orch` describe the
+> **device-orch** case that `tensormap_and_ringbuffer` runs. In
+> host_build_graph the orchestrator runs on the **host** and the device boots
+> scheduler-only — `aicpu_executor.cpp` carries no on-device orchestrator path
+> at all — so those orch-timing lines do **not** appear in the device log; only
+> the scheduler-timing lines do. Orchestrator profiling for host_build_graph is
+> a host-side measurement.
+
+## Profiling Macro Hierarchy
+
+Defaults and dependency validation are centralized in
+`src/common/task_interface/profiling_config.h`. Runtime headers include that
+file before using the macros, so both a2a3 and a5 share the same default
+values and compile-time checks.
+
+```text
+PTO2_PROFILING (base level, default=1)
+├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1)
+│   └── PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_ORCH_PROFILING=1)
+├── PTO2_SCHED_PROFILING (scheduler, default=0, requires PTO2_PROFILING=1)
+└── --enable-l2-swimlane [PERF_LEVEL] (L2 swimlane data collection, 0-4, bare=4, requires PTO2_PROFILING=1)
+
+```
+
+### Compile-Time Validation
+
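+Each sub-level macro requires its parent macro to be enabled (`PTO2_PROFILING=1`, or `PTO2_ORCH_PROFILING=1` in the case of `PTO2_TENSORMAP_PROFILING`):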
+
+```cpp
+#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
+#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
+#endif
+
+#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
+#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
+#endif
+
+#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING
+#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1"
+#endif
+```
+
+## Profiling Levels
+
+### Level 0: No Profiling (PTO2_PROFILING=0)
+
+**What's compiled:**
+
+- Debug/diagnostic logs (always present)
+- Progress tracking (`PTO2 progress: completed=...`)
+- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget)
+- Deadlock/livelock detection (`diagnose_stuck_state`, called on stall)
+
+**What's NOT compiled:**
+
+- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters)
+- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING`
+- Performance data collection paths (`enable_l2_swimlane` runtime flag becomes ineffective because profiling code is not compiled)
+
+**Log output (normal run, no stall):**
+
+- No `sched_start/sched_end/sched_cost` timestamps
+- No `orch_start/orch_end/orch_cost` timestamps
+- No `Scheduler summary: total_time=...`
+- No `PTO2 total submitted tasks` log
+- `PTO2 progress: completed=... total=...` may appear (thread 0 only, at task completion milestones)
+
+---
+
+### Level 1: Basic Profiling (PTO2_PROFILING=1)
+
+**What's compiled:**
+
+- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
+- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
+- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
+- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
+- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
+- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
+
+**What's NOT compiled:**
+
+- Detailed phase breakdowns
+- TensorMap statistics
+
+**Log output (additional lines vs Level 0, per normal run):**
+
+- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
+- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
+- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
+- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
+- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
+- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
+
+**LOG_INFO_V9 count (normal run):**
+
+- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
+- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
+
+> See the table at the end for concrete counts based on the `paged_attention` example.
+
+**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
+
+```text
+Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
+Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us
+PTO2 total submitted tasks = 13, already executed 13 tasks
+Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us
+Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6
+Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us
+Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
+```
+
+**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):
+
+```text
+Thread 3: orch_stage_end=48236915058307
+Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
+Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
+PTO2 total submitted tasks = 13, already executed 13 tasks
+Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
+Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
+Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
+Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
+```
+
+> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).
+
+**Note:**
+
+- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
+- `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
+- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.
+
+---
+
+### Level 2: Scheduler Detailed Profiling (PTO2_SCHED_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 1 features
+- Detailed scheduler phase counters
+- Phase-specific statistics (complete, scan, dispatch, idle)
+- Hit rate tracking (complete poll, ready queue pop)
+
+**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced)
+
+- Replaces scheduler summary with detailed breakdown
+
+**Scheduler output:**
+
+```text
+Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks ===
+Thread X:   complete       : XXXus (XX.X%)
+Thread X:     poll         : XXXus (XX.X%)  hit=XXX, miss=XXX, hit_rate=XX.X%
+Thread X:     otc_lock     : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     otc_fanout   : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     otc_fanin    : XXXus (XX.X%)  atomics=XXX
+Thread X:     otc_self     : XXXus (XX.X%)  atomics=XXX
+Thread X:     perf         : XXXus (XX.X%)
+Thread X:   dispatch       : XXXus (XX.X%)
+Thread X:     poll         : XXXus (XX.X%)
+Thread X:     pop          : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:     setup        : XXXus (XX.X%)
+Thread X:   scan           : XXXus (XX.X%)
+Thread X:   idle           : XXXus (XX.X%)
+Thread X:   avg/complete   : XXXus
+Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
+```
+
+Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
+stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json`
+captured at l2_swimlane_level >= 3) and `deps.json`; consume them via
+`simpler_setup/tools/sched_overhead_analysis.py`.
+
+---
+
+### Level 3: Orchestrator Detailed Profiling (PTO2_ORCH_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 1 features
+- Detailed orchestrator phase counters
+- Per-phase cycle tracking
+- Atomic operation counters
+- Wait time tracking
+
+**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced)
+
+- Replaces basic orchestration completion with detailed breakdown
+
+**Orchestrator output:**
+
+```text
+Thread X: === Orchestrator Profiling: XXX tasks, total=XXXus ===
+Thread X:   sync_tensormap : XXXus (XX.X%)
+Thread X:   task_ring_alloc: XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   param_copy     : XXXus (XX.X%)  atomics=XXX
+Thread X:   lookup+dep     : XXXus (XX.X%)
+Thread X:   heap_alloc     : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   tensormap_ins  : XXXus (XX.X%)
+Thread X:   fanin+ready    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   finalize+SM    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
+Thread X:   scope_end      : XXXus  atomics=XXX
+Thread X:   avg/task       : XXXus
+```
+
+**Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of the `enable_l2_swimlane` flag.
+
+---
+
+### Level 4: TensorMap Profiling (PTO2_TENSORMAP_PROFILING=1)
+
+**Requires:** `PTO2_PROFILING=1` AND `PTO2_ORCH_PROFILING=1`
+
+**What's compiled:**
+
+- All Level 3 features
+- TensorMap lookup statistics
+- Hash chain walk tracking
+- Overlap check counters
+
+**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap)
+
+**TensorMap output:**
+
+```text
+Thread X: === TensorMap Lookup Stats ===
+Thread X:   lookups        : XXX, inserts: XXX
+Thread X:   chain walked   : total=XXX, avg=X.X, max=X
+Thread X:   overlap checks : XXX, hits=XXX (XX.X%)
+```
+
+---
+
+## Runtime Flag: enable_l2_swimlane (perf_level)
+
+`--enable-l2-swimlane` accepts an integer perf_level (0–4). Transport
+mirrors the PMU pattern — two independent channels (one binary, one int):
+
+- **Binary on/off** — `KernelArgs::enable_profiling_flag` bit1
+  (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read
+  by AICore (which only needs on/off to decide whether to write timing) and
+  by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`.
+- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level`
+  (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU
+  promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via
+  `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for
+  `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
+
+On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled`
+entry point; the granular level still goes through the shared-memory
+header, just as it does onboard.
+
+| Level | Collects |
+| ----- | -------- |
+| 0 | Nothing (disabled) |
+| 1 | AICore timing only (start/end/task_token_raw) — AICPU `complete_task` is bypassed |
+| 2 | + AICPU dispatch_time, finish_time |
+| 3 | + Scheduler phases (`SCHED_*`) |
+| 4 | + Orchestrator phases (full) |
+
+At level 1 the AICore record carries the full PTO2 `task_token_raw`
+(`(ring_id << 32) | local_id`), read straight from
+`LocalContext.async_ctx.task_token.raw` inside the AICore helper —
+already in cache from the dispatch payload, so no extra GM load.
+Identity fields the AICPU side used to write at level 1 (`func_id`,
+`core_type`) are derived host-side:
+
+- `func_id` ← `deps.json`'s per-task `kernel_ids[]`, joined by
+  `task_id` at post-process by `swimlane_converter.py`. Same model
+  `fanout` already uses.
+- `core_type` ← per-core static table published by the host into the
+  collector (`L2SwimlaneCollector::set_core_types`).
+
+AICore buffer rotation no longer piggy-backs on `complete_task`. AICPU
+counts dispatches per core in the dispatch path (scheduler_dispatch in
+tensormap_and_ringbuffer; aicpu_executor in host_build_graph) and rotates
+the AICore buffer when the count is about to cross a
+`PLATFORM_AICORE_BUFFER_SIZE` boundary — strictly before
+`write_reg(DATA_MAIN_BASE)` for the first task of the new batch. The
+hook is `l2_swimlane_aicpu_on_aicore_dispatch`. No AICore-side signal is
+needed: AICPU has full dispatch visibility on its own. Race safety comes
+from the completion-before-dispatch invariant (each AICore is single-threaded
+and AICPU does not dispatch task K+1 until task K has FIN'd), which
+guarantees AICore has FIN'd — and `dcci`'d out — every record in the old
+buffer by rotation time. This decoupling is what lets level 1 skip
+`complete_task` without losing rotations.
+
+Fanout edges are no longer carried on the device hot path — `swimlane_converter.py`
+joins them from the sibling `deps.json` (produced by dep_gen) at post-process time.
+
+Bare `--enable-l2-swimlane` = level 4 (backward compatible).
+
+### Level gating in AICPU code
+
+Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the
+content it depends on instead of relying on magic numbers:
+
+```cpp
+// Any level > 0: AICPU task record buffer init / flush.
+// Cheap binary check, available immediately after kernel entry.
+if (is_l2_swimlane_enabled()) { ... }
+
+// AICPU dispatch/finish timestamps.
+// Granular checks below require l2_swimlane_aicpu_init to have already run
+// (so the level has been promoted from the shared-memory header).
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... }
+
+// Scheduler main-loop phase records (SCHED_*)
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... }
+
+// Orchestrator phase records
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... }
+```
+
+`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with
+underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level`
+shared-memory field and mirrors `PmuEventType : uint32_t`):
+
+| Enumerator | Underlying value |
+| ---------- | ---------------- |
+| `DISABLED` | 0 |
+| `AICORE_TIMING` | 1 |
+| `AICPU_TIMING` | 2 |
+| `SCHED_PHASES` | 3 |
+| `ORCH_PHASES` | 4 |
+
+### When enable_l2_swimlane=0
+
+- No performance data collection
+- No shared memory writes
+- Logs still print (controlled by macros only)
+
+---
+
+## Common Profiling Configurations
+
+### Development (minimal overhead)
+
+```bash
+# No profiling overhead
+PTO2_PROFILING=0
+```
+
+### Basic Performance Monitoring
+
+```bash
+# Minimal overhead, summary logs only
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=0
+PTO2_SCHED_PROFILING=0
+```
+
+### Scheduler Performance Analysis
+
+```bash
+# Detailed scheduler breakdown
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=0
+PTO2_SCHED_PROFILING=1
+```
+
+### Orchestrator Performance Analysis
+
+```bash
+# Detailed orchestrator breakdown
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=1
+PTO2_SCHED_PROFILING=0
+```
+
+### Full Profiling (maximum overhead)
+
+```bash
+# All profiling features enabled
+PTO2_PROFILING=1
+PTO2_ORCH_PROFILING=1
+PTO2_SCHED_PROFILING=1
+PTO2_TENSORMAP_PROFILING=1
+```
+
+---
+
+## Setting Profiling Macros
+
+### At compile time
+
+Pass compile definitions through the build command or CI `CXXFLAGS`.
+This overrides the defaults in `profiling_config.h` without changing source.
+
+```bash
+# Example: disable all profiling code
+CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e .
+
+# Example: enable orchestrator and tensormap profiling
+CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \
+    pip install --no-build-isolation -e .
+```
+
+### In source code (before including headers)
+
+Source-level overrides are only for local experiments. They must appear before
+any header that includes `profiling_config.h`; do not add duplicated fallback
+definitions to runtime headers.
+
+```cpp
+#define PTO2_PROFILING 1
+#define PTO2_ORCH_PROFILING 1
+#include "pto_runtime2_types.h"
+```
+
+---
+
+## Log Output Summary
+
+> Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).
+
+| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
+| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
+| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
+| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
+| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
+| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
+
+---
+
+## Implementation Notes
+
+### Key Principles
+
+1. **Macros control compilation and logging**
+   - `#if PTO2_PROFILING` controls whether profiling code is compiled
+   - Logs print when macro is enabled, regardless of runtime flag
+
+2. **Runtime flag controls data collection**
+   - `enable_l2_swimlane` controls performance buffer allocation
+   - Controls shared memory writes for host-side export
+   - Does NOT control log output
+
+3. **Consistent behavior across components**
+   - Scheduler logs: macro-controlled only
+   - Orchestrator logs: macro-controlled only
+   - Data collection: runtime flag controlled
+
+### Code Locations
+
+- Macro defaults and validation: `src/common/task_interface/profiling_config.h`
+- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
+- Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
+- TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`
+
+---
+
+## Performance Impact
+
+### Compilation overhead
+
+- Level 0: No overhead
+- Level 1: Minimal (counter increments, basic arithmetic)
+- Level 2-4: Low to moderate (additional counters, cycle measurements)
+
+### Runtime overhead
+
+- Logging: Negligible (device logs are asynchronous)
+- Data collection (`enable_l2_swimlane>0`): Low to moderate
+  - Performance buffer writes
+  - Shared memory updates
+  - Per-task timing measurements
+
+### Recommendation
+
+- Use Level 0 for production
+- Use Level 1-2 for performance monitoring
+- Use Level 3-4 for detailed performance analysis only
diff --git a/src/a2a3/runtime/host_build_graph/host/dep_gen_replay.cpp b/src/a2a3/runtime/host_build_graph/host/dep_gen_replay.cpp
new file mode 100644
index 000000000..f554c03c0
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/host/dep_gen_replay.cpp
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file dep_gen_replay.cpp
+ * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor
+ *        representation, tensor-annotated) via a host-resident PTO2TensorMap,
+ *        with a differential check against the runtime template `compute_task_fanin`.
+ *
+ * Two passes run per record against two parallel PTO2TensorMap instances that
+ * evolve in lockstep:
+ *
+ *   ORACLE pass (read-only contract):
+ *     Drives `compute_task_fanin` (the same template the device orchestrator
+ *     uses in pto_orchestrator.cpp:submit_task) against `tm_oracle`. Emits
+ *     only PTO2TaskId values — the canonical set of producer IDs the runtime
+ *     would have wired. We never widen this template's emit signature: this
+ *     pass IS the contract, and any future change to `compute_task_fanin`
+ *     automatically refreshes the oracle.
+ *
+ *   ANNOT pass (this file's feature):
+ *     Inlines the same STEP A (creator retention) + STEP B (tensormap lookup)
+ *     against `tm_annot`, but the callback fires with the full
+ *     `PTO2TensorMapEntry&` + the consumer Tensor* + the arg index, so the
+ *     replay can record per-edge tensor metadata (producer/consumer
+ *     shape/offset, dtype, version).
+ *
+ * After both passes finish per record, we compare the producer-ID set the
+ * oracle emitted to the producer-ID set the annot pass emitted. They MUST
+ * match. If they diverge, deps.json is not written and the function returns
+ * non-zero — this is the "no shotgun modifications" guarantee: anyone who
+ * changes `compute_task_fanin` will trip this gate immediately and know to
+ * mirror the change in the annot pass.
+ *
+ * STEP 1 (explicit_deps) is emitted at the call site (per pto_dep_compute.h's
+ * "kept at call site" note); both passes run the same explicit-deps loop, so
+ * the comparison covers it too.
+ *
+ * STEP 4 (`register_task_outputs`) runs on BOTH tensor maps after both passes
+ * complete, keeping `tm_oracle` and `tm_annot` bit-equivalent for the next
+ * record's INOUT+COVERED `remove_entry` mutations.
+ *
+ * Pool sizing: replay never advances last_task_alive, so each tensor map's
+ * entry pool must accommodate every output write across the whole trace. We
+ * scan the record buffer once to count INOUT + OUTPUT_EXISTING slots and size
+ * the pool accordingly. Both maps get the same size.
+ */
+
+#include "dep_gen_replay.h"
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/dep_gen.h"
+#include "common/unified_log.h"
+#include "data_type.h"
+#include "pto_dep_compute.h"
+#include "pto_task_id.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+int32_t ceil_pow2(int32_t v) {
+    // Smallest power of two >= v (1 for v <= 1): smear the top set bit of
+    // v - 1 into every lower position, then add one.
+    if (v <= 1) return 1;
+    uint32_t bits = static_cast<uint32_t>(v) - 1u;
+    for (uint32_t shift = 1; shift <= 16; shift <<= 1) {
+        bits |= bits >> shift;
+    }
+    return static_cast<int32_t>(bits + 1u);
+}
+
+// Count INOUT + OUTPUT_EXISTING slots across the record buffer —
+// register_task_outputs only inserts those, and skips entries with manual_dep
+// set. Ignoring manual_dep gives a conservative upper bound (it is rare, and
+// slight over-allocation beats pool exhaustion). tensor_count is clamped like
+// the replay loop does, so a corrupted record cannot over-read arg_types.
+int32_t count_outputs(const DepGenRecord *records, size_t n) {
+    int32_t total = 0;
+    for (size_t i = 0; i < n; i++) {
+        const DepGenRecord &r = records[i];
+        // Overflow chain slots are reinterpret_cast views with no tensor data;
+        // their `tensor_count` bytes are actually the overflow `dep_count` field,
+        // which would mislead the loop below if read as a tensor count.
+        if (r.flags & DEP_GEN_FLAG_OVERFLOW) continue;
+        int32_t tc = static_cast<int32_t>(r.tensor_count);
+        if (tc > CORE_MAX_TENSOR_ARGS) tc = CORE_MAX_TENSOR_ARGS;
+        for (int32_t j = 0; j < tc; j++) {
+            auto t = static_cast<TensorArgType>(r.arg_types[j]);
+            total += (t == TensorArgType::INOUT || t == TensorArgType::OUTPUT_EXISTING) ? 1 : 0;
+        }
+    }
+    return total;
+}
+
+// ---------------------------------------------------------------------------
+// JSON output accumulators (in-memory tables that get serialized at the end)
+// ---------------------------------------------------------------------------
+
+// Edge categories — matches the three places a runtime fanin edge is born.
+enum class EdgeSource { EXPLICIT, CREATOR, TENSORMAP };
+
+const char *edge_source_str(EdgeSource s) {
+    switch (s) {
+    case EdgeSource::EXPLICIT:
+        return "explicit";
+    case EdgeSource::CREATOR:
+        return "creator";
+    case EdgeSource::TENSORMAP:
+        return "tensormap";
+    }
+    return "unknown";
+}
+
+const char *overlap_status_str(OverlapStatus s) {
+    switch (s) {
+    case OverlapStatus::COVERED:
+        return "covered";
+    case OverlapStatus::OTHER:
+        return "other";
+    case OverlapStatus::NO_OVERLAP:
+        return "no_overlap";
+    }
+    return "unknown";
+}
+
+// One annotated edge. consumer_* always populated. producer_* populated for
+// TENSORMAP source only — the explicit/creator emit paths don't have a
+// matched tensormap entry to copy from.
+//
+// Slice description follows the strided Tensor model: (start_offset, strides[])
+// in element units. Byte offset of element coords[] is
+//   (start_offset + Σ coords[i] · strides[i]) · dtype_bytes
+struct EdgeAnnot {
+    uint64_t pred;
+    uint64_t succ;
+    int32_t consumer_arg_idx;  // -1 for EXPLICIT (not tied to a tensor arg)
+    EdgeSource source;
+    OverlapStatus overlap;  // only meaningful for TENSORMAP
+    uint64_t tensor_id;     // 0 for EXPLICIT
+    // Consumer side (the Tensor the submitting task is reading).
+    uint8_t consumer_dtype;
+    uint32_t consumer_ndims;
+    uint32_t consumer_shape[MAX_TENSOR_DIMS];
+    uint64_t consumer_start_offset;  // 1D element offset
+    uint32_t consumer_strides[MAX_TENSOR_DIMS];
+    // Producer side (the slice the producer wrote, from the tensormap entry).
+    // Only populated when source == TENSORMAP.
+    uint32_t producer_ndims;
+    uint32_t producer_shape[MAX_TENSOR_DIMS];
+    uint64_t producer_start_offset;
+    uint32_t producer_strides[MAX_TENSOR_DIMS];
+};
+
+// One entry in the tensors[] table: the underlying storage, keyed by
+// (buffer_addr, version). buffer_numel is the storage element count;
+// per-edge fields describe the slice (start_offset + stride).
+struct TensorTableEntry {
+    uint64_t tensor_id;
+    uint64_t buffer_addr;
+    uint64_t buffer_numel;  // storage size in elements (= buffer.size / dtype_bytes)
+    int32_t version;
+    uint8_t dtype;
+};
+
+// One arg slot of a task, captured for the `tasks[].args[]` block so
+// downstream viewers can render per-task input / output compartments without
+// having to scan every edge. `has_tensor_info` is false only for OUTPUT slots:
+// the runtime hasn't materialized a Tensor yet at submit_task time, so the
+// captured blob is zeroed.
+struct TaskArgEntry {
+    int32_t idx;
+    TensorArgType arg_type;
+    bool has_tensor_info;
+    uint64_t tensor_id;
+    uint8_t dtype;
+    uint32_t ndims;
+    uint32_t shape[MAX_TENSOR_DIMS];
+    uint64_t start_offset;  // 1D element offset
+    uint32_t strides[MAX_TENSOR_DIMS];
+};
+
+struct TaskTableEntry {
+    uint64_t task_id;
+    bool in_manual_scope;
+    int32_t kernel_id[3];  // per-subslot {AIC, AIV0, AIV1}, -1 = inactive
+    uint32_t block_num;
+    std::vector<TaskArgEntry> args;
+};
+
+const char *arg_type_str(TensorArgType t) {
+    switch (t) {
+    case TensorArgType::INPUT:
+        return "INPUT";
+    case TensorArgType::OUTPUT:
+        return "OUTPUT";
+    case TensorArgType::INOUT:
+        return "INOUT";
+    case TensorArgType::OUTPUT_EXISTING:
+        return "OUTPUT_EXISTING";
+    }
+    return "UNKNOWN";
+}
+
+// FNV-1a 64-bit hash of (buffer_addr, version) — stable tensor identity
+// across runs (no time-dependent inputs).
+uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) {
+    constexpr uint64_t FNV_OFFSET = 0xcbf29ce484222325ULL;
+    constexpr uint64_t FNV_PRIME = 0x100000001b3ULL;
+    uint64_t digest = FNV_OFFSET;
+    // One FNV-1a round per byte: xor the byte in, then multiply by the prime.
+    // Bytes are taken in host memory order from the object representation.
+    auto absorb = [&digest](const void *src, size_t len) {
+        const uint8_t *bytes = static_cast<const uint8_t *>(src);
+        for (size_t k = 0; k < len; k++) {
+            digest = (digest ^ bytes[k]) * FNV_PRIME;
+        }
+    };
+    // version is hashed through its 32-bit unsigned representation.
+    const uint32_t version_bits = static_cast<uint32_t>(version);
+    absorb(&buffer_addr, sizeof(buffer_addr));
+    absorb(&version_bits, sizeof(version_bits));
+    return digest;
+}
+
+// Record a tensor's backing storage in the tensors[] table the first time its
+// (addr, version) pair is seen and return its id. buffer_numel is the storage
+// size in elements; slice geometry (start_offset, strides[]) lives on the
+// per-edge / per-arg records. Repeat sightings leave the table untouched.
+uint64_t register_tensor(
+    std::unordered_map<uint64_t, size_t> &index_by_id, std::vector<TensorTableEntry> &table, const Tensor &t
+) {
+    const uint64_t tensor_id = make_tensor_id(t.buffer.addr, t.version);
+    // emplace is a no-op when the id is already indexed; only a fresh insert
+    // appends a row at the position recorded in the index.
+    const bool first_sighting = index_by_id.emplace(tensor_id, table.size()).second;
+    if (first_sighting) {
+        const uint64_t bytes_per_elem = get_element_size(t.dtype);
+        TensorTableEntry row;
+        row.tensor_id = tensor_id;
+        row.buffer_addr = t.buffer.addr;
+        row.buffer_numel = (bytes_per_elem != 0) ? (t.buffer.size / bytes_per_elem) : 0;
+        row.version = t.version;
+        row.dtype = static_cast<uint8_t>(t.dtype);
+        table.push_back(row);
+    }
+    return tensor_id;
+}
+
+// Copy a Tensor's slice description (shape + start_offset + stride) into an EdgeAnnot's consumer_* fields.
+void fill_consumer(EdgeAnnot &e, const Tensor &t) {
+    e.consumer_dtype = static_cast<uint8_t>(t.dtype);
+    e.consumer_ndims = t.ndims;
+    if (e.consumer_ndims > MAX_TENSOR_DIMS) e.consumer_ndims = MAX_TENSOR_DIMS;  // writer emits this many entries
+    e.consumer_start_offset = t.start_offset;
+    for (uint32_t i = 0; i < e.consumer_ndims; i++) {
+        e.consumer_shape[i] = t.shapes[i];
+        e.consumer_strides[i] = t.strides[i];
+    }
+}
+
+// Copy a PTO2TensorMapEntry's slice description into an EdgeAnnot's producer_* fields (TENSORMAP emit path only).
+void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) {
+    e.producer_ndims = entry.ndims;
+    if (e.producer_ndims > MAX_TENSOR_DIMS) e.producer_ndims = MAX_TENSOR_DIMS;  // writer emits this many entries
+    e.producer_start_offset = entry.start_offset;
+    for (uint32_t i = 0; i < e.producer_ndims; i++) {
+        e.producer_shape[i] = entry.shapes[i];
+        e.producer_strides[i] = entry.strides[i];
+    }
+}
+
+// ---------------------------------------------------------------------------
+// JSON writer
+// ---------------------------------------------------------------------------
+
+void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) {
+    // Emits `[a,b,c]` with no whitespace; n == 0 yields `[]`.
+    out << '[';
+    for (uint32_t idx = 0; idx < n; idx++) {
+        out << (idx == 0 ? "" : ",") << data[idx];
+    }
+    out << ']';
+}
+
+// Serialize the three tables to `path` as one compact JSON object; false on open failure or any failed write.
+bool write_deps_json(
+    const char *path, const std::vector<TaskTableEntry> &tasks, const std::vector<TensorTableEntry> &tensors,
+    const std::vector<EdgeAnnot> &edges
+) {
+    std::ofstream out(path, std::ios::out | std::ios::trunc);
+    if (!out) {
+        LOG_ERROR("dep_gen replay: failed to open '%s' for write", path);
+        return false;
+    }
+    // Strided tensor representation. tensors[].buffer_numel is the underlying
+    // storage element count; tasks[].args[] and edges[] carry per-slice
+    // geometry as (start_offset uint64, strides[] uint32 — runtime invariant
+    // forbids zero / negative strides, see runtime/tensor.h).
+    out << "{\"tasks\":[";
+    for (size_t i = 0; i < tasks.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &t = tasks[i];
+        // uint64 fields are quoted as strings — task_id/tensor_id/buffer_addr/pred/succ can exceed
+        // Number.MAX_SAFE_INTEGER (2^53-1), silently losing precision in JS-based JSON parsers.
+        // Python consumers already pass these through int(...) and don't care which form they receive.
+        out << "{\"task_id\":\"" << t.task_id << '"';
+        out << ",\"scope\":\"" << (t.in_manual_scope ? "manual" : "auto") << '"';
+        // Per-subslot kernel ids {AIC, AIV0, AIV1}; INVALID_KERNEL_ID = -1 for
+        // inactive subslots. Emitted as a plain int triple — downstream viewers
+        // (and the swimlane host post-processor) use it to resolve task_id →
+        // kernel without the AICore record carrying the field itself.
+        out << ",\"kernel_ids\":[" << t.kernel_id[0] << ',' << t.kernel_id[1] << ',' << t.kernel_id[2] << ']';
+        out << ",\"block_num\":" << t.block_num;
+        out << ",\"args\":[";
+        for (size_t a = 0; a < t.args.size(); a++) {
+            if (a > 0) out << ',';
+            const auto &arg = t.args[a];
+            out << "{\"idx\":" << arg.idx;
+            out << ",\"type\":\"" << arg_type_str(arg.arg_type) << '"';
+            if (arg.has_tensor_info) {
+                out << ",\"tensor_id\":\"" << arg.tensor_id << '"';
+                out << ",\"dtype\":\"" << get_dtype_name(static_cast<DataType>(arg.dtype)) << '"';
+                out << ",\"shape\":";
+                write_uint_array(out, arg.shape, arg.ndims);
+                out << ",\"start_offset\":\"" << arg.start_offset << '"';
+                out << ",\"strides\":";
+                write_uint_array(out, arg.strides, arg.ndims);
+            }
+            out << '}';
+        }
+        out << "]}";
+    }
+    out << ']';
+
+    out << ",\"tensors\":[";
+    for (size_t i = 0; i < tensors.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &t = tensors[i];
+        out << "{\"tensor_id\":\"" << t.tensor_id << '"';
+        out << ",\"buffer_addr\":\"" << t.buffer_addr << '"';
+        out << ",\"version\":" << t.version;
+        out << ",\"dtype\":\"" << get_dtype_name(static_cast<DataType>(t.dtype)) << '"';
+        out << ",\"buffer_numel\":\"" << t.buffer_numel << '"';
+        out << '}';
+    }
+    out << ']';
+
+    out << ",\"edges\":[";
+    for (size_t i = 0; i < edges.size(); i++) {
+        if (i > 0) out << ',';
+        const auto &e = edges[i];
+        out << "{\"pred\":\"" << e.pred << "\",\"succ\":\"" << e.succ << '"';
+        out << ",\"arg\":" << e.consumer_arg_idx;
+        out << ",\"source\":\"" << edge_source_str(e.source) << '"';
+        if (e.source == EdgeSource::TENSORMAP) {
+            out << ",\"overlap\":\"" << overlap_status_str(e.overlap) << '"';
+        }
+        if (e.source != EdgeSource::EXPLICIT) {
+            out << ",\"tensor_id\":\"" << e.tensor_id << '"';
+            out << ",\"consumer_dtype\":\"" << get_dtype_name(static_cast<DataType>(e.consumer_dtype)) << '"';
+            out << ",\"consumer_shape\":";
+            write_uint_array(out, e.consumer_shape, e.consumer_ndims);
+            out << ",\"consumer_start_offset\":\"" << e.consumer_start_offset << '"';
+            out << ",\"consumer_strides\":";
+            write_uint_array(out, e.consumer_strides, e.consumer_ndims);
+        }
+        if (e.source == EdgeSource::TENSORMAP) {
+            out << ",\"producer_shape\":";
+            write_uint_array(out, e.producer_shape, e.producer_ndims);
+            out << ",\"producer_start_offset\":\"" << e.producer_start_offset << '"';
+            out << ",\"producer_strides\":";
+            write_uint_array(out, e.producer_strides, e.producer_ndims);
+        }
+        out << '}';
+    }
+    out << "]}\n";
+    return static_cast<bool>(out.flush());  // flush first so buffered write errors reach the state check
+}
+
+// ---------------------------------------------------------------------------
+// Annot pass — mirrors compute_task_fanin step-by-step against tm_annot.
+// Must stay bit-equivalent to pto_dep_compute.h::compute_task_fanin in terms
+// of which producer IDs are emitted (the differential check enforces this).
+// ---------------------------------------------------------------------------
+
+template <typename EmitTM, typename EmitCreator>
+void annot_pass(
+    const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, EmitCreator emit_creator,
+    EmitTM emit_tensormap
+) {
+    if (in_manual_scope) {
+        return;  // manual scope: neither callback fires for this record
+    }
+    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+        TensorArgType ptype = inputs.arg_types[i];
+        if (ptype == TensorArgType::OUTPUT) {
+            continue;  // OUTPUT slots take part in neither STEP A nor STEP B
+        }
+        const Tensor *tensor = &inputs.tensors[i].ref();
+
+        // STEP A: creator retention — emit_creator(owner, arg index, tensor) when the owner id is valid.
+        PTO2TaskId owner = tensor->owner_task_id;
+        if (owner.is_valid()) {
+            emit_creator(owner, i, *tensor);
+        }
+
+        // STEP B: tensormap lookup (only INPUT/INOUT, skip manual_dep).
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
+            continue;
+        }
+        if (tensor->manual_dep) {
+            continue;
+        }
+
+        tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
+            emit_tensormap(entry.producer_task_id, i, *tensor, entry, overlap_status);  // emit before any removal
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
+                tensor_map.remove_entry(entry);  // mutates tensor_map: INOUT fully covering an entry drops it
+            }
+            return true;  // presumably tells lookup to keep visiting entries — confirm in pto_tensormap.h
+        });
+    }
+}
+
+}  // namespace
+
+// Replays `records` through two host-resident tensormaps and writes deps.json to `deps_json_path`.
+// Returns 0 on success; -1 bad args, -3 tensormap init, -4 oracle fatal, -5 write failure, -6 divergence.
+extern "C" int
+dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, const char *deps_json_path) {
+    if (deps_json_path == nullptr) {
+        LOG_ERROR("dep_gen replay: null deps_json_path");
+        return -1;
+    }
+    if (num_records > 0 && records == nullptr) {
+        LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records);
+        return -1;
+    }
+    LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records);
+
+    // Per-ring task window sizes — tensormap masks slot indices and requires
+    // each to be a power of two. Auto-size from the records themselves so each
+    // ring's window comfortably covers its observed max local_id (no slot
+    // aliasing during INOUT+COVERED remove_from_task). Only entry [0] is passed
+    // to reserve_layout below, identically for both maps, keeping them in lockstep.
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint32_t max_local[PTO2_MAX_RING_DEPTH] = {0};
+    for (size_t i = 0; i < num_records; i++) {
+        PTO2TaskId tid{records[i].task_id};
+        uint8_t ring = tid.ring();
+        uint32_t local = tid.local();
+        if (ring < PTO2_MAX_RING_DEPTH && local > max_local[ring]) {
+            max_local[ring] = local;
+        }
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t need = static_cast<int32_t>(max_local[r] + 1);
+        task_window_sizes[r] = ceil_pow2(need < 16 ? 16 : need);
+    }
+
+    int32_t output_count = count_outputs(records, num_records);
+    int32_t pool_size = output_count + (output_count / 10) + 64;
+    if (pool_size < PTO2_TENSORMAP_POOL_SIZE) {
+        pool_size = PTO2_TENSORMAP_POOL_SIZE;
+    }
+
+    PTO2TensorMap tm_oracle;
+    PTO2TensorMap tm_annot;
+    std::memset(&tm_oracle, 0, sizeof(tm_oracle));
+    std::memset(&tm_annot, 0, sizeof(tm_annot));
+
+    // Libc-backed arena (default ctor) that owns both replay tensormaps'
+    // storage. Released by the arena destructor when this function returns.
+    DeviceArena replay_arena;
+
+    auto oracle_layout =
+        PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes[0]);
+    auto annot_layout =
+        PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes[0]);
+    if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) ||
+        !tm_annot.init_data_from_layout(annot_layout, replay_arena)) {
+        LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size);
+        return -3;
+    }
+    // Replay tensormaps live entirely on host; only arena-internal pointer
+    // fields need wiring (no parent-orch back-reference exists anymore).
+    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena);
+    tm_annot.wire_arena_pointers(annot_layout, replay_arena);
+
+    // JSON output accumulators.
+    std::vector<TaskTableEntry> task_table;
+    std::vector<TensorTableEntry> tensor_table;
+    std::unordered_map<uint64_t, size_t> tensor_index;  // tensor_id → table idx
+    std::vector<EdgeAnnot> annot_edges;
+    annot_edges.reserve(num_records * 2);
+
+    TensorRef tref_buf[CORE_MAX_TENSOR_ARGS];
+    TensorArgType atype_buf[CORE_MAX_TENSOR_ARGS];
+
+    // Per-record dedup of producer IDs — must match runtime's
+    // PTO2FaninBuilder::append_fanin_or_fail semantics, which collapses STEP 1
+    // (explicit_deps) + STEP A (creator retention) + STEP B (tensormap lookup)
+    // into a single per-task fanin list. Both oracle and annot use this same
+    // semantics so the divergence check is meaningful.
+    std::unordered_set<uint64_t> oracle_preds;
+    std::unordered_set<uint64_t> annot_preds;
+
+    // Scratch buffer for assembling full dep lists across overflow chains.
+    // Declared outside the loop so it can be reused (clear() keeps capacity).
+    std::vector<uint64_t> full_deps_buf;
+
+    for (size_t rec_i = 0; rec_i < num_records; rec_i++) {
+        const DepGenRecord &rec = records[rec_i];
+
+        // Overflow chain records are consumed by the preceding base; skip
+        // them in the main scan so we don't double-process or read the
+        // overflow's reinterpreted bytes as tensor/dep info.
+        if (rec.flags & DEP_GEN_FLAG_OVERFLOW) continue;
+
+        PTO2TaskId task_id{rec.task_id};
+        bool in_manual_scope = (rec.flags & DEP_GEN_FLAG_IN_MANUAL_SCOPE) != 0;
+
+        oracle_preds.clear();
+        annot_preds.clear();
+
+        int32_t tc = static_cast<int32_t>(rec.tensor_count);
+        if (tc > CORE_MAX_TENSOR_ARGS) {
+            tc = CORE_MAX_TENSOR_ARGS;
+        }
+        for (int32_t i = 0; i < tc; i++) {
+            tref_buf[i] = reinterpret_cast<const Tensor *>(&rec.tensors[i][0]);
+            atype_buf[i] = static_cast<TensorArgType>(rec.arg_types[i]);
+        }
+
+        // Assemble the full dep list. Fast path: ≤ DEP_GEN_MAX_EXPLICIT_DEPS,
+        // no chain, point straight at rec.explicit_deps. Slow path: gather
+        // base + chain into full_deps_buf and point at the buffer.
+        //
+        // `explicit_dep_count` / `over->dep_count` originate from device
+        // shared memory and are bounded by the writer to the array sizes, but
+        // we clamp on read too so a corrupted record never drives an OOB read
+        // off the end of rec.explicit_deps[64] / over->deps[582].
+        const uint64_t *deps_data;
+        int32_t dc;
+        if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) {
+            full_deps_buf.clear();
+            uint16_t base_dc = rec.explicit_dep_count;
+            if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) {
+                LOG_ERROR(
+                    "dep_gen replay: clamping base explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                    base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id
+                );
+                base_dc = DEP_GEN_MAX_EXPLICIT_DEPS;
+            }
+            full_deps_buf.reserve(static_cast<size_t>(base_dc) + DEP_GEN_OVERFLOW_DEPS_PER_RECORD);
+            full_deps_buf.insert(full_deps_buf.end(), rec.explicit_deps, rec.explicit_deps + base_dc);
+            bool chain_complete = false;
+            for (size_t j = rec_i + 1; j < num_records; j++) {
+                const DepGenRecord &maybe = records[j];
+                if (!(maybe.flags & DEP_GEN_FLAG_OVERFLOW)) {
+                    LOG_ERROR(
+                        "dep_gen replay: unterminated overflow chain at rec_idx=%zu (task_id=%" PRIu64 ")", rec_i,
+                        rec.task_id
+                    );
+                    break;
+                }
+                if (maybe.task_id != rec.task_id) {
+                    LOG_ERROR(
+                        "dep_gen replay: orphan overflow at rec_idx=%zu (expected task_id=%" PRIu64 ", found %" PRIu64
+                        ")",
+                        j, rec.task_id, maybe.task_id
+                    );
+                    break;
+                }
+                const auto *over = reinterpret_cast<const DepGenOverflowRecord *>(&maybe);
+                uint16_t over_dc = over->dep_count;
+                if (over_dc > DEP_GEN_OVERFLOW_DEPS_PER_RECORD) {
+                    LOG_ERROR(
+                        "dep_gen replay: clamping overflow dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                        over_dc, DEP_GEN_OVERFLOW_DEPS_PER_RECORD, j, rec.task_id
+                    );
+                    over_dc = DEP_GEN_OVERFLOW_DEPS_PER_RECORD;
+                }
+                full_deps_buf.insert(full_deps_buf.end(), over->deps, over->deps + over_dc);
+                if (over->flags & DEP_GEN_FLAG_LAST_OVERFLOW) {
+                    chain_complete = true;
+                    break;
+                }
+            }
+            if (!chain_complete) {
+                LOG_ERROR(
+                    "dep_gen replay: chain for task_id=%" PRIu64 " missing LAST_OVERFLOW marker — "
+                    "using partial dep list (%zu deps)",
+                    rec.task_id, full_deps_buf.size()
+                );
+            }
+            deps_data = full_deps_buf.data();
+            dc = static_cast<int32_t>(full_deps_buf.size());
+        } else {
+            deps_data = rec.explicit_deps;
+            uint16_t base_dc = rec.explicit_dep_count;
+            if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) {
+                LOG_ERROR(
+                    "dep_gen replay: clamping no-chain explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")",
+                    base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id
+                );
+                base_dc = DEP_GEN_MAX_EXPLICIT_DEPS;
+            }
+            dc = static_cast<int32_t>(base_dc);
+        }
+
+        DepInputs inputs;
+        inputs.tensor_count = tc;
+        inputs.tensors = tref_buf;
+        inputs.arg_types = atype_buf;
+        inputs.explicit_dep_count = dc;
+        inputs.explicit_deps = reinterpret_cast<const PTO2TaskId *>(deps_data);
+
+        // Register tasks[] entry (with per-arg slot info) and any unseen
+        // tensors[] entries up-front. Tensors are registered from the
+        // consumer-side blob so raw_shapes / dtype are populated (the
+        // producer-side PTO2TensorMapEntry drops raw_shapes to fit in two
+        // cache lines).
+        TaskTableEntry task_entry;
+        task_entry.task_id = rec.task_id;
+        task_entry.in_manual_scope = in_manual_scope;
+        task_entry.kernel_id[0] = rec.kernel_id[0];
+        task_entry.kernel_id[1] = rec.kernel_id[1];
+        task_entry.kernel_id[2] = rec.kernel_id[2];
+        task_entry.block_num = rec.block_num > 0 ? rec.block_num : 1u;
+        task_entry.args.reserve(tc);
+        for (int32_t i = 0; i < tc; i++) {
+            TaskArgEntry slot{};
+            slot.idx = i;
+            slot.arg_type = atype_buf[i];
+            if (atype_buf[i] == TensorArgType::OUTPUT) {
+                // OUTPUT blob is zero at submit time (writer has no Tensor
+                // yet); leave has_tensor_info=false. Viewers render this as
+                // a placeholder "alloc" output slot.
+                slot.has_tensor_info = false;
+            } else {
+                const Tensor &t = tref_buf[i].ref();
+                slot.has_tensor_info = true;
+                slot.tensor_id = register_tensor(tensor_index, tensor_table, t);
+                slot.dtype = static_cast<uint8_t>(t.dtype);
+                slot.ndims = t.ndims;
+                if (slot.ndims > MAX_TENSOR_DIMS) slot.ndims = MAX_TENSOR_DIMS;  // shape[]/strides[] capacity
+                slot.start_offset = t.start_offset;
+                for (uint32_t d = 0; d < slot.ndims; d++) {
+                    slot.shape[d] = t.shapes[d];
+                    slot.strides[d] = t.strides[d];
+                }
+            }
+            task_entry.args.push_back(slot);
+        }
+        task_table.push_back(std::move(task_entry));
+
+        // ============ STEP 1 — explicit_deps (call-site emit) ============
+        // Same loop on both passes; they MUST produce identical sets here
+        // because they read the same record. Annot records explicit edges
+        // with consumer_arg_idx = -1 (not tied to any tensor arg). Reads
+        // from deps_data (base record's explicit_deps[] on fast path, the
+        // gathered base+chain buffer on overflow path).
+        for (int32_t i = 0; i < dc; i++) {
+            uint64_t pred_raw = deps_data[i];
+            // Oracle side only tracks the ID set; duplicates collapse in the
+            // unordered_set, so the insert result is deliberately ignored.
+            oracle_preds.insert(pred_raw);
+            if (annot_preds.insert(pred_raw).second) {
+                EdgeAnnot e{};
+                e.pred = pred_raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = -1;
+                e.source = EdgeSource::EXPLICIT;
+                annot_edges.push_back(e);
+            }
+        }
+
+        // ============ ORACLE pass — drive compute_task_fanin ============
+        bool ok = compute_task_fanin(inputs, tm_oracle, in_manual_scope, [&](PTO2TaskId producer) -> bool {
+            oracle_preds.insert(producer.raw);
+            return true;
+        });
+        if (!ok) {
+            LOG_ERROR("dep_gen replay: compute_task_fanin returned fatal at task_id=%" PRIu64, rec.task_id);
+            tm_oracle.destroy();
+            tm_annot.destroy();
+            return -4;
+        }
+
+        // ============ ANNOT pass — inline mirror, full entry capture ============
+        annot_pass(
+            inputs, tm_annot, in_manual_scope,
+            // emit_creator(producer, arg_idx, consumer_tensor)
+            [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer) {
+                if (!annot_preds.insert(producer.raw).second) {
+                    return;  // already covered by an earlier emit on this record
+                }
+                EdgeAnnot e{};
+                e.pred = producer.raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = arg_idx;
+                e.source = EdgeSource::CREATOR;
+                e.tensor_id = make_tensor_id(consumer.buffer.addr, consumer.version);
+                fill_consumer(e, consumer);
+                annot_edges.push_back(e);
+            },
+            // emit_tensormap(producer, arg_idx, consumer_tensor, entry, status)
+            [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer, const PTO2TensorMapEntry &entry,
+                OverlapStatus status) {
+                // No per-edge dedup here: every tensormap hit yields its own
+                // annotated edge, so one producer may contribute several edges
+                // (distinct slices / versions / consumer args). Only the
+                // producer-ID set (annot_preds) dedups by pred, matching the
+                // runtime PTO2FaninBuilder semantics used by the check below.
+                annot_preds.insert(producer.raw);
+                EdgeAnnot e{};
+                e.pred = producer.raw;
+                e.succ = rec.task_id;
+                e.consumer_arg_idx = arg_idx;
+                e.source = EdgeSource::TENSORMAP;
+                e.overlap = status;
+                e.tensor_id = make_tensor_id(entry.buffer_addr, entry.version);
+                fill_consumer(e, consumer);
+                fill_producer(e, entry);
+                annot_edges.push_back(e);
+            }
+        );
+
+        // ============ Differential check ============
+        if (oracle_preds != annot_preds) {
+            LOG_ERROR(
+                "dep_gen replay: DIVERGENCE at task_id=%" PRIu64 " (rec_idx=%zu): oracle has %zu preds, annot has %zu",
+                rec.task_id, rec_i, oracle_preds.size(), annot_preds.size()
+            );
+            // Log the symmetric difference for debugging.
+            for (uint64_t p : oracle_preds) {
+                if (annot_preds.find(p) == annot_preds.end()) {
+                    LOG_ERROR("  only-in-oracle pred: %" PRIu64, p);
+                }
+            }
+            for (uint64_t p : annot_preds) {
+                if (oracle_preds.find(p) == oracle_preds.end()) {
+                    LOG_ERROR("  only-in-annot  pred: %" PRIu64, p);
+                }
+            }
+            tm_oracle.destroy();
+            tm_annot.destroy();
+            return -6;
+        }
+
+        // ============ STEP 4 — publish outputs on BOTH maps ============
+        register_task_outputs(inputs, task_id, tm_oracle, in_manual_scope);
+        register_task_outputs(inputs, task_id, tm_annot, in_manual_scope);
+    }
+
+    tm_oracle.destroy();
+    tm_annot.destroy();
+
+    if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) {
+        return -5;
+    }
+    LOG_INFO_V0(
+        "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(),
+        tensor_table.size(), annot_edges.size()
+    );
+    return 0;
+}
diff --git a/src/a2a3/runtime/host_build_graph/host/dep_gen_replay.h b/src/a2a3/runtime/host_build_graph/host/dep_gen_replay.h
new file mode 100644
index 000000000..2ea3d5768
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/host/dep_gen_replay.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file dep_gen_replay.h
+ * @brief Host-side replay of in-memory DepGenRecord stream → deps.json.
+ *
+ * Takes the records the host collector drained from the device ring buffer
+ * (``DepGenCollector::records()``) and runs them back through a host-resident
+ * PTO2TensorMap using the same ``compute_task_fanin`` / ``register_task_outputs``
+ * primitives the device orchestrator uses, emitting the full
+ * predecessor → successor edge list to deps.json.
+ *
+ * The records buffer is passed in directly — there is no intermediate
+ * ``submit_trace.bin`` on disk. The host already has the records once the
+ * device run completes, so going through the filesystem would just be
+ * extra I/O and an extra file in the output directory.
+ *
+ * deps.json is the sole source of truth for fanout: the L2 swimlane hot
+ * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task
+ * 1 KB GM store off the scheduler critical path). Replay sees every
+ * submit and reconstructs the complete dependency graph.
+ *
+ * Output format (deps.json, strided tensor representation):
+ *
+ *   {"tasks":   [{"task_id":<u64>, "scope":"auto|manual",
+ *                 "args":[{"idx":<i32>, "type":"<arg_type>",
+ *                          "tensor_id":<u64>, "dtype":"...", "shape":[...],
+ *                          "start_offset":<u64>, "strides":[...]}, ...]}, ...],
+ *    "tensors": [{"tensor_id":<u64>, "buffer_addr":<u64>, "version":<i32>,
+ *                 "dtype":"FLOAT32", "buffer_numel":<u64>}, ...],
+ *    "edges":   [{"pred":<u64>, "succ":<u64>, "arg":<i32>,
+ *                 "source":"explicit|creator|tensormap",
+ *                 "overlap":"covered|other" (tensormap only),
+ *                 "tensor_id":<u64> (non-explicit),
+ *                 "consumer_dtype":"...", "consumer_shape":[...],
+ *                 "consumer_start_offset":<u64>, "consumer_strides":[...],
+ *                 "producer_shape":[...] (tensormap),
+ *                 "producer_start_offset":<u64> (tensormap),
+ *                 "producer_strides":[...] (tensormap)},
+ *                ...]}
+ *
+ *   - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``).
+ *   - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``.
+ *   - ``buffer_numel`` is the underlying storage element count; tensor shapes
+ *     are carried per-arg / per-edge alongside ``start_offset`` + ``strides``.
+ *   - Distinct producers / arg indices / sources keep their own edges; per-record
+ *     deduplication of producer ids mirrors the runtime
+ *     ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of
+ *     ``(pred, succ)`` pairs is identical to what the runtime would have
+ *     recorded.
+ *
+ * Self-checking: the replay runs two parallel tensormap instances per record —
+ * an "oracle" map driven by the canonical ``compute_task_fanin`` template, and
+ * an "annotated" map driven by an inlined mirror that captures the per-edge
+ * tensor metadata. If the producer-id set on the two passes ever diverges,
+ * deps.json is NOT written and the function returns a non-zero error code.
+ * This is the guarantee against silent shotgun modifications: anyone who
+ * changes ``compute_task_fanin`` semantics has to mirror the change here too
+ * or the gate fires immediately.
+ *
+ * The replay is single-threaded and pure CPU: no device handle is required.
+ */
+
+#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_HOST_DEP_GEN_REPLAY_H_
+#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_HOST_DEP_GEN_REPLAY_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Opaque forward decl — the canonical layout lives in common/dep_gen.h, but
+// replay's API only needs to take a pointer + count. Callers who construct
+// the buffer must include common/dep_gen.h themselves.
+struct DepGenRecord;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Replay an in-memory DepGenRecord stream and write deps.json.
+ *
+ * Per-ring task window sizes are auto-derived from the trace itself so each
+ * ring's window covers its observed max local_id without slot aliasing.
+ *
+ * @param records            Pointer to a contiguous DepGenRecord array
+ *                           (typically ``DepGenCollector::records().data()``).
+ * @param num_records        Number of records in the array.
+ * @param deps_json_path     Output path; truncated if it exists.
+ * @return 0 on success; negative on error (see source for codes).
+ */
+int dep_gen_replay_emit_deps_json(const struct DepGenRecord *records, size_t num_records, const char *deps_json_path);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_HOST_DEP_GEN_REPLAY_H_
diff --git a/src/a2a3/runtime/host_build_graph/host/host_orch_compat_stubs.cpp b/src/a2a3/runtime/host_build_graph/host/host_orch_compat_stubs.cpp
new file mode 100644
index 000000000..603daa157
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/host/host_orch_compat_stubs.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host-side weak stubs for AICPU-only profiling / dump-args symbols.
+ *
+ * host_build_graph runs the orchestrator on the host (host-orch-first), so the
+ * orchestrator core (pto_orchestrator.cpp / pto_runtime2.cpp) is compiled into
+ * libhost_runtime.so, which is dlopen'd RTLD_LOCAL and must therefore resolve
+ * all of its symbols. The scope-stats and dump-args collectors are AICPU-only
+ * (defined in common/platform/.../aicpu) and are NOT linked into the host
+ * library. They record on-device diagnostics; the host orchestrator only builds
+ * the task graph, so no-op definitions are correct here.
+ *
+ * Marked weak + hidden so they never leak into the global dynamic symbol table
+ * (RTLD_LOCAL keeps them library-local anyway) and never shadow the AICPU
+ * library's strong definitions, mirroring the weak-stub pattern in
+ * pto_orchestrator.cpp.
+ */
+
+#include "aicpu/scope_stats_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+__attribute__((weak, visibility("hidden"))) bool is_dump_args_enabled() { return false; }
+
+__attribute__((weak, visibility("hidden"))) void
+set_dump_args_task_mask(uint64_t, TensorDumpArgMask, TensorDumpArgMask) {}
+
+__attribute__((weak, visibility("hidden"))) void set_dump_args_task_scalar_dtypes(uint64_t, uint32_t, const uint8_t *) {
+}
+
+__attribute__((weak, visibility("hidden"))) void
+scope_stats_begin(int, int32_t, int32_t, uint64_t, uint64_t, int32_t, int32_t, int32_t) {}
+
+__attribute__((weak, visibility("hidden"))) void
+scope_stats_end(int, int32_t, int32_t, uint64_t, uint64_t, int32_t, int32_t, int32_t) {}
+
+__attribute__((weak, visibility("hidden"))) void scope_stats_on_fatal() {}
+
+__attribute__((weak, visibility("hidden"))) void scope_stats_set_pending_site(const char *, int) {}
diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_compile_info.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_compile_info.cpp
index ea62cb3fa..dfc5590c1 100644
--- a/src/a2a3/runtime/host_build_graph/host/runtime_compile_info.cpp
+++ b/src/a2a3/runtime/host_build_graph/host/runtime_compile_info.cpp
@@ -20,7 +20,8 @@ ToolchainType get_incore_compiler(void) {
 }
 
 ToolchainType get_orchestration_compiler(void) {
-    // host_build_graph: always host g++ (orchestration runs on host)
+    // host_build_graph: always host g++. The orchestration .so is dlopen'd and run on the host
+    // (never on the AICPU), so it must be built for the host architecture on every platform.
     return TOOLCHAIN_HOST_GXX;
 }
 }
diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp
index ec686ab95..18ada7110 100644
--- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp
@@ -9,302 +9,593 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Runtime Builder - Generic Implementation
+ * Runtime Builder - rt2 Implementation (host_build_graph: Host Orchestration)
  *
- * Provides init_runtime_impl and validate_runtime_impl functions that work with
- * pluggable orchestration functions for building task graphs.
+ * Provides init_runtime_impl and validate_runtime_impl functions for rt2 runtime.
+ * The HOST runs the orchestrator to completion, populates shared memory + the
+ * prebuilt arena, and H2Ds the image; the device boots scheduler-only.
  *
  * init_runtime_impl:
- *   - Calls orchestration function to build task graph
- *   - Orchestration is responsible for device memory management
+ *   - Converts host tensor pointers to device pointers (all inputs copied H2D;
+ *     only OUTPUT/INOUT tensors are copied back D2H)
+ *   - dlopens the orchestration SO on the host and runs it to build the graph
+ *   - Sets up runtime state for host orchestration
  *
- * validate_runtime_impl (finalize_runtime_impl):
- *   - Copies recorded tensors back from device to host
+ * validate_runtime_impl:
+ *   - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs
+ *     are skipped)
  *   - Frees device memory
  */
 
 #include <dlfcn.h>
-#include <fcntl.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <unistd.h>
 
+#include <cerrno>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <cctype>
+#include <cstdlib>
 #include <cstring>
+#include <limits>
 #include <string>
+#include <type_traits>
 #include <vector>
 
+#include "../common/pto_runtime_status.h"
+#include "../runtime/common.h"
+#include "../runtime/pto_orchestrator.h"
+#include "../runtime/pto_runtime2.h"
+#include "../runtime/pto_shared_memory.h"
+#include "../runtime/pto_types.h"
+#include "../runtime/runtime.h"
+#include "../../../../common/task_interface/call_config.h"
 #include "callable.h"
-#include "orchestration_api.h"
+#include "common/platform_config.h"
+#include "common/unified_log.h"
+#include "utils/device_arena.h"
 #include "prepare_callable_common.h"
-#include "runtime.h"  // Includes unified_log.h and provides LOG_* macros
-#include "task_args.h"
 
-namespace {
-
-struct OrchestrationRuntimeImpl {
-    const OrchestrationRuntimeOps *ops;
-    Runtime *runtime;
-    // Platform device-memory hooks. Host orchestration runs through the ops
-    // callbacks below, which need host_api but cannot take it as a parameter
-    // (fixed OrchestrationRuntimeOps ABI) — so it travels here, alongside the
-    // Runtime pointer, filled by bind_callable_to_runtime_impl.
-    const HostApi *host_api;
-    struct TensorInfoBuilder *tensor_info_builder;
-    struct TensorAllocationBuilder *tensor_allocation_builder;
-};
+// SVM map/unmap bridge, defined in the platform's c_api_shared.cpp (linked into
+// the same host_runtime .so). Maps a staged device tensor into host address
+// space so the host-side orchestrator can read control tensors directly. Routed
+// through the per-thread DeviceRunner, not Runtime.host_api — tensormap/a5 ABI
+// is untouched. extern "C" to match the C-linkage definition (c_api_shared
+// defines these inside its `extern "C"` block).
+extern "C" void *svm_register_via_runner(void *dev_ptr, size_t size);
+extern "C" void svm_unregister_via_runner(void *dev_ptr);
+
+// RuntimeEnv (call_config.h) is the cross-runtime ABI for per-ring config and
+// carries RUNTIME_ENV_RING_COUNT slots, shared with tensormap_and_ringbuffer.
+// host_build_graph is single-ring (PTO2_MAX_RING_DEPTH == 1) and reads only the
+// first slot; it must fit within the ABI's slot budget, not equal it.
+static_assert(PTO2_MAX_RING_DEPTH <= RUNTIME_ENV_RING_COUNT, "PTO2 runtime ring depth must fit RuntimeEnv ring slots");
+
+// Helper: return current time in milliseconds
+static int64_t _now_ms() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<int64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
+}
 
-struct TensorInfoBuilder {
-    std::vector<std::vector<TensorInfo>> tensor_info_by_task;
+static bool is_power_of_2_u64(uint64_t value) { return value != 0 && (value & (value - 1)) == 0; }
 
-    int set_tensor_info_to_task(int task_id, const TensorInfo *tensor_info, int tensor_count) {
-        if (task_id < 0 || tensor_count < 0 || tensor_count > RUNTIME_MAX_ARGS) {
-            return -1;
-        }
-        if (static_cast<size_t>(task_id) >= tensor_info_by_task.size()) {
-            tensor_info_by_task.resize(static_cast<size_t>(task_id) + 1);
+template <typename T>
+static std::string format_ring_array(const T (&values)[PTO2_MAX_RING_DEPTH]) {
+    std::string out = "[";
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        if (r != 0) {
+            out += ", ";
         }
-        std::vector<TensorInfo> &task_info = tensor_info_by_task[static_cast<size_t>(task_id)];
-        task_info.assign(tensor_info, tensor_info + tensor_count);
-        return 0;
+        out += std::to_string(values[r]);
     }
-};
+    out += "]";
+    return out;
+}
 
-struct TensorAllocationBuilder {
-    std::vector<TensorAllocationInfo> allocations;
+static std::string trim_copy(const std::string &input) {
+    size_t begin = 0;
+    while (begin < input.size() && std::isspace(static_cast<unsigned char>(input[begin]))) {
+        ++begin;
+    }
+    size_t end = input.size();
+    while (end > begin && std::isspace(static_cast<unsigned char>(input[end - 1]))) {
+        --end;
+    }
+    return input.substr(begin, end - begin);
+}
 
-    void record_allocation(void *ptr, size_t size) {
-        if (ptr == nullptr || size == 0) {
+static bool parse_uint_token(
+    const char *name, const std::string &raw, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t *out
+) {
+    std::string token = trim_copy(raw);
+    if (token.empty()) {
+        LOG_WARN("%s has an empty value in '%s', ignored", name, raw.c_str());
+        return false;
+    }
+
+    if (token[0] == '-') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    char *endptr = nullptr;
+    errno = 0;
+    unsigned long long parsed = std::strtoull(token.c_str(), &endptr, 10);
+    if (errno == ERANGE || endptr == token.c_str() || *endptr != '\0') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    uint64_t val = static_cast<uint64_t>(parsed);
+
+    if (val < min_val || val > max_val) {
+        LOG_WARN(
+            "%s=%s invalid (must be in [%" PRIu64 ", %" PRIu64 "]), ignored", name, token.c_str(), min_val, max_val
+        );
+        return false;
+    }
+    if (require_power_of_2 && !is_power_of_2_u64(val)) {
+        LOG_WARN("%s=%s invalid (must be a power of 2), ignored", name, token.c_str());
+        return false;
+    }
+    *out = val;
+    return true;
+}
+
+static void apply_env_ring_values(
+    const char *name, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t out[PTO2_MAX_RING_DEPTH]
+) {
+    const char *env = std::getenv(name);
+    if (!env) return;
+
+    std::string text(env);
+    if (text.find(',') == std::string::npos) {
+        uint64_t value = 0;
+        if (!parse_uint_token(name, text, min_val, max_val, require_power_of_2, &value)) {
             return;
         }
-        allocations.push_back({reinterpret_cast<uint64_t>(ptr), static_cast<uint64_t>(size)});
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            out[r] = value;
+        }
+        return;
     }
 
-    void erase_allocation(void *ptr) {
-        if (ptr == nullptr) {
+    uint64_t parsed[PTO2_MAX_RING_DEPTH]{};
+    size_t pos = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size_t comma = text.find(',', pos);
+        std::string token = text.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+        if (!parse_uint_token(name, token, min_val, max_val, require_power_of_2, &parsed[r])) {
             return;
         }
-        uint64_t base_addr = reinterpret_cast<uint64_t>(ptr);
-        for (auto it = allocations.begin(); it != allocations.end(); ++it) {
-            if (it->base_addr == base_addr) {
-                allocations.erase(it);
+        if (comma == std::string::npos) {
+            if (r != PTO2_MAX_RING_DEPTH - 1) {
+                LOG_WARN(
+                    "%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env,
+                    PTO2_MAX_RING_DEPTH
+                );
                 return;
             }
+            pos = text.size();
+        } else {
+            pos = comma + 1;
         }
     }
-};
-
-// Free every device buffer the orchestration recorded in the allocation
-// builder. Used on bind error paths: once tensor_pairs_ is cleared, the
-// finalize-time cleanup can no longer see these, so they must be freed here
-// or they leak.
-void free_tensor_allocations(const HostApi *api, const TensorAllocationBuilder &builder) {
-    for (const TensorAllocationInfo &allocation : builder.allocations) {
-        api->device_free(reinterpret_cast<void *>(static_cast<uintptr_t>(allocation.base_addr)));
+    if (pos < text.size() || (!text.empty() && text.back() == ',')) {
+        LOG_WARN("%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env, PTO2_MAX_RING_DEPTH);
+        return;
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        out[r] = parsed[r];
     }
 }
 
-Runtime *unwrap_runtime(OrchestrationRuntime *runtime) {
-    return reinterpret_cast<OrchestrationRuntimeImpl *>(runtime)->runtime;
-}
-
-const HostApi *unwrap_host_api(OrchestrationRuntime *runtime) {
-    return reinterpret_cast<OrchestrationRuntimeImpl *>(runtime)->host_api;
+// ring_task_window / ring_heap / ring_dep_pool point into the #pragma pack(1)
+// RuntimeEnv wire struct (call_config.h), so their uint64_t entries are only
+// byte-aligned — runtime_env sits at offset 28 in CallConfig (after 7 int32_t),
+// i.e. 4-byte but not 8-byte aligned. Reading them as `base[idx]` is an
+// unaligned 8-byte load: UB, and fatal under UBSan (-fsanitize=alignment). Copy
+// the bytes out instead. A null base means "no per-task overrides" -> 0 (unset).
+static uint64_t read_ring_override(const uint64_t *base, int idx) {
+    if (base == nullptr) {
+        return 0;
+    }
+    uint64_t value;
+    std::memcpy(&value, base + idx, sizeof(value));
+    return value;
 }
 
-TensorInfoBuilder *unwrap_tensor_info_builder(OrchestrationRuntime *runtime) {
-    return reinterpret_cast<OrchestrationRuntimeImpl *>(runtime)->tensor_info_builder;
-}
+// Each of ring_task_window / ring_heap / ring_dep_pool is a per-ring array of
+// PTO2_MAX_RING_DEPTH entries (0 = unset). Precedence per ring: per-task entry >
+// PTO2_RING_* env value > compile-time default. A "size all rings the same"
+// request arrives already broadcast to every entry by the caller.
+static bool resolve_ring_config(
+    const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool,
+    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH], uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH],
+    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    uint64_t dep_pool_values[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        eff_task_window_sizes[r] = PTO2_TASK_WINDOW_SIZE;
+        eff_heap_sizes[r] = PTO2_HEAP_SIZE;
+        dep_pool_values[r] = PTO2_DEP_LIST_POOL_SIZE;
+    }
 
-TensorAllocationBuilder *unwrap_tensor_allocation_builder(OrchestrationRuntime *runtime) {
-    return reinterpret_cast<OrchestrationRuntimeImpl *>(runtime)->tensor_allocation_builder;
-}
+    apply_env_ring_values("PTO2_RING_TASK_WINDOW", 4, static_cast<uint64_t>(INT32_MAX), true, eff_task_window_sizes);
+    apply_env_ring_values("PTO2_RING_HEAP", 1024, std::numeric_limits<uint64_t>::max(), false, eff_heap_sizes);
+    apply_env_ring_values("PTO2_RING_DEP_POOL", 4, static_cast<uint64_t>(INT32_MAX), false, dep_pool_values);
 
-int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) {
-    return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type);
-}
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        const uint64_t task_window_override = read_ring_override(ring_task_window, r);
+        const uint64_t heap_override = read_ring_override(ring_heap, r);
+        const uint64_t dep_pool_override = read_ring_override(ring_dep_pool, r);
+        if (task_window_override != 0) {
+            eff_task_window_sizes[r] = task_window_override;
+        }
+        if (heap_override != 0) {
+            eff_heap_sizes[r] = heap_override;
+        }
+        if (dep_pool_override != 0) {
+            dep_pool_values[r] = dep_pool_override;
+        }
 
-int runtime_set_tensor_info_to_task(
-    OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count
-) {
-    Runtime *host_runtime = unwrap_runtime(runtime);
-    if (task_id < 0 || task_id >= host_runtime->get_task_count()) {
-        LOG_ERROR("Invalid task_id %d for task tensor info", task_id);
-        return -1;
-    }
-    if (tensor_count == 0) {
-        return 0;
-    }
-    if (tensor_info == nullptr) {
-        LOG_ERROR("Task %d tensor info pointer is null", task_id);
-        return -1;
+        if (eff_task_window_sizes[r] < 4 || eff_task_window_sizes[r] > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(eff_task_window_sizes[r])) {
+            LOG_ERROR(
+                "ring_task_window[%d]=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", r, eff_task_window_sizes[r]
+            );
+            return false;
+        }
+        if (eff_heap_sizes[r] < 1024) {
+            LOG_ERROR("ring_heap[%d]=%" PRIu64 " must be >= 1024", r, eff_heap_sizes[r]);
+            return false;
+        }
+        if (dep_pool_values[r] < 4 || dep_pool_values[r] > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("ring_dep_pool[%d]=%" PRIu64 " must be in [4, INT32_MAX]", r, dep_pool_values[r]);
+            return false;
+        }
+        eff_dep_pool_capacities[r] = static_cast<int32_t>(dep_pool_values[r]);
     }
-    return unwrap_tensor_info_builder(runtime)->set_tensor_info_to_task(task_id, tensor_info, tensor_count);
-}
 
-void runtime_add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) {
-    unwrap_runtime(runtime)->add_successor(from_task, to_task);
-}
-
-void runtime_record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) {
-    unwrap_runtime(runtime)->tensor_pairs_.push_back({host_ptr, dev_ptr, size});
+    return true;
 }
 
-int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtime(runtime)->get_task_count(); }
+static int32_t pto2_read_runtime_status(Runtime *runtime, const HostApi *api, PTO2SharedMemoryHeader *host_header) {
+    if (runtime == nullptr || api == nullptr || host_header == nullptr) {
+        return 0;
+    }
 
-void runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); }
+    void *pto2_sm = runtime->get_gm_sm_ptr();
+    if (pto2_sm == nullptr) {
+        return 0;
+    }
 
-void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) {
-    void *ptr = unwrap_host_api(runtime)->device_malloc(size);
-    unwrap_tensor_allocation_builder(runtime)->record_allocation(ptr, size);
-    return ptr;
-}
+    int hdr_rc = api->copy_from_device(host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader));
+    if (hdr_rc != 0) {
+        LOG_WARN("Failed to copy PTO2 header from device");
+        return 0;
+    }
 
-void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) {
-    unwrap_tensor_allocation_builder(runtime)->erase_allocation(ptr);
-    unwrap_host_api(runtime)->device_free(ptr);
+    int32_t orch_error_code = host_header->orch_error_code.load(std::memory_order_relaxed);
+    int32_t sched_error_code = host_header->sched_error_code.load(std::memory_order_relaxed);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);
 }
 
-int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) {
-    return unwrap_host_api(runtime)->copy_to_device(dev_ptr, host_ptr, size);
-}
+namespace {
 
-const OrchestrationRuntimeOps k_orchestration_runtime_ops = {
-    runtime_add_task,       runtime_set_tensor_info_to_task, runtime_add_successor, runtime_record_tensor_pair,
-    runtime_get_task_count, runtime_print_runtime,           runtime_device_malloc, runtime_device_free,
-    runtime_copy_to_device,
-};
+// host_build_graph is host-orchestration-first: the HOST dlopens the
+// orchestration .so and runs it to completion. The shared memory + arena carry
+// host-DDR cross-task pointers (slot_state.task/payload,
+// payload.fanin_inline_slot_states[], wiring queue); the host relocates them to
+// their final device addresses (relocate_host_orch_image, below) BEFORE the H2D
+// copy, so the device receives a fully device-addressed image and schedules
+// only — no on-device pointer fixup.
 
 bool write_all_bytes(int fd, const uint8_t *data, size_t size) {
-    size_t total_written = 0;
-    while (total_written < size) {
-        ssize_t written = write(fd, data + total_written, size - total_written);
-        if (written <= 0) {
+    size_t total = 0;
+    while (total < size) {
+        ssize_t w = write(fd, data + total, size - total);
+        if (w <= 0) {
             return false;
         }
-        total_written += static_cast<size_t>(written);
+        total += static_cast<size_t>(w);
     }
     return true;
 }
 
-bool create_temp_so_file(const uint8_t *data, size_t size, std::string *out_path) {
-    char path_template[] = "/tmp/orch_so_XXXXXX";
-    int fd = mkstemp(path_template);
+// Materialize the orchestration .so bytes to a temp file so it can be dlopen'd
+// on the host (dlopen needs a real path + the exec bit).
+bool create_orch_so_tempfile(const uint8_t *data, size_t size, std::string *out_path) {
+    char tmpl[] = "/tmp/orch_so_XXXXXX";
+    int fd = mkstemp(tmpl);
     if (fd < 0) {
         return false;
     }
-
-    // dlopen requires the file to be executable; mkstemp creates 0600 (no exec bit)
     if (fchmod(fd, 0755) != 0) {
         close(fd);
-        unlink(path_template);
+        unlink(tmpl);
         return false;
     }
-
     bool ok = write_all_bytes(fd, data, size);
     if (close(fd) != 0) {
         ok = false;
     }
     if (!ok) {
-        unlink(path_template);
+        unlink(tmpl);
         return false;
     }
-
-    *out_path = path_template;
+    *out_path = tmpl;
     return true;
 }
 
-int upload_tensor_info_storage(Runtime *runtime, const HostApi *api, const TensorInfoBuilder &builder) {
-    runtime->clear_tensor_info_storage();
-    for (int task_id = 0; task_id < RUNTIME_MAX_TASKS; task_id++) {
-        runtime->set_tensor_info_range(task_id, 0, 0);
+// The orchestration .so exports these (PTO2 submit_task form).
+typedef void (*OrchestrationEntryFunc)(const L2TaskArgs &);
+typedef void (*OrchestrationBindFunc)(PTO2Runtime *);
+
+// Resolved orchestration .so entry points. register_callable_impl allocates one
+// of these (so both the entry and the .so's own framework_bind_runtime — which
+// sets the .so-private g_current_runtime its inline rt_submit_* reads — are
+// available per run) and stores its pointer in CallableArtifacts::
+// host_orch_func_ptr. Owned for the callable's lifetime alongside
+// host_dlopen_handle.
+struct HostOrchEntryPoints {
+    OrchestrationEntryFunc entry{nullptr};
+    OrchestrationBindFunc bind{nullptr};
+};
+
+// NOTE: this first paragraph documents the host orchestration run, NOT relocate_host_orch_image
+// below (which returns bool). `rt` was built with its scheduler half pointing at the device SM;
+// the run re-points ONLY the orchestrator half at a host SM mirror, runs the orchestration entry
+// against it, latches the submitted task count, and H2Ds the populated SM to the device (the
+// device scheduler reads task descriptors from there). The device never dereferences the
+// orchestrator's SM pointers, so leaving them host-side is safe. That run returns the total task
+// count (>= 0) on success, or -1 on failure.
+// host_build_graph host-orch: the orchestrator built the task graph in a host
+// SM mirror and (when wiring is folded into submit) the fanout adjacency in the
+// host arena, storing host-DDR addresses into the cross-task pointers. Relocate
+// them to their FINAL device addresses here on the host, BEFORE the SM/arena are
+// copied to the device — so the device receives a fully device-addressed image
+// and boots scheduler-only with no on-device pointer fixup.
+//
+// Relocated pointers span TWO regions with DIFFERENT deltas: the SM block
+// (slot_state.task/.payload, fanin_inline_slot_states[], dep-entry.slot_state,
+// ready-queue slot.slot_state) and the arena block (slot_state.fanout_head,
+// dep-entry.next, wiring-queue entries point into the SM but live in the arena).
+// Rather than track which delta each field needs, reloc() classifies every
+// pointer by the region it points INTO and applies that region's delta; foreign
+// and null pointers pass through untouched. This makes the same walk correct
+// whether wiring is still drained on the device (dep_pool/ready empty here, only
+// the wiring queue populated) or folded into the host submit (dep_pool/ready
+// populated, wiring queue empty).
+//
+// The orchestrator's own task-allocator pointers are intentionally NOT relocated
+// (the device runs scheduler-only and never dereferences them, and must not call
+// rt_orchestration_done — the host already did). Multi-fanin spill is not yet
+// relocated; a task exceeding PTO2_FANIN_INLINE_CAP producers latches fatal here
+// (returns false) rather than shipping un-relocated host pointers to the device.
+// Returns false on any unrelocatable pointer so the caller can fail the prepare.
+static bool relocate_host_orch_image(
+    PTO2SharedMemoryHandle &host_sm_handle, PTO2Runtime *rt, uint64_t host_sm, uint64_t sm_size, int64_t sm_delta,
+    uint64_t host_arena, uint64_t arena_size, int64_t arena_delta
+) {
+    // host_build_graph is single-ring; the loops below iterate the lone ring and
+    // index header->ring (singular). If the ring depth ever grows, those loops
+    // would relocate the same ring N times (applying the delta repeatedly =
+    // corruption), so pin the assumption here.
+    static_assert(PTO2_MAX_RING_DEPTH == 1, "relocate_host_orch_image assumes a single ring");
+
+    // SM and arena windows must not overlap — reloc classifies a pointer by
+    // which window it falls in, so an overlap would misclassify and apply the
+    // wrong delta. Both are independent malloc-backed host buffers in practice;
+    // check it at runtime so a future shared-buffer layout can't silently corrupt.
+    if (!(host_sm + sm_size <= host_arena || host_arena + arena_size <= host_sm)) {
+        LOG_ERROR(
+            "host-orch: SM window [%#lx,+%#lx) overlaps arena window [%#lx,+%#lx); cannot relocate", host_sm, sm_size,
+            host_arena, arena_size
+        );
+        return false;
     }
 
-    int task_count = runtime->get_task_count();
-    std::vector<TensorInfo> compact_tensor_info;
-    for (int task_id = 0; task_id < task_count; task_id++) {
-        const std::vector<TensorInfo> *task_info = nullptr;
-        if (static_cast<size_t>(task_id) < builder.tensor_info_by_task.size()) {
-            task_info = &builder.tensor_info_by_task[static_cast<size_t>(task_id)];
+    bool ok = true;
+    auto reloc = [&](auto *&p) {
+        using Ptr = std::remove_reference_t<decltype(p)>;
+        uint64_t v = reinterpret_cast<uint64_t>(p);
+        if (v == 0) {
+            return;
         }
-        uint32_t offset = static_cast<uint32_t>(compact_tensor_info.size());
-        uint16_t count = 0;
-        if (task_info != nullptr) {
-            count = static_cast<uint16_t>(task_info->size());
-            compact_tensor_info.insert(compact_tensor_info.end(), task_info->begin(), task_info->end());
+        if (v >= host_sm && v < host_sm + sm_size) {
+            p = reinterpret_cast<Ptr>(static_cast<uintptr_t>(v + sm_delta));
+        } else if (v >= host_arena && v < host_arena + arena_size) {
+            p = reinterpret_cast<Ptr>(static_cast<uintptr_t>(v + arena_delta));
+        } else {
+            // A non-null pointer in neither window is an external/host address
+            // the device would dereference verbatim after H2D. No field should
+            // legitimately carry one; latch fatal rather than ship a host VA to
+            // the device (silent AICPU corruption otherwise).
+            LOG_ERROR("host-orch: pointer %#lx is outside both SM and arena windows; cannot relocate for device", v);
+            ok = false;
         }
-        runtime->set_tensor_info_range(task_id, offset, count);
-    }
+    };
 
-    if (compact_tensor_info.empty()) {
-        return 0;
+    PTO2SharedMemoryHeader *header = host_sm_handle.header;
+    if (header != nullptr) {
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            PTO2SharedMemoryRingHeader &ring = header->ring;
+            int32_t count = ring.fc.current_task_index.load(std::memory_order_acquire);
+            for (int32_t slot = 0; slot < count; slot++) {
+                PTO2TaskSlotState *ss = &ring.slot_states[slot];
+                reloc(ss->task);
+                reloc(ss->payload);
+                reloc(ss->fanout_head);
+
+                PTO2TaskPayload *payload = &ring.task_payloads[slot];
+                int32_t nf = payload->fanin_actual_count;
+                if (nf > PTO2_FANIN_INLINE_CAP) {
+                    // host-orch does not yet relocate the multi-fanin spill pool,
+                    // so a task with more than the inline cap of producers would
+                    // ship un-relocated host pointers to the device. Fail loud
+                    // rather than silently clamp (tensormap handles this via the
+                    // on-device spill pool; that path is not wired here yet).
+                    LOG_ERROR(
+                        "host-orch: task slot %d has fanin %d > inline cap %d; multi-fanin spill relocation is not "
+                        "implemented",
+                        slot, nf, PTO2_FANIN_INLINE_CAP
+                    );
+                    ok = false;
+                    nf = PTO2_FANIN_INLINE_CAP;
+                }
+                for (int32_t i = 0; i < nf; i++) {
+                    reloc(payload->fanin_inline_slot_states[i]);
+                }
+            }
+        }
     }
 
-    size_t tensor_info_bytes = compact_tensor_info.size() * sizeof(TensorInfo);
-    void *dev_tensor_info_storage = api->device_malloc(tensor_info_bytes);
-    if (dev_tensor_info_storage == nullptr) {
-        LOG_ERROR("Failed to allocate tensor info storage (%zu bytes)", tensor_info_bytes);
-        return -1;
+    // Per-ring fanout adjacency (dep_pool entries) built by wire_task on the
+    // host. Each live entry [tail, top) carries a consumer slot_state pointer
+    // (into the SM) and a next pointer (into the arena). Empty when wiring is
+    // still drained on the device.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        PTO2DepListPool &dp = rt->scheduler.ring_sched_state.dep_pool;
+        if (dp.base == nullptr || dp.capacity == 0) {
+            continue;
+        }
+        for (int32_t i = dp.tail; i < dp.top; i++) {
+            PTO2DepListEntry &e = dp.base[i % dp.capacity];
+            reloc(e.slot_state);
+            reloc(e.next);
+        }
     }
 
-    int rc = api->copy_to_device(dev_tensor_info_storage, compact_tensor_info.data(), tensor_info_bytes);
-    if (rc != 0) {
-        LOG_ERROR("Failed to copy tensor info storage to device: %d", rc);
-        api->device_free(dev_tensor_info_storage);
-        return rc;
+    // Ready queues seeded by push_ready_routed on the host. Each populated slot
+    // [dequeue_pos, enqueue_pos) holds a slot_state pointer into the SM.
+    auto reloc_ready = [&](PTO2ReadyQueue &q) {
+        if (q.slots == nullptr) {
+            return;
+        }
+        uint64_t enq = q.enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t deq = q.dequeue_pos.load(std::memory_order_relaxed);
+        for (uint64_t pos = deq; pos < enq; pos++) {
+            reloc(q.slots[pos & q.mask].slot_state);
+        }
+    };
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        reloc_ready(rt->scheduler.ready_queues[i]);
     }
-
-    runtime->set_tensor_info_storage(dev_tensor_info_storage, tensor_info_bytes);
-    LOG_INFO_V0("Uploaded %zu tensor info entries (%zu bytes)", compact_tensor_info.size(), tensor_info_bytes);
-    return 0;
+    reloc_ready(rt->scheduler.dummy_ready_queue);
+    reloc_ready(rt->scheduler.early_dispatch_queue);
+
+    // Wiring queue (in the arena) — populated only while wiring is still drained
+    // on the device; its entries are slot_state pointers into the SM. Empty once
+    // wiring is folded into the host submit.
+    PTO2SpscQueue &wq = rt->scheduler.wiring.queue;
+    uint64_t head = wq.head_.load(std::memory_order_relaxed);
+    uint64_t tail = wq.tail_.load(std::memory_order_relaxed);
+    for (uint64_t pos = tail; pos < head; pos++) {
+        reloc(wq.buffer_[pos & wq.mask_]);
+    }
+    return ok;
 }
 
-int upload_tensor_allocation_storage(Runtime *runtime, const HostApi *api, const TensorAllocationBuilder &builder) {
-    runtime->clear_tensor_allocation_storage();
-    if (builder.allocations.empty()) {
-        return 0;
+int32_t run_host_orchestration(
+    Runtime *runtime, const HostApi *api, PTO2Runtime *rt, DeviceArena &host_arena,
+    const PTO2RuntimeArenaLayout &layout, void *device_sm, uint64_t sm_size, void *device_arena, void *gm_heap,
+    const uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH],
+    void *host_orch_func_ptr, const L2TaskArgs &orch_l2
+) {
+    std::vector<uint8_t> host_sm_buf(sm_size, 0);
+    void *host_sm = host_sm_buf.data();
+
+    // Re-point the orchestrator half at the host SM (scheduler keeps device SM).
+    // init_data_from_layout resets the orchestrator state, so this is safe.
+    if (!rt->orchestrator.init_data_from_layout(
+            layout.orch, host_arena, host_sm, gm_heap, eff_heap_sizes[0], eff_task_window_sizes[0]
+        )) {
+        LOG_ERROR("host-orch: orchestrator re-init against host SM failed");
+        return -1;
     }
+    rt->orchestrator.wire_arena_pointers(layout.orch, host_arena, &rt->scheduler);
 
-    size_t allocation_bytes = builder.allocations.size() * sizeof(TensorAllocationInfo);
-    void *dev_allocation_storage = api->device_malloc(allocation_bytes);
-    if (dev_allocation_storage == nullptr) {
-        LOG_ERROR("Failed to allocate tensor allocation storage (%zu bytes)", allocation_bytes);
+    // Initialize the host SM header (ring flow control) so submit_task can run.
+    PTO2SharedMemoryHandle host_sm_handle;
+    if (!host_sm_handle.init_per_ring(host_sm, sm_size, eff_task_window_sizes, eff_heap_sizes)) {
+        LOG_ERROR("host-orch: host SM init_per_ring failed");
         return -1;
     }
 
-    int rc = api->copy_to_device(dev_allocation_storage, builder.allocations.data(), allocation_bytes);
-    if (rc != 0) {
-        LOG_ERROR("Failed to copy tensor allocation storage to device: %d", rc);
-        api->device_free(dev_allocation_storage);
-        return rc;
+    // Install the ops table (host s_runtime_ops). The SPMD core counts are
+    // re-applied with the real device values on the AICPU at boot; the values
+    // here only feed cluster spreading during this host submit and are unused
+    // by the migrated non-cluster examples.
+    runtime_finalize_after_wire(rt, /*aic*/ 24, /*aiv*/ 48);
+    rt->mode = PTO2_MODE_EXECUTE;
+    // get_tensor_data/set_tensor_data dereference buffer.addr directly: the
+    // input tensors were SVM-mapped into host address space at staging time
+    // (svm_register_via_runner), so the host orchestrator can read control
+    // tensors (e.g. paged_attention's context_lens/block_table) in place.
+
+    // Bind both framework_current_runtime instances: the host library's (used by
+    // rt_scope_* / rt_orchestration_done) and the orch .so's own copy (used by
+    // its inline rt_submit_* -> current_runtime()).
+    const HostOrchEntryPoints *eps = reinterpret_cast<const HostOrchEntryPoints *>(host_orch_func_ptr);
+    framework_bind_runtime(rt);
+    if (eps->bind != nullptr) {
+        eps->bind(rt);
+    } else {
+        LOG_ERROR("host-orch: orch .so framework_bind_runtime was not resolved");
+        return -1;
     }
 
-    runtime->set_tensor_allocation_storage(
-        dev_allocation_storage, static_cast<uint32_t>(builder.allocations.size()), allocation_bytes
-    );
-    LOG_INFO_V0("Uploaded %zu tensor allocation ranges (%zu bytes)", builder.allocations.size(), allocation_bytes);
-    return 0;
+    rt_scope_begin(rt);
+    eps->entry(orch_l2);
+    rt_scope_end(rt);
+    rt_orchestration_done(rt);
+
+    int32_t total_tasks = pto2_sm_layout::ring_current_task_index_addr(host_sm)->load(std::memory_order_acquire);
+
+    // Relocate the host-DDR cross-task pointers to their final DEVICE addresses
+    // on the host, before the SM and arena leave for the device. Pointers into
+    // the SM shift by sm_delta; pointers into the arena (fanout adjacency, wiring
+    // queue) shift by arena_delta. After this both the SM and arena carry device
+    // addresses, so the device boots scheduler-only.
+    const int64_t sm_delta = static_cast<int64_t>(reinterpret_cast<uint64_t>(device_sm)) -
+                             static_cast<int64_t>(reinterpret_cast<uint64_t>(host_sm));
+    const int64_t arena_delta = static_cast<int64_t>(reinterpret_cast<uint64_t>(device_arena)) -
+                                static_cast<int64_t>(reinterpret_cast<uint64_t>(host_arena.base()));
+    if (!relocate_host_orch_image(
+            host_sm_handle, rt, reinterpret_cast<uint64_t>(host_sm), sm_size, sm_delta,
+            reinterpret_cast<uint64_t>(host_arena.base()), layout.arena_size, arena_delta
+        )) {
+        LOG_ERROR("host-orch: relocation failed; refusing to H2D an image with unrelocated host pointers");
+        return -1;
+    }
+
+    if (api->copy_to_device(device_sm, host_sm, sm_size) != 0) {
+        LOG_ERROR("host-orch: H2D of populated SM failed");
+        return -1;
+    }
+    return total_tasks;
 }
 
 }  // namespace
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 /**
- * Stage the per-callable resources for the host_build_graph variant: upload
- * kernel binaries and dlopen the orchestration SO on the host. The dlopen
- * handle and resolved entry-symbol pointer are returned via
- * CallableArtifacts so the platform layer can hoist them into its
- * CallableState. Splitting this out of init_runtime_impl is what
- * the hbg simpler_register_callable / simpler_run path rests on — the dlopen runs
- * once per cid instead of every run.
+ * Stage the per-callable resources (kernel binaries + orchestration SO) and
+ * report them through `out` so a subsequent bind_callable_to_runtime_impl can
+ * use them. This is the cacheable half of init_runtime_impl: nothing here
+ * depends on per-run argument values, so the prepare_callable / run_prepared
+ * split lets us run this once per callable_id and amortize across runs.
+ *
+ * @param callable  ChipCallable carrying the orch SO + child kernel binaries
+ * @param out       Receives the orch SO bytes, names, dlopen handle and entry points
+ * @return 0 on success, -1 on failure
  */
-int register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const void *), CallableArtifacts *out) {
+extern "C" int
+register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const void *), CallableArtifacts *out) {
     if (callable == nullptr) {
         LOG_ERROR("Callable pointer is null");
         return -1;
@@ -330,55 +621,85 @@ int register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(c
 
     const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
     size_t orch_so_size = callable->binary_size();
-    const char *orch_func_name = callable->func_name();
 
-    if (orch_so_binary == nullptr || orch_so_size == 0 || orch_func_name[0] == '\0') {
-        LOG_ERROR("Invalid orchestration parameters");
+    if (orch_so_binary == nullptr || orch_so_size == 0) {
+        LOG_ERROR("Orchestration SO binary is required for host orchestration");
         return -1;
     }
 
-    // Load orchestration SO from binary data via temp file. Held open across
-    // the lifetime of the prepared callable; closed by
-    // DeviceRunner::unregister_callable.
-    std::string fd_path;
-    if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) {
-        LOG_ERROR("Failed to create temp SO file");
-        return -1;
-    }
-
-    void *handle = dlopen(fd_path.c_str(), RTLD_NOW | RTLD_LOCAL);
-    unlink(fd_path.c_str());
-    if (handle == nullptr) {
-        LOG_ERROR("dlopen failed: %s", dlerror());
-        return -1;
-    }
-
-    dlerror();
-    OrchestrationFunc orch_func = reinterpret_cast<OrchestrationFunc>(dlsym(handle, orch_func_name));
-    const char *dlsym_error = dlerror();
-    if (dlsym_error != nullptr) {
-        LOG_ERROR("dlsym failed for '%s': %s", orch_func_name, dlsym_error);
-        dlclose(handle);
-        return -1;
+    out->orch_so_data = orch_so_binary;
+    out->orch_so_size = orch_so_size;
+    out->func_name = callable->func_name();
+    out->config_name = callable->config_name();
+
+    // host_build_graph host-orch: dlopen the orchestration .so ON THE HOST and
+    // resolve its entry symbol now. The handle is held across the prepared
+    // callable's lifetime (closed by DeviceRunner::unregister_callable via
+    // host_dlopen_handle); bind_callable_to_runtime_impl invokes the resolved
+    // entry per run. This is what makes the host-side dlopen observable
+    // (host_dlopen_count) while the AICPU never dlopens the orch .so.
+    {
+        const char *orch_func_name = callable->func_name();
+        if (orch_func_name == nullptr || orch_func_name[0] == '\0') {
+            LOG_ERROR("host-orch: orchestration function name is empty");
+            return -1;
+        }
+        std::string so_path;
+        if (!create_orch_so_tempfile(orch_so_binary, orch_so_size, &so_path)) {
+            LOG_ERROR("host-orch: failed to materialize orchestration .so");
+            return -1;
+        }
+        void *handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+        if (handle == nullptr) {
+            LOG_ERROR("host-orch: dlopen failed: %s", dlerror());
+            return -1;
+        }
+        void *entry = dlsym(handle, orch_func_name);
+        if (entry == nullptr) {
+            LOG_ERROR("host-orch: dlsym('%s') failed: %s", orch_func_name, dlerror());
+            dlclose(handle);
+            return -1;
+        }
+        // The orch .so has its own framework_bind_runtime / g_current_runtime
+        // (orchestration/common.cpp is compiled into it); resolve it now so the
+        // per-run bind can set it before the .so's inline rt_submit_* run.
+        void *bind_sym = dlsym(handle, "framework_bind_runtime");
+        if (bind_sym == nullptr) {
+            LOG_ERROR("host-orch: orch .so does not export framework_bind_runtime: %s", dlerror());
+            dlclose(handle);
+            return -1;
+        }
+        // Safe to unlink now: the handle keeps the .so mapped regardless of path.
+        unlink(so_path.c_str());
+        auto *eps = new HostOrchEntryPoints{};
+        eps->entry = reinterpret_cast<OrchestrationEntryFunc>(entry);
+        eps->bind = reinterpret_cast<OrchestrationBindFunc>(bind_sym);
+        out->host_dlopen_handle = handle;
+        out->host_orch_func_ptr = eps;
+        LOG_INFO_V0("host-orch: loaded orchestration entry '%s' on host", orch_func_name);
     }
-
-    LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name);
-
-    out->host_dlopen_handle = handle;
-    out->host_orch_func_ptr = reinterpret_cast<void *>(orch_func);
+    LOG_INFO_V0("Orchestration SO: %zu bytes staged", orch_so_size);
     return 0;
 }
 
 /**
- * Per-run binding for hbg: invoke the previously-resolved orchestration entry
- * point against the supplied args, then upload tensor info / allocation
- * storage. DeviceRunner::bind_callable_to_runtime passes `host_orch_func_ptr`
- * straight through from CallableState for this run's callable_id.
+ * Per-run binding: stage argument tensors on the device, acquire the pooled GM
+ * heap / PTO2 shared memory / runtime arena, run the orchestration entry on the
+ * host, and upload the populated SM + arena image. Assumes the callable-side
+ * state (kernel binaries, orch SO, entry points) was set up by register_callable_impl.
+ *
+ * Splitting this from register_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
+ *
+ * @param runtime    Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args  Separated tensor/scalar arguments for this run
+ * @return 0 on success, -1 on failure
  */
-int bind_callable_to_runtime_impl(
+extern "C" int bind_callable_to_runtime_impl(
     Runtime *runtime, const HostApi *api, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr,
-    const ArgDirection *signature, int sig_count, const uint64_t * /*ring_task_window*/, const uint64_t * /*ring_heap*/,
-    const uint64_t * /*ring_dep_pool*/
+    const ArgDirection *signature, int sig_count, const uint64_t *ring_task_window, const uint64_t *ring_heap,
+    const uint64_t *ring_dep_pool
 ) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
@@ -392,63 +713,249 @@ int bind_callable_to_runtime_impl(
         LOG_ERROR("orch_args pointer is null");
         return -1;
     }
-    OrchestrationFunc orch_func = reinterpret_cast<OrchestrationFunc>(host_orch_func_ptr);
-    if (orch_func == nullptr) {
-        LOG_ERROR("bind_callable_to_runtime_impl: host orch_func pointer is null");
+    // host_build_graph host-orch: register_callable_impl resolved the
+    // orchestration entry on the host and passed it here as host_orch_func_ptr;
+    // it is run below (after the arena is built) against a host SM mirror.
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, host orchestration mode", tensor_count, scalar_count);
+
+    int64_t t_total_start = _now_ms();
+
+    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    if (!resolve_ring_config(
+            ring_task_window, ring_heap, ring_dep_pool, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities
+        )) {
         return -1;
     }
+    const std::string task_window_log = format_ring_array(eff_task_window_sizes);
+    const std::string heap_log = format_ring_array(eff_heap_sizes);
+    const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities);
+    LOG_INFO_V0(
+        "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
+        dep_pool_log.c_str()
+    );
 
-    runtime->tensor_pairs_.clear();
+    // Build device args: copy from input, replace host tensor pointers with device pointers
+    ChipStorageTaskArgs device_args;
 
-    LOG_INFO_V0("=== Calling Orchestration Function ===");
-    LOG_DEBUG(
-        "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(),
-        orch_args->tensor_count(), orch_args->scalar_count()
-    );
+    int64_t t_args_start = _now_ms();
+    for (int i = 0; i < tensor_count; i++) {
+        Tensor t = orch_args->tensor(i);
 
-    TensorInfoBuilder tensor_info_builder;
-    TensorAllocationBuilder tensor_allocation_builder;
-    OrchestrationRuntimeImpl orchestration_runtime = {
-        &k_orchestration_runtime_ops, runtime, api, &tensor_info_builder, &tensor_allocation_builder
-    };
+        if (t.is_child_memory()) {
+            LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
+            device_args.add_tensor(t);
+            continue;
+        }
+
+        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.buffer.addr));
+        size_t size = static_cast<size_t>(t.nbytes());
+
+        void *dev_ptr = api->device_malloc(size);
+        if (dev_ptr == nullptr) {
+            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
+            return -1;
+        }
+
+        // Pure write-only OUTPUT buffers carry no meaningful host content, so
+        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
+        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
+        // rather than pooled-allocator garbage. INOUT (read-before-write)
+        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
+        // did not wire device_memset.
+        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
+        int rc;
+        if (is_pure_output && api->device_memset != nullptr) {
+            rc = api->device_memset(dev_ptr, 0, size);
+        } else {
+            rc = api->copy_to_device(dev_ptr, host_ptr, size);
+        }
+        if (rc != 0) {
+            LOG_ERROR("Failed to stage tensor %d to device", i);
+            api->device_free(dev_ptr);
+            return -1;
+        }
+        // Read-only INPUT tensors are never written by the kernel, so there is
+        // no point copying them back D2H at the end. Index the signature
+        // by the orch tensor index `i` (child_memory tensors are skipped above
+        // but do not consume a separate signature slot — scalars follow the
+        // tensor entries). Anything not provably IN keeps the safe default of
+        // copying back.
+        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
+
+        // host_build_graph runs the orchestrator on the host, which may read
+        // control tensors (e.g. paged_attention's context_lens/block_table) via
+        // get_tensor_data to shape the graph. Map this device buffer into the
+        // host address space so the host can dereference buffer.addr directly.
+        // Routed through the per-thread DeviceRunner (svm_register_via_runner),
+        // NOT the Runtime.host_api struct — keeps tensormap/a5 ABI untouched.
+        // Released in validate_runtime_impl before device_free.
+        //
+        // The host then reads/writes buffer.addr (== dev_ptr) directly, so this
+        // path REQUIRES an identity mapping (host VA == dev_ptr). a2a3
+        // halHostRegister(DEV_SVM_MAP_HOST) returns identity and sim is already a
+        // host pointer, but the HAL contract permits a non-identity VA — verify
+        // it here and fail the prepare rather than letting the host dereference a
+        // device address (segfault / silent corruption) on a future HAL.
+        void *host_va = svm_register_via_runner(dev_ptr, size);
+        if (host_va != nullptr && host_va != dev_ptr) {
+            LOG_ERROR(
+                "host-orch: SVM map returned non-identity host VA %p for dev_ptr %p; the host orchestrator "
+                "dereferences buffer.addr directly and assumes identity mapping",
+                host_va, dev_ptr
+            );
+            return -1;
+        }
+
+        t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
+        device_args.add_tensor(t);
+    }
+    for (int i = 0; i < scalar_count; i++) {
+        device_args.add_scalar(orch_args->scalar(i));
+    }
+    int64_t t_args_end = _now_ms();
+
+    // Read orchestrator-to-scheduler transition flag from environment
+    {
+        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
+        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
+            runtime->orch_to_sched = true;
+        }
+        LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
+    }
+
+    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena all live in a single backing allocation;
+    // setup_static_arena reserves the three regions and commits in one shot.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
+    uint64_t total_heap_size = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (eff_heap_sizes[r] > std::numeric_limits<uint64_t>::max() - total_heap_size) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return -1;
+        }
+        total_heap_size += eff_heap_sizes[r];
+    }
+    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes);
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
+    int64_t t_setup_start = _now_ms();
+    if (api->setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
+        LOG_ERROR("Failed to setup pooled static arena");
+        return -1;
+    }
+    int64_t t_setup_end = _now_ms();
+
+    int64_t t_heap_start = _now_ms();
+    void *gm_heap = api->acquire_pooled_gm_heap();
+    int64_t t_heap_end = _now_ms();
+    if (gm_heap == nullptr) {
+        LOG_ERROR("Failed to acquire pooled GM heap");
+        return -1;
+    }
+    runtime->set_gm_heap(gm_heap);
+
+    int64_t t_sm_start = _now_ms();
+    void *sm_ptr = api->acquire_pooled_gm_sm();
+    int64_t t_sm_end = _now_ms();
+    if (sm_ptr == nullptr) {
+        LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
+        return -1;
+    }
+    runtime->set_gm_sm_ptr(sm_ptr);
+
+    void *runtime_arena_dev = api->acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
+    // Set up orchestration state (consumed by the host orchestrator below)
+    runtime->set_orch_args(device_args);
+
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // host_build_graph host-orch: run the orchestrator on the host now, against
+    // a host SM mirror, and ship the populated SM to the device. The arena
+    // (copied to the device below) carries the resulting orchestrator/scheduler
+    // state; the device boots scheduler-only. register_callable_impl guarantees
+    // host_orch_func_ptr is non-null on success (it fails the whole prepare
+    // otherwise), so this is an assertion-style guard, not a fallback path.
+    if (host_orch_func_ptr == nullptr) {
+        LOG_ERROR("host-orch: orchestration entry points were not resolved");
+        return -1;
+    }
+    {
+        L2TaskArgs orch_l2;
+        orch_l2.create_from_chip_args(device_args);
+        int32_t total_tasks = run_host_orchestration(
+            runtime, api, rt, host_arena, layout, sm_ptr, sm_size, runtime_arena_dev, gm_heap, eff_heap_sizes,
+            eff_task_window_sizes, host_orch_func_ptr, orch_l2
+        );
+        if (total_tasks < 0) {
+            LOG_ERROR("host-orch: orchestration run failed");
+            return -1;
+        }
+        runtime->host_total_tasks = total_tasks;
+        LOG_INFO_V0("host-orch: submitted %d tasks on host", total_tasks);
+    }
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = api->copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
+    LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
+
+    int64_t t_total_end = _now_ms();
+    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
+    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
+    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
+    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
+    LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
-    // hbg orch runs on the host, so it may legitimately need to dereference
-    // entry-tensor host pointers (e.g. to drive per-block dispatch from a
-    // control tensor). Unlike TMARB, runtime_maker cannot pre-upload entry
-    // tensors here without breaking that pattern — the orch keeps ownership
-    // of H2D decisions and uses record_tensor_pair to register outputs for
-    // copy-back. signature is plumbed for future use but unused on this path.
-    (void)signature;
-    (void)sig_count;
-    int rc = orch_func(reinterpret_cast<OrchestrationRuntime *>(&orchestration_runtime), *orch_args);
-    if (rc != 0) {
-        LOG_ERROR("Orchestration function failed with code %d", rc);
-        free_tensor_allocations(api, tensor_allocation_builder);
-        runtime->tensor_pairs_.clear();
-        return rc;
-    }
-
-    rc = upload_tensor_allocation_storage(runtime, api, tensor_allocation_builder);
-    if (rc != 0) {
-        LOG_ERROR("Failed to upload tensor allocations: %d", rc);
-        free_tensor_allocations(api, tensor_allocation_builder);
-        runtime->tensor_pairs_.clear();
-        return rc;
-    }
-
-    rc = upload_tensor_info_storage(runtime, api, tensor_info_builder);
-    if (rc != 0) {
-        LOG_ERROR("Failed to upload tensor info storage: %d", rc);
-        if (runtime->get_tensor_allocation_storage() != nullptr) {
-            api->device_free(runtime->get_tensor_allocation_storage());
-            runtime->clear_tensor_allocation_storage();
-        }
-        free_tensor_allocations(api, tensor_allocation_builder);
-        runtime->tensor_pairs_.clear();
-        return rc;
-    }
-
-    LOG_INFO_V0("Runtime initialized. Ready for execution from Python.");
     return 0;
 }
 
@@ -463,7 +970,7 @@ int bind_callable_to_runtime_impl(
  * @param runtime  Pointer to Runtime
  * @return 0 on success, -1 on failure
  */
-int validate_runtime_impl(Runtime *runtime, const HostApi *api) {
+extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
         return -1;
@@ -481,55 +988,117 @@ int validate_runtime_impl(Runtime *runtime, const HostApi *api) {
     TensorPair *tensor_pairs = runtime->tensor_pairs_.data();
     int tensor_pair_count = static_cast<int>(runtime->tensor_pairs_.size());
 
-    for (int i = 0; i < tensor_pair_count; i++) {
-        const TensorPair &pair = tensor_pairs[i];
-        int copy_rc = api->copy_from_device(pair.host_ptr, pair.dev_ptr, pair.size);
-        if (copy_rc != 0) {
-            LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
-            rc = copy_rc;
-            // Continue with cleanup anyway
-        } else {
-            LOG_DEBUG("Tensor %d: %zu bytes copied to host", i, pair.size);
+    LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count);
+
+    // PTO2: graph output may be in packed buffer
+    uint64_t graph_out_ptr = 0;
+    uint64_t graph_out_size = 0;
+    bool skip_tensor_copy_back = false;
+    int32_t runtime_status = 0;
+    PTO2SharedMemoryHeader host_header;
+    memset(&host_header, 0, sizeof(host_header));
+
+    runtime_status = pto2_read_runtime_status(runtime, api, &host_header);
+    if (runtime_status != 0) {
+        int32_t orch_error_code = host_header.orch_error_code.load(std::memory_order_relaxed);
+        int32_t sched_error_code = host_header.sched_error_code.load(std::memory_order_relaxed);
+        LOG_ERROR(
+            "PTO2 runtime failed: orch_error_code=%d sched_error_code=%d runtime_status=%d", orch_error_code,
+            sched_error_code, runtime_status
+        );
+        skip_tensor_copy_back = true;
+    } else {
+        graph_out_ptr = host_header.graph_output_ptr;
+        graph_out_size = host_header.graph_output_size;
+        if (graph_out_ptr != 0) {
+            LOG_INFO_V0("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size);
         }
     }
 
-    // Note: print_handshake_results() is called in DeviceRunner::run()
+    if (skip_tensor_copy_back) {
+        LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status");
+    } else {
+        bool first_output_tensor = true;
+        for (int i = 0; i < tensor_pair_count; i++) {
+            const TensorPair &pair = tensor_pairs[i];
+
+            // Skip if device pointer is null
+            if (pair.dev_ptr == nullptr) {
+                LOG_WARN("Tensor %d has null device pointer, skipping", i);
+                continue;
+            }
+
+            // If host pointer is null, this is a device-only allocation (no copy-back)
+            if (pair.host_ptr == nullptr) {
+                LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i);
+                continue;
+            }
+
+            // Read-only INPUT tensors were uploaded H2D but never written by the kernel, so copying
+            // them back (potentially ~GB) is pure waste. They are still freed in the cleanup loop below.
+            if (!pair.needs_copy_back) {
+                LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i);
+                continue;
+            }
+
+            void *src_ptr = pair.dev_ptr;
+            size_t copy_size = pair.size;
+
+            // Use graph_output_ptr for the first output tensor if available.
+            // NOTE(review): copy_size is not checked against pair.size here — confirm host_ptr capacity.
+            if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) {
+                src_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(graph_out_ptr));
+                copy_size = static_cast<size_t>(graph_out_size);
+                LOG_INFO_V0("Using packed output buffer for tensor %d", i);
+                first_output_tensor = false;
+            }
+
+            int copy_rc = api->copy_from_device(pair.host_ptr, src_ptr, copy_size);
+            if (copy_rc != 0) {
+                LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
+                rc = copy_rc;
+            } else {
+                // Report the byte count actually transferred (differs from pair.size on the packed path).
+                LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, copy_size);
+            }
+        }
+    }
 
     // Cleanup device tensors
     LOG_INFO_V0("=== Cleaning Up ===");
     for (int i = 0; i < tensor_pair_count; i++) {
-        api->device_free(tensor_pairs[i].dev_ptr);
+        if (tensor_pairs[i].dev_ptr != nullptr) {
+            // Release the SVM host mapping installed at staging time before
+            // freeing the device buffer (unregister-before-free, as the HAL
+            // requires). No-op on sim. Keyed by dev_ptr.
+            svm_unregister_via_runner(tensor_pairs[i].dev_ptr);
+            api->device_free(tensor_pairs[i].dev_ptr);
+        }
     }
-    LOG_INFO_V0("Freed %d device tensors", tensor_pair_count);
-
-    if (runtime->get_tensor_info_storage() != nullptr) {
-        api->device_free(runtime->get_tensor_info_storage());
-        runtime->clear_tensor_info_storage();
+    LOG_INFO_V0("Freed %d device allocations", tensor_pair_count);
+
+    // Clear the per-run dispatch-table entries staged by register_callable_impl.
+    // The underlying chip-callable device buffer is pool-managed by
+    // DeviceRunner (keyed by content hash) and bulk-freed in
+    // DeviceRunner::finalize(); re-running the same callable repeatedly
+    // should not re-upload.
+    int kernel_count = runtime->get_registered_kernel_count();
+    for (int i = 0; i < kernel_count; i++) {
+        int func_id = runtime->get_registered_kernel_func_id(i);
+        runtime->set_function_bin_addr(func_id, 0);
     }
-    if (runtime->get_tensor_allocation_storage() != nullptr) {
-        api->device_free(runtime->get_tensor_allocation_storage());
-        runtime->clear_tensor_allocation_storage();
+    if (kernel_count > 0) {
+        LOG_INFO_V0("Cleared %d kernel dispatch-table entries", kernel_count);
     }
+    runtime->clear_registered_kernels();
 
     // Clear tensor pairs
     runtime->tensor_pairs_.clear();
 
     LOG_INFO_V0("=== Finalize Complete ===");
 
-    return rc;
-}
-
-// host_build_graph resolves orchestration on the host, so it exports no AICPU
-// entries beyond the base {simpler_aicpu_exec, simpler_aicpu_init} — in
-// particular it does not export simpler_aicpu_register_callable. Reporting an
-// empty extra-symbol set keeps the common AICPU loader from looking for it.
-const char *const *runtime_extra_aicpu_symbols(size_t *count) {
-    if (count != nullptr) {
-        *count = 0;
+    if (rc == 0 && runtime_status != 0) {
+        rc = runtime_status;
     }
-    return nullptr;
-}
 
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
+    return rc;
+}
diff --git a/src/a2a3/runtime/host_build_graph/orchestration/common.cpp b/src/a2a3/runtime/host_build_graph/orchestration/common.cpp
new file mode 100644
index 000000000..c4878a1c2
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/orchestration/common.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "common.h"
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
+
+#include <array>
+#include <cstring>
+#include <vector>
+#endif
+
+struct PTO2Runtime;
+
+// Unified-log error sink. Forward-declared here rather than pulled via
+// common/unified_log.h: that header lives under common/log/include, which is
+// not on the orchestration .so build's include path. The symbol resolves at
+// link time for the runtime targets, and at dlopen time for the orchestration
+// .so (against the executor's unified_log_device), so onboard diagnostics still
+// reach the CANN device log.
+extern "C" void unified_log_error(const char *func, const char *fmt, ...);
+
+namespace {
+// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
+// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
+// between execution rounds.  All orchestrator threads bind the same rt
+// value, so per-thread storage is unnecessary.
+PTO2Runtime *g_current_runtime = nullptr;
+}  // namespace
+
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
+    g_current_runtime = rt;  // stored as-is; handed back by framework_current_runtime()
+}
+
+// Hidden visibility keeps framework_current_runtime local to this .so, so orchestration helpers
+// do not accidentally bind to the AICPU binary's same-named symbol.
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
+
+/**
+ * Resolve an address to "function at file:line" text by running addr2line on `executable`.
+ * -i expands inlined frames: the first output line (innermost location) is returned and the outer
+ * inline callers, if any, go to *inline_chain. Returns "" when nothing could be resolved.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
+    if (executable == nullptr) return "";
+    // Build the command in a std::string (no fixed-size truncation) and single-quote the path,
+    // escaping embedded quotes as '\'', so spaces or shell metacharacters cannot alter it.
+    char addr_text[32];
+    snprintf(addr_text, sizeof(addr_text), "%p", addr);
+    std::string cmd = "addr2line -e '";
+    for (const char *p = executable; *p != '\0'; ++p) {
+        cmd += (*p == '\'') ? std::string("'\\''") : std::string(1, *p);
+    }
+    cmd += std::string("' -f -C -p -i ") + addr_text + " 2>/dev/null";
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+    FILE *pipe = popen(cmd.c_str(), "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) return "";
+
+    // Split into non-empty lines, dropping any trailing '\r'.
+    std::vector<std::string> lines;
+    for (size_t pos = 0; pos < raw_output.size();) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r')
+            line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+    if (lines.empty()) return "";
+    // First line is the innermost actual code location; subsequent lines are outer inline callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+    return lines.front();
+}
+#endif
+
+/**
+ * Build a stack trace for the current call stack. Each frame is located with dladdr and resolved
+ * via addr2line (module-relative address first, then absolute); unresolved frames fall back to the
+ * demangled backtrace symbol. The first skip_frames frames are omitted; negative values act as 0.
+ */
+std::string get_stacktrace(int skip_frames) {
+    (void)skip_frames;  // May be unused on non-Linux platforms
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;
+    void *buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char **symbols = backtrace_symbols(buffer, nframes);
+    // A negative skip count would index before the start of buffer/symbols below.
+    const int first_frame = skip_frames > 0 ? skip_frames : 0;
+    if (symbols) {
+        result = "Stack trace:\n";
+        for (int i = first_frame; i < nframes; i++) {
+            std::string frame_info;
+            // Step back one byte so the lookup lands inside the call, not on the return address.
+            void *addr = (void *)((char *)buffer[i] - 1);
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
+                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+                if (addr2line_result.empty()) {
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+
+            if (frame_info.empty()) {
+                // Fallback: demangle the "module(symbol+offset)" text from backtrace_symbols.
+                std::string frame(symbols[i]);
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - first_frame);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);
+    }
+#else
+    result = "(Stack trace is only available on Linux)\n";
+#endif
+    return result;
+}
+
+// Compose the AssertionError what() text: failed condition, source location, then a stack trace.
+static std::string build_assert_message(const char *condition, const char *file, int line) {
+    std::string text = std::string("Assertion failed: ") + condition + "\n";
+    text.append("  Location: ").append(file).append(":").append(std::to_string(line)).append("\n");
+    text.append(get_stacktrace(3));
+    return text;
+}
+
+AssertionError::AssertionError(const char *condition, const char *file, int line) :
+    std::runtime_error(build_assert_message(condition, file, line)),
+    condition_(condition),
+    file_(file),
+    line_(line) {}
+
+[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
+    // Log the failed condition, its location and a stack trace, then throw AssertionError.
+    // unified_log_error is called directly rather than through the LOG_ERROR macro: that macro
+    // lives in pto_orchestration_api.h and expands to current_runtime()->ops->log_error, but the
+    // ops table's definition pulls in pto_types.h (Arg → __aicore__-only to_u64), which the
+    // AICore build of this TU cannot compile. unified_log_error reaches the same sink without
+    // that dependency.
+    unified_log_error(__FUNCTION__, "\n========================================");
+    unified_log_error(__FUNCTION__, "Assertion failed: %s", condition);
+    unified_log_error(__FUNCTION__, "Location: %s:%d", file, line);
+    unified_log_error(__FUNCTION__, "%s", get_stacktrace(2).c_str());
+    unified_log_error(__FUNCTION__, "========================================\n");
+
+    throw AssertionError(condition, file, line);
+}
diff --git a/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h
deleted file mode 100644
index 39f55c159..000000000
--- a/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Orchestration API for host_build_graph.
- *
- * Orchestration sources include only this header and interact with the runtime
- * through the function-pointer table embedded in OrchestrationRuntime.
- */
-
-#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_
-#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "common/core_type.h"
-#include "task_args.h"
-#include "tensor_info.h"
-
-typedef struct OrchestrationRuntime OrchestrationRuntime;
-
-typedef struct OrchestrationRuntimeOps {
-    int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type);
-    int (*set_tensor_info_to_task)(
-        OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count
-    );
-    void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task);
-    // Host-build-graph orch owns its entry-tensor H2D (the orch runs on host
-    // and may need host-side pointers for control tensors). This call
-    // registers a host<->device mapping so runtime_maker's validate path can
-    // D2H copy-back and free at finalize. TMARB has no equivalent — its
-    // runtime_maker handles tensor uploads directly because its orch is
-    // device-side.
-    void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size);
-    int (*get_task_count)(OrchestrationRuntime *runtime);
-    void (*print_runtime)(OrchestrationRuntime *runtime);
-
-    void *(*device_malloc)(OrchestrationRuntime *runtime, size_t size);
-    void (*device_free)(OrchestrationRuntime *runtime, void *ptr);
-    int (*copy_to_device)(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size);
-} OrchestrationRuntimeOps;
-
-struct OrchestrationRuntime {
-    const OrchestrationRuntimeOps *ops;
-};
-
-static inline int
-add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) {
-    return runtime->ops->add_task(runtime, args, num_args, func_id, core_type);
-}
-
-static inline int
-set_tensor_info_to_task(OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count) {
-    return runtime->ops->set_tensor_info_to_task(runtime, task_id, tensor_info, tensor_count);
-}
-
-static inline int add_task_with_tensor_info(
-    OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type,
-    const TensorInfo *tensor_info, int tensor_count
-) {
-    int task_id = add_task(runtime, args, num_args, func_id, core_type);
-    if (task_id < 0) {
-        return task_id;
-    }
-    if (set_tensor_info_to_task(runtime, task_id, tensor_info, tensor_count) != 0) {
-        return -1;
-    }
-    return task_id;
-}
-
-static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) {
-    runtime->ops->add_successor(runtime, from_task, to_task);
-}
-
-static inline void record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) {
-    runtime->ops->record_tensor_pair(runtime, host_ptr, dev_ptr, size);
-}
-
-static inline int get_task_count(OrchestrationRuntime *runtime) { return runtime->ops->get_task_count(runtime); }
-
-static inline void print_runtime(OrchestrationRuntime *runtime) { runtime->ops->print_runtime(runtime); }
-
-static inline void *device_malloc(OrchestrationRuntime *runtime, size_t size) {
-    return runtime->ops->device_malloc(runtime, size);
-}
-
-static inline void device_free(OrchestrationRuntime *runtime, void *ptr) { runtime->ops->device_free(runtime, ptr); }
-
-static inline int copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) {
-    return runtime->ops->copy_to_device(runtime, dev_ptr, host_ptr, size);
-}
-
-typedef int (*OrchestrationFunc)(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args);
-
-#endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_
diff --git a/src/a2a3/runtime/host_build_graph/orchestration/pto_arg_with_deps.h b/src/a2a3/runtime/host_build_graph/orchestration/pto_arg_with_deps.h
new file mode 100644
index 000000000..863bed92d
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/orchestration/pto_arg_with_deps.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Convenience layer over L0TaskArgs: bundles a fixed-capacity dependency
+ * buffer with an L0TaskArgs and exposes an incremental add_dep(...) API on top
+ * of the runtime primitive L0TaskArgs::set_dependencies(ptr, count).
+ *
+ * Layering:
+ *   - Primitive:   L0TaskArgs + set_dependencies(ptr, count) in pto_types.h.
+ *                  No cap, caller owns the deps buffer.
+ *   - Convenience: L0TaskArgsWithDeps<N> in this header. Owns a fixed-size dep
+ *                  buffer of capacity N (default 16); provides add_dep().
+ *                  Submitted via the rt_submit_* overloads below, which
+ *                  forward the bundled deps into the underlying L0TaskArgs.
+ *
+ * This file is auto-included at the bottom of pto_orchestration_api.h so
+ * orchestration sources see L0TaskArgsWithDeps after a single `#include
+ * "pto_orchestration_api.h"`. The split is purely organizational —
+ * orchestration code should not include this header directly. Code generated
+ * from pypto can ignore the convenience layer entirely and target L0TaskArgs +
+ * set_dependencies(ptr, count) directly.
+ *
+ * L0TaskArgsWithDeps inherits privately from L0TaskArgs so that set_dependencies
+ * and the explicit_dep* accessors are NOT reachable on a wrapper instance —
+ * users who pick the convenience layer cannot accidentally mix it with the
+ * primitive layer's dep API on the same object.
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "pto_orchestration_api.h"  // Arg, MixedKernels, rt_submit_* primitives
+
+template <size_t MAX_DEP_COUNT = 16>
+class L0TaskArgsWithDeps : private L0TaskArgs {
+public:
+    // Tensor / scalar setters — forward to L0TaskArgs
+    using L0TaskArgs::add_inout;
+    using L0TaskArgs::add_input;
+    using L0TaskArgs::add_no_dep;
+    using L0TaskArgs::add_output;
+    using L0TaskArgs::add_scalar;
+    using L0TaskArgs::add_scalars;
+    using L0TaskArgs::add_scalars_i32;
+    using L0TaskArgs::allow_early_resolve;  // speculative early-dispatch hint (getter)
+    using L0TaskArgs::copy_scalars_from;
+    using L0TaskArgs::set_allow_early_resolve;  // speculative early-dispatch hint (setter)
+
+    // Error / status — forward to L0TaskArgs
+    using L0TaskArgs::error_msg;
+    using L0TaskArgs::has_error;
+    using L0TaskArgs::launch_spec;
+    using L0TaskArgs::set_error;
+
+    // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep,
+    // explicit_deps_data — these are the primitive-layer dep API. Users of
+    // the convenience layer reach dependencies only through add_dep() below.
+
+    /**
+     * Append one or more dependencies to the bundled buffer. May be called
+     * multiple times; deps accumulate. Variadic accepts any non-zero number
+     * of PTO2TaskId arguments (both constraints are enforced by static_assert).
+     *
+     * Overflow (more than MAX_DEP_COUNT total) appends none of that call's ids and records an error
+     * on the underlying L0TaskArgs; the error surfaces at submit time.
+     */
+    template <typename... Ids>
+    void add_dep(Ids... ids) {
+        static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required");
+        static_assert(
+            (std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId"
+        );
+        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) {
+            L0TaskArgs::set_error(
+                "L0TaskArgsWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)"
+            );
+            return;
+        }
+        ((deps_[count_++] = ids), ...);
+    }
+
+    /**
+     * Clear the bundled dep buffer and reset the underlying L0TaskArgs.
+     * Use this to recycle an L0TaskArgsWithDeps across loop iterations.
+     */
+    void reset() {
+        L0TaskArgs::reset();
+        count_ = 0;
+    }
+
+    /**
+     * Submit-only hook: bind the bundled deps onto the underlying L0TaskArgs and
+     * return it as L0TaskArgs&. Called by the rt_submit_* overloads below;
+     * orchestration code does not invoke this directly.
+     *
+     * Idempotent: explicitly clears any prior dep binding before re-setting,
+     * so a wrapper can be re-finalized (e.g. resubmitted) without tripping
+     * the primitive layer's single-shot check.
+     */
+    L0TaskArgs &finalize_for_submit() {
+        L0TaskArgs::set_dependencies(nullptr, 0);
+        L0TaskArgs::set_dependencies(deps_, count_);
+        return *this;
+    }
+
+private:
+    PTO2TaskId deps_[MAX_DEP_COUNT];  // bundled dependency ids; entries [0, count_) are in use
+    uint32_t count_ = 0;              // number of ids appended via add_dep() since the last reset()
+};
+
+// =============================================================================
+// Submit overloads — accept L0TaskArgsWithDeps<N>; each forwards awd.finalize_for_submit() onward
+// =============================================================================
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_task(mixed_kernels, awd.finalize_for_submit());
+}
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_aic_task(kernel_id, awd.finalize_for_submit());
+}
+
+template <size_t N>
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, L0TaskArgsWithDeps<N> &awd) {
+    return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit());
+}
diff --git a/src/a2a3/runtime/host_build_graph/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/host_build_graph/orchestration/pto_orchestration_api.h
new file mode 100644
index 000000000..b07c94926
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/orchestration/pto_orchestration_api.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Orchestration API - Slim header for orchestration .so files
+ *
+ * This header provides everything an orchestration source needs without
+ * pulling in runtime implementation headers.  The orchestration .so has
+ * zero link dependencies on runtime .cpp files; all runtime calls go
+ * through the PTO2RuntimeOps function-pointer table embedded in
+ * PTO2Runtime.
+ *
+ * Orchestration sources include ONLY this header:
+ *   #include "pto_orchestration_api.h"
+ *
+ * Runtime sources continue to use pto_runtime2.h (which defines the
+ * full PTO2Runtime struct with all internal fields).
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+// Type headers needed by orchestration
+#include "common.h"              // framework_bind_runtime / framework_current_runtime
+#include "pto_runtime2_types.h"  // PTO2_ERROR_*
+#include "pto_submit_types.h"    // MixedKernels, INVALID_KERNEL_ID, subtask slots
+#include "pto_types.h"           // Arg, TaskOutputTensors, TensorArgType
+#include "task_args.h"           // ChipStorageTaskArgs, Tensor
+#include "tensor.h"              // Tensor, TensorCreateInfo
+
+// =============================================================================
+// Tensor Factory Helpers
+// =============================================================================
+
+// make_tensor_external(...) — canonical factory for pre-allocated external
+// memory — is defined in the unified tensor.h (common), so host and runtime
+// build Tensors through the same controlled path.
+
+// =============================================================================
+// Ops Table and Opaque Runtime
+// =============================================================================
+
+/**
+ * Forward declaration — the orchestration sees PTO2Runtime as a partial
+ * struct whose first field is the ops pointer.  The full definition
+ * lives in pto_runtime2.h (used only by runtime .cpp files).
+ */
+typedef struct PTO2Runtime PTO2Runtime;
+
+/**
+ * Function-pointer table for runtime operations.
+ * Populated by the runtime; called by orchestration through inline wrappers.
+ */
+typedef struct PTO2RuntimeOps {
+    TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    void (*scope_begin)(PTO2Runtime *rt);  // wrappers store rt->pending_scope_mode just before calling this
+    void (*scope_end)(PTO2Runtime *rt);
+    void (*orchestration_done)(PTO2Runtime *rt);  // rt_orchestration_done() forwards without an is_fatal check
+    bool (*is_fatal)(PTO2Runtime *rt);            // gate the inline wrappers use to turn calls into no-ops
+    void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+    // Logging (populated by runtime, called by orchestration)
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+
+    // Cross-layer data access (orchestration reads/writes tensor values via runtime)
+    // Placed after logging to avoid shifting hot-path field offsets.
+    uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+    void (*set_tensor_data)(
+        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
+    );
+    TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
+    TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
+
+    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
+    // collector can log it. Always present to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
+} PTO2RuntimeOps;
+
+/**
+ * Partial PTO2Runtime definition for orchestration.
+ *
+ * Exposes the ops pointer (for runtime calls) and pending_scope_mode
+ * (written directly by inline scope wrappers).  The real struct (in
+ * pto_runtime2.h) has the same first fields, so accessing them through
+ * this definition is well-defined (C struct layout guarantee).
+ */
+struct PTO2Runtime {
+    const PTO2RuntimeOps *ops;         // dispatch table every inline wrapper calls through
+    PTO2ScopeMode pending_scope_mode;  // set by rt_scope_begin / PTO2ScopeGuard right before scope_begin
+};
+
+// =============================================================================
+// Inline Convenience Wrappers (call through ops table)
+// =============================================================================
+
+static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); }
+
+static inline TaskOutputTensors alloc_tensors(const L0TaskArgs &args) {
+    PTO2Runtime *const runtime = current_runtime();
+    // Once the runtime reports a fatal state, hand back an empty result and skip the
+    // ops-table allocation call altogether.
+    if (runtime->ops->is_fatal(runtime)) return TaskOutputTensors{};
+    return runtime->ops->alloc_tensors(runtime, args);
+}
+
+static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) {
+    PTO2Runtime *const runtime = current_runtime();
+    // Nothing is staged or allocated once the runtime reports a fatal state.
+    if (runtime->ops->is_fatal(runtime)) return TaskOutputTensors{};
+    L0TaskArgs output_args;
+    const TensorCreateInfo *const end = create_infos + count;
+    for (const TensorCreateInfo *ci = create_infos; ci != end; ++ci) {
+        output_args.add_output(*ci);
+    }
+    if (!output_args.has_error) {
+        return alloc_tensors(output_args);
+    }
+    // Prefer the Arg's own diagnostic; fall back to a generic message when it has none.
+    const char *const detail =
+        output_args.error_msg ? output_args.error_msg : "alloc_tensors failed to construct output-only Arg";
+    runtime->ops->report_fatal(runtime, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", detail);
+    return TaskOutputTensors{};
+}
+
+template <typename... CIs>
+static inline TaskOutputTensors alloc_tensors(const CIs &...cis) {
+    static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo");
+    static_assert(
+        (std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...),
+        "alloc_tensors only accepts TensorCreateInfo arguments"
+    );
+    PTO2Runtime *const runtime = current_runtime();
+    // Nothing is staged or allocated once the runtime reports a fatal state.
+    if (runtime->ops->is_fatal(runtime)) return TaskOutputTensors{};
+    L0TaskArgs output_args;
+    (output_args.add_output(cis), ...);  // comma fold: one add_output per argument, left to right
+    if (!output_args.has_error) {
+        // Hand off to the L0TaskArgs overload for the actual ops-table call.
+        return alloc_tensors(output_args);
+    }
+    // Prefer the Arg's own diagnostic; fall back to a generic message when it has none.
+    const char *const detail =
+        output_args.error_msg ? output_args.error_msg : "alloc_tensors failed to construct output-only Arg";
+    runtime->ops->report_fatal(runtime, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", detail);
+    return TaskOutputTensors{};
+}
+
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    PTO2Runtime *const runtime = current_runtime();
+    // Submissions are dropped once the runtime reports a fatal state: the caller gets an
+    // empty TaskOutputTensors and the ops table is not invoked.
+    if (runtime->ops->is_fatal(runtime)) return TaskOutputTensors{};
+    return runtime->ops->submit_task(runtime, mixed_kernels, args);
+}
+
+/**
+ * Convenience wrapper: submit an AIC-only task.
+ */
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const L0TaskArgs &args) {
+    MixedKernels aic_only;  // every other kernel slot keeps its default-constructed value
+    aic_only.aic_kernel_id = kernel_id;
+    return rt_submit_task(aic_only, args);
+}
+
+/**
+ * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
+ */
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const L0TaskArgs &args) {
+    MixedKernels aiv0_only;  // every other kernel slot keeps its default-constructed value
+    aiv0_only.aiv0_kernel_id = kernel_id;
+    return rt_submit_task(aiv0_only, args);
+}
+
+/**
+ * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task
+ * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any
+ * AICore kernel. The task still participates in the dependency graph: it
+ * waits on its fanin and notifies its fanout. Useful as a synchronization
+ * barrier or as a placeholder producer for tests / dep-graph wiring.
+ */
+static inline TaskOutputTensors rt_submit_dummy_task(const L0TaskArgs &args) {
+    PTO2Runtime *const runtime = current_runtime();
+    // Submissions are dropped once the runtime reports a fatal state: the caller gets an
+    // empty TaskOutputTensors and the ops table is not invoked.
+    if (runtime->ops->is_fatal(runtime)) return TaskOutputTensors{};
+    return runtime->ops->submit_dummy_task(runtime, args);
+}
+
+static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) {
+    PTO2Runtime *const runtime = current_runtime();
+    // Skipped entirely once the runtime reports a fatal state; otherwise stash the mode first.
+    if (!runtime->ops->is_fatal(runtime)) {
+        runtime->pending_scope_mode = mode;
+        runtime->ops->scope_begin(runtime);
+    }
+}
+
+static inline void rt_scope_end() {
+    PTO2Runtime *const runtime = current_runtime();
+    // Closing a scope is skipped once the runtime reports a fatal state.
+    if (!runtime->ops->is_fatal(runtime)) {
+        runtime->ops->scope_end(runtime);
+    }
+}
+
+static inline void rt_orchestration_done() {
+    PTO2Runtime *const runtime = current_runtime();
+    runtime->ops->orchestration_done(runtime);  // forwarded without an is_fatal check
+}
+
+static inline bool rt_is_fatal() {
+    PTO2Runtime *const runtime = current_runtime();
+    return runtime->ops->is_fatal(runtime);  // plain forward of the ops-table query
+}
+
+#define rt_report_fatal(code, fmt, ...)                                          \
+    do {                                                                         \
+        PTO2Runtime *_rt = current_runtime();                                    \
+        _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \
+    } while (0)
+
+// =============================================================================
+// Logging Macros for Orchestration (call through ops table)
+// =============================================================================
+
+#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
+#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
+#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
+
+// INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default.
+#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
+
+// =============================================================================
+// Cross-Layer Data Access
+// =============================================================================
+
+/**
+ * Read a value from a tensor at the given multi-dimensional indices.
+ *
+ * Default T = uint64_t preserves old behavior (raw bits).
+ * Specify T to get automatic type conversion:
+ *
+ *   uint64_t raw = get_tensor_data(tensor, 1, idx);       // old usage unchanged
+ *   float val = get_tensor_data<float>(tensor, 1, idx);   // typed read
+ *
+ * If the tensor has a producer in TensorMap, spin-waits until the producer
+ * task completes before reading. External tensors (make_tensor_external)
+ * are read immediately without waiting.
+ */
+template <typename T = uint64_t>
+static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+    PTO2Runtime *const runtime = current_runtime();
+    // After a fatal error the runtime is not consulted; the result is the conversion of 0.
+    const uint64_t raw_bits =
+        runtime->ops->is_fatal(runtime) ? 0 : runtime->ops->get_tensor_data(runtime, tensor, ndims, indices);
+    return from_u64<T>(raw_bits);
+}
+
+/**
+ * Write a value to a tensor at the given multi-dimensional indices.
+ *
+ * Type is deduced from value argument; uint64_t by default:
+ *
+ *   set_tensor_data(tensor, 1, idx, raw_u64);     // old usage unchanged
+ *   set_tensor_data(tensor, 1, idx, 42.0f);       // typed write (T = float)
+ *
+ * If the tensor has a producer in TensorMap, spin-waits until the producer
+ * and all its consumers complete before writing (WAW + WAR safety).
+ * External tensors (make_tensor_external) with no TensorMap entry are
+ * written immediately without waiting.
+ *
+ * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers
+ * that used the tensor as INPUT. If a kernel reads this tensor as INPUT
+ * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data
+ * cannot detect the reader and may cause a data race.
+ *
+ * To ensure WAR safety for all access patterns, use add_inout() instead of
+ * add_input() for kernel parameters that may later be written via
+ * set_tensor_data. INOUT creates a TensorMap entry that enables automatic
+ * consumer tracking via fanout_refcount.
+ *
+ * The tensor must already have an allocated buffer (addr != 0).
+ * For runtime-created outputs, call this only on the Tensor returned by
+ * add_output(TensorCreateInfo) after submit returns.
+ */
+template <typename T = uint64_t>
+static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) {
+    PTO2Runtime *const runtime = current_runtime();
+    // The write is dropped once the runtime reports a fatal state; value is converted only on the live path.
+    if (!runtime->ops->is_fatal(runtime)) {
+        runtime->ops->set_tensor_data(runtime, tensor, ndims, indices, to_u64(value));
+    }
+}
+
+// =============================================================================
+// C++ Scope Guards and Macros
+// =============================================================================
+
+/**
+ * RAII scope guard (calls through ops table). Non-copyable: a copy would issue a second scope_end.
+ */
+class PTO2ScopeGuard {
+public:
+    explicit PTO2ScopeGuard(
+        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
+    ) :
+        rt_(current_runtime()) {
+        if (!rt_->ops->is_fatal(rt_)) {
+            rt_->pending_scope_mode = mode;
+            if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);  // hook may be nullptr
+            rt_->ops->scope_begin(rt_);
+        }
+    }
+    PTO2ScopeGuard(const PTO2ScopeGuard &) = delete;             // one guard == one scope_begin/scope_end pair
+    PTO2ScopeGuard &operator=(const PTO2ScopeGuard &) = delete;  // (also suppresses the implicit moves)
+    ~PTO2ScopeGuard() {
+        if (!rt_->ops->is_fatal(rt_)) rt_->ops->scope_end(rt_);  // skipped after a fatal error, like the ctor
+    }
+
+private:
+    PTO2Runtime *rt_;
+};
+
+#define _PTO2_CONCATENATE_IMPL(x, y) x##y
+#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y)
+
+#define PTO2_SCOPE_GUARD() [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)
+
+/**
+ * Scoped block macro:
+ *   PTO2_SCOPE() {
+ *       rt_submit_task(...);
+ *   }
+ */
+#define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true)
+
+// =============================================================================
+// Orchestration Config
+// =============================================================================
+
+/**
+ * Configuration exported by orchestration .so via aicpu_orchestration_config().
+ * The executor reads these values to set up shared memory and runtime.
+ *
+ * This struct is defined identically in pto_runtime2.h (behind the same
+ * definition guard macro) so the executor can use the same type without including this header.
+ */
+#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
+#define PTO2_ORCHESTRATION_CONFIG_DEFINED
+struct PTO2OrchestrationConfig {
+    int expected_arg_count;  // NOTE(review): presumably the arg count the executor validates — confirm
+};
+#endif
+
+// Convenience layer (L0TaskArgsWithDeps<N> + matching rt_submit_*_task overloads).
+// Pulled in at the bottom so the wrapper sees L0TaskArgs, MixedKernels, and the
+// rt_submit_*_task primitives defined above. Orchestration sources include
+// only this single header to access both the primitive and convenience APIs.
+#include "pto_arg_with_deps.h"  // NOLINT(build/include_subdir)
diff --git a/src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox.h b/src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox.h
new file mode 100644
index 000000000..0f73a043a
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
+
+#include <atomic>
+#include <cstdint>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_constants.h"
+#include "pto_task_id.h"
+
+// AICPU-only MPSC ring used to convey deferred-completion observations from
+// FIN-handling scheduler threads to the dispatch thread. Producers push under
+// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList::
+// busy) drains in seq order. Kernel-side code never touches this struct —
+// AICore writes go into DeferredCompletionSlab (see
+// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens
+// into messages here, and forwards.
+
+#define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u
+#define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)
+
+static_assert(
+    (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0,
+    "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two"
+);
+
+// Mailbox message discriminator. CONDITION carries one deferred-completion
+// observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE
+// carries the slot_state pointer in `addr` so the consumer can finalize the
+// AsyncWaitEntry.slot_state binding for tasks whose conditions arrived
+// before the FIN thread saw task_complete. New kinds may be added in future
+// without growing the message — the `_pad[5]` slack is reserved for
+// kind-specific payload extension.
+#define MSG_KIND_CONDITION 0u
+#define MSG_KIND_TASK_NORMAL_DONE 1u
+
+struct AICoreCompletionMailboxMessage {
+    // Per-slot ready flag. Producer publishes its claimed ticket + 1 (`new_head`) with a release
+    // store after filling the rest of the slot; consumer checks for `tail + 1` with an acquire
+    // load. The release-acquire pair publishes all other fields below as a side effect, so they
+    // stay plain.
+    std::atomic<uint64_t> seq;
+    PTO2TaskId task_token;
+    // CONDITION: completion observation addr (counter / SDMA event record).
+    // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer
+    //   so it can finalize the AsyncWaitEntry.slot_state binding.
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint32_t kind;     // MSG_KIND_CONDITION or MSG_KIND_TASK_NORMAL_DONE
+    uint32_t _pad[5];  // unused slack; keeps the slot at PTO2_ALIGN_SIZE bytes
+};
+
+static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift");
+static_assert(
+    sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+    "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold"
+);
+static_assert(
+    std::atomic<uint64_t>::is_always_lock_free,
+    "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target"
+);
+
+// POD view of a drained message. `seq` is the ring's publication flag, not
+// payload, so try_pop copies out only the fields below (and seq is not even
+// copyable — it is a std::atomic).
+struct AICoreCompletionMsgView {
+    PTO2TaskId task_token{PTO2TaskId::invalid()};
+    uint64_t addr{0};  // CONDITION: observation addr; TASK_NORMAL_DONE: slot-state pointer
+    uint32_t expected_value{0};
+    uint32_t engine{0};
+    int32_t completion_type{0};
+    uint32_t kind{0};  // MSG_KIND_*; the default 0 equals MSG_KIND_CONDITION
+};
+
+struct AICoreCompletionMailbox {
+    // head and tail live on their own cache lines so producer CAS contention
+    // on head can't false-share with the consumer's tail updates.
+    alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> head;
+    uint8_t _head_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)];
+    alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> tail;
+    uint8_t _tail_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)];
+    alignas(PTO2_ALIGN_SIZE) AICoreCompletionMailboxMessage entries[AICORE_COMPLETION_MAILBOX_CAPACITY];
+
+    // Cheap, lock-free pending hint; const because it only loads tail and head. Callers may invoke this
+    // outside the consumer lock; a stale answer only over/under-triggers a drain attempt.
+    bool has_pending() const { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); }
+
+    // MPSC push for a CONDITION message. Returns false when the ring is full
+    // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry.
+    // Lock-free: CAS the shared head to claim a slot, write the fields, then
+    // release-store seq so the single consumer observes the publication.
+    //
+    // The head CAS is relaxed: head is a pure ticket counter and carries no
+    // data to the consumer — publication is solely the seq release-store, and
+    // slot-reuse safety rests on the acquire load of tail. The relaxed failure
+    // order is likewise sufficient since a lost CAS just re-reads head and
+    // retries. compare_exchange_weak is used because this loop already re-reads
+    // head and re-checks fullness, so masking LL/SC spurious failures (what
+    // _strong adds on aarch64) would only be a redundant inner retry.
+    //
+    // Safe to call concurrently from any number of producers; structurally
+    // independent of the AsyncWaitList::busy lock.
+    bool try_push_condition(
+        PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type
+    ) {
+        while (true) {
+            uint64_t h = head.load(std::memory_order_relaxed);
+            uint64_t t = tail.load(std::memory_order_acquire);
+            if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
+            uint64_t new_head = h + 1;
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
+                slot->task_token.raw = task_token.raw;
+                slot->addr = addr;
+                slot->expected_value = expected_value;
+                slot->engine = engine;
+                slot->completion_type = completion_type;
+                slot->kind = MSG_KIND_CONDITION;
+                slot->seq.store(new_head, std::memory_order_release);
+                return true;
+            }
+            // CAS lost: another producer claimed the slot, retry with refreshed head.
+        }
+    }
+
+    // MPSC push for a TASK_NORMAL_DONE sentinel; same claim/publish protocol as try_push_condition,
+    // including the false return when the ring is full. Carries the PTO2TaskSlotState pointer in
+    // `addr` so the consumer can finish binding AsyncWaitEntry.slot_state without the FIN thread.
+    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) {
+        while (true) {
+            uint64_t h = head.load(std::memory_order_relaxed);
+            uint64_t t = tail.load(std::memory_order_acquire);
+            if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
+            uint64_t new_head = h + 1;
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
+                slot->task_token.raw = task_token.raw;
+                slot->addr = slot_state_addr;
+                slot->expected_value = 0;
+                slot->engine = 0;
+                slot->completion_type = 0;
+                slot->kind = MSG_KIND_TASK_NORMAL_DONE;
+                slot->seq.store(new_head, std::memory_order_release);
+                return true;
+            }
+        }
+    }
+
+    // Single-consumer transport-level dequeue (caller holds the consumer lock).
+    // Returns false at the first not-yet-published slot (gap) or when empty;
+    // otherwise copies the next message in tail order into `out`, advances
+    // tail, and returns true. tail is consumer-only-written (relaxed read);
+    // head bounds the scan (relaxed); the seq acquire is the real publication
+    // gate; the tail release publishes "slot free" to reusing producers.
+    bool try_pop(AICoreCompletionMsgView &out) {
+        uint64_t t = tail.load(std::memory_order_relaxed);
+        uint64_t h = head.load(std::memory_order_relaxed);
+        if (t >= h) return false;
+        AICoreCompletionMailboxMessage *slot = &entries[t & AICORE_COMPLETION_MAILBOX_MASK];
+        if (slot->seq.load(std::memory_order_acquire) != t + 1) return false;
+        out.task_token.raw = slot->task_token.raw;
+        out.addr = slot->addr;
+        out.expected_value = slot->expected_value;
+        out.engine = slot->engine;
+        out.completion_type = slot->completion_type;
+        out.kind = slot->kind;
+        tail.store(t + 1, std::memory_order_release);
+        return true;
+    }
+};
+
+static_assert(
+    sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned"
+);
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox_types.h b/src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox_types.h
new file mode 100644
index 000000000..da0d89ad7
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/aicore_completion_mailbox_types.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
+
+#include <stdint.h>
+
+#include "pto_constants.h"
+
+// Types shared across the AICore↔AICPU boundary.
+//
+// This header is reachable from AICore-side translation units (via
+// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h)
+// and must stay parseable by every AICore toolchain configuration: no
+// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct.
+//
+// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in
+// aicore_completion_mailbox.h, which is AICPU-only.
+
+inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
+
+#define COMPLETION_ENGINE_SDMA 0u
+#define COMPLETION_ENGINE_ROCE 1u
+#define COMPLETION_ENGINE_URMA 2u
+#define COMPLETION_ENGINE_CCU 3u
+
+#define COMPLETION_TYPE_COUNTER 0
+#define COMPLETION_TYPE_SDMA_EVENT_RECORD 1
+
+// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch
+// area that AICore writes into to record "this completion has to be observed
+// before the task can retire." The FIN-handling scheduler thread reads the
+// slab, flattens entries into AICoreCompletionMailbox messages, and forwards
+// them to the dispatch thread. `volatile` here is load-bearing: writers live
+// on AICore and readers on AICPU, so the qualifier is the correct way to
+// pin the compiler against caching / reordering on either side.
+struct DeferredCompletionEntry {
+    uint64_t addr;
+    uint32_t expected_value;
+    uint32_t engine;
+    int32_t completion_type;
+    uint32_t _pad;  // explicit tail padding; the static_assert below pins sizeof to 24
+};
+
+static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift");
+
+struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab {
+    volatile uint32_t count;     // NOTE(review): presumably the number of valid entries[] — confirm
+    volatile int32_t error_code;
+    DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK];  // note: the entries themselves are not volatile
+};
+
+static_assert(
+    sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0,
+    "DeferredCompletionSlab size must preserve array element cache-line boundaries"
+);
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_kernel.h b/src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_kernel.h
new file mode 100644
index 000000000..49ee7cc11
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_kernel.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include <pto/comm/async_common/async_event_impl.hpp>
+#include <pto/npu/comm/async/sdma/sdma_async_intrin.hpp>
+
+#include "pto_async_kernel_api.h"
+#include "aicore_completion_mailbox_types.h"
+#include "pto_runtime_status.h"
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+#ifndef __gm__
+#define __gm__
+#endif
+
+// Re-exposed PTO-ISA constant so examples / callers don't need to include
+// <pto/npu/comm/async/sdma/sdma_types.hpp> just to spell their scratch tile.
+inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE;
+
+enum class SdmaOp : uint8_t {
+    TGET = 0,  // send_request_entry issues pto::comm::TGET_ASYNC for this op
+    TPUT = 1,  // send_request_entry issues pto::comm::TPUT_ASYNC (its else-branch, so any non-TGET value lands here)
+};
+
+// SdmaRequestDescriptor bundles everything send_request_entry needs to drive
+// one SDMA transfer + completion registration. It is a template because the
+// destination / source / scratch types carry tensor shape & stride at compile
+// time; the SdmaTget() / SdmaTput() helpers below let callers skip the
+// template arguments.
+//
+// sync_id selects which event-record slot inside the workspace the engine
+// writes into. Concurrent dispatches must use distinct sync_ids; today every
+// caller submits one request per kernel invocation so passing 0 is safe.
+// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2)
+// will fold sync_id allocation into the adapter.
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+struct SdmaRequestDescriptor {
+    SdmaOp op;                  // selects TGET_ASYNC vs TPUT_ASYNC in send_request_entry
+    DstTensor dst;              // first operand handed to TGET_ASYNC / TPUT_ASYNC; held by value
+    SrcTensor src;              // second operand handed to TGET_ASYNC / TPUT_ASYNC; held by value
+    ScratchTileT scratch;       // scratch tile forwarded to BuildAsyncSession
+    __gm__ uint8_t *workspace;  // workspace pointer forwarded to BuildAsyncSession
+    uint32_t sync_id;           // event-record slot selector forwarded to BuildAsyncSession
+};
+
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(
+    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
+    uint32_t sync_id = 0  // defaults to event-record slot 0
+) {  // Returns a by-value descriptor with op == TGET; template arguments are deduced from dst / src / scratch.
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst,       src,
+                                                                     scratch,      workspace, sync_id};
+}
+
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(
+    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
+    uint32_t sync_id = 0  // defaults to event-record slot 0
+) {  // Returns a by-value descriptor with op == TPUT; template arguments are deduced from dst / src / scratch.
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst,       src,
+                                                                     scratch,      workspace, sync_id};
+}
+
+namespace pto2::detail {
+
+inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) {
+    CompletionToken token{  // positional init: record address, 0, SDMA engine id, SDMA_EVENT_RECORD type, 0
+        reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0
+    };
+    (void)register_completion_condition(ctx, token);  // result deliberately discarded; nothing is reported from here
+}
+
+template <typename PtoAsyncEvent, typename PtoAsyncSession>
+inline __aicore__ void
+register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+        (void)event.Wait(session);  // no usable deferred-completion context: wait inline instead of registering
+        return;
+    }
+    if (event.handle == 0) {  // zero handle: nothing is registered (presumably "no transfer in flight"; confirm)
+        return;
+    }
+
+    const uint32_t engine = static_cast<uint32_t>(event.engine);
+    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA)) {  // only SDMA events are supported here
+        defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return;
+    }
+
+    ::pto::comm::sdma::detail::UbTmpBuf tmp_buf;
+    uint32_t sync_id = 0;                      // presumably filled by PrepareEventCheck; not read afterwards
+    __gm__ uint8_t *recv_workspace = nullptr;  // presumably filled by PrepareEventCheck; base for GetEventRecord
+    uint32_t queue_num = 0;                    // presumably filled by PrepareEventCheck; bounds the loop below
+    if (!::pto::comm::sdma::detail::PrepareEventCheck(
+            session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num
+        )) {
+        defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return;
+    }
+    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) {  // one completion condition per queue's record
+        register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
+    }
+}
+
+}  // namespace pto2::detail
+
+// SDMA overload of the runtime's send_request_entry. Submits the descriptor
+// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the
+// AsyncCtx deferred-wait slab and flushes. Returns false only when
+// BuildAsyncSession fails (after defer_error records the failure), else true.
+template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
+inline __aicore__ bool
+send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc) {
+    pto::comm::AsyncSession session;
+    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) {
+        pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
+        return false;  // the only failure that is visible through the return value
+    }
+
+    pto::comm::AsyncEvent event;
+    if (desc.op == SdmaOp::TGET) {
+        event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
+    } else {  // every op other than TGET is issued as TPUT
+        event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
+    }
+    pto2::detail::register_pto_async_event(ctx, event, session);  // may defer an error; return value stays true
+    pto2::detail::defer_flush(ctx);
+    return true;
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_scheduler.h
new file mode 100644
index 000000000..689219c35
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/backend/sdma/sdma_completion_scheduler.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "aicpu/platform_regs.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_completion_token.h"
+#include "pto_runtime_status.h"
+
+// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only
+// allowed holder of this ABI knowledge; the generic scheduler dispatches into
+// the helpers below through the completion ops table.
+struct SdmaEventRecord {
+    uint32_t flag;          // non-zero => poll_sdma_event_record reports READY; cleared by retire_sdma_event_record
+    uint32_t sq_tail;       // read by retire_sdma_event_record and packed into the word at channel_info
+    uint64_t channel_info;  // address of a 64-bit word updated on retire; 0 means there is nothing to update
+};
+
+static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift");
+static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift");
+
+inline uintptr_t sdma_completion_cache_line(const volatile void *addr) {  // addr rounded down to PTO2_ALIGN_SIZE
+    return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);  // mask assumes a power-of-two size
+}
+
+inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) {
+    if (record_addr == 0) {  // a null record address is reported as FAILED, not PENDING
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    volatile SdmaEventRecord *record =
+        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
+    uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE);  // read after invalidating the aligned range
+    return {flag != 0 ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
+}
+
+inline void retire_sdma_event_record(uint64_t record_addr) {
+    if (record_addr == 0) return;  // nothing to retire
+    volatile SdmaEventRecord *record =
+        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
+    uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE);  // snapshot before the clear below
+    uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE);
+
+    volatile uint64_t *record_head = reinterpret_cast<volatile uint64_t *>(record);  // first 8 bytes: flag + sq_tail
+    __atomic_store_n(record_head, 0ULL, __ATOMIC_RELEASE);  // clears flag and sq_tail with a single 64-bit store
+    cache_flush_range(const_cast<const void *>(reinterpret_cast<volatile void *>(record_head)), sizeof(uint64_t));
+
+    if (channel_info_addr == 0) return;  // no channel word to update
+    uint64_t packed = (static_cast<uint64_t>(completed_tail) << 32) | static_cast<uint64_t>(completed_tail);  // tail in both halves
+    volatile uint64_t *channel_info = reinterpret_cast<volatile uint64_t *>(static_cast<uintptr_t>(channel_info_addr));
+    __atomic_store_n(channel_info, packed, __ATOMIC_RELEASE);
+    cache_flush_range(const_cast<const void *>(reinterpret_cast<volatile void *>(channel_info)), sizeof(uint64_t));
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/common.h b/src/a2a3/runtime/host_build_graph/runtime/common.h
new file mode 100644
index 000000000..9dcf438ed
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/common.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Assertion macros (always_assert / debug_assert), AssertionError, and the
+// MAYBE_UNINITIALIZED diagnostics live in the shared header so the unified
+// Tensor (src/common/task_interface/tensor.h) can use them without depending
+// on this runtime-specific header. assert_impl / get_stacktrace are defined in
+// orchestration/common.cpp for runtime targets.
+#include "assert_compat.h"
+
+// Framework-internal TLS bridge. The executor binds the current thread's
+// runtime before invoking the orchestration entry, so orchestration helpers can
+// fetch the current PTO2Runtime without explicit parameter threading. Declared
+// here (rather than in pto_orchestration_api.h) so framework TUs the AICore
+// build also compiles — notably orchestration/common.cpp — see these symbols
+// without pulling in pto_types.h, whose Arg::add_scalar → to_u64 path is
+// __aicore__-only and would break the ccec build.
+#ifdef __cplusplus
+extern "C" {  // unmangled names for the two bridge functions declared below
+#endif
+struct PTO2Runtime;  // forward declaration only; NOTE(review): the bare `PTO2Runtime *` below only compiles as C++
+PTO2Runtime *framework_current_runtime(void);  // fetches the PTO2Runtime bound to the current thread
+void framework_bind_runtime(PTO2Runtime *rt);  // binds rt to the current thread; run before the orchestration entry
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_orchestrator.cpp b/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_orchestrator.cpp
new file mode 100644
index 000000000..e5e6a09f1
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_orchestrator.cpp
@@ -0,0 +1,1120 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Orchestrator Implementation
+ *
+ * Implements orchestrator state management, scope handling, and task submission.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_orchestrator.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/dep_gen.h"
+#include "common/unified_log.h"
+#include "pto_dep_compute.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"
+#include "tensor.h"
+
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#endif
+
+// Verify the captured Tensor blob size in DepGenRecord matches the runtime
+// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
+// including runtime/tensor.h, so this check lives at the orch callsite.
+static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)");
+// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime
+// imposes no hard cap on explicit dep count. If a submit exceeds this cap,
+// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is
+// unaffected, only the captured replay record is truncated.
+
+// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in
+// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay)
+// link these no-op stubs so the runtime translation unit is self-contained.
+// Visibility is hidden so the HOST .so doesn't export them into the global
+// dynamic symbol table where they'd shadow the AICPU .so's strong symbols
+// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }  // fallback: disabled
+__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(
+    uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, int, const int32_t[3]
+) {}  // weak fallback: every argument is ignored and nothing is recorded
+
+// Scope_stats enable gate, queried via the same predicate idiom as
+// is_dep_gen_enabled above. The AICPU collector links the strong definition;
+// host builds fall back to this weak `false`. Gating here still skips the
+// cross-agent occupancy reads that feed the sample when scope_stats is disabled.
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }  // fallback: off
+
+// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each
+// wrap. Strong definition lives in the AICPU collector; host builds fall back to
+// this weak no-op so the runtime translation unit stays self-contained.
+extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}  // fallback: no-op
+
+// =============================================================================
+// Orchestrator Profiling (compile-time toggle)
+// =============================================================================
+#if PTO2_ORCH_PROFILING
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+// Weak fallback for builds that don't link device_time.cpp (e.g. host).
+// The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
+//
+// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from
+// exporting this weak fallback into the global dynamic symbol table via
+// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry
+// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's
+// weak definition first (already in global table) and uses it — returning 0.
+// With hidden visibility, the HOST .so does not export this symbol globally,
+// so the AICPU .so's PLT resolves to its own strong definition from
+// device_time.cpp.
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
+// The strong symbol from the AICPU build wins when profiling is available.
+// Also hidden to prevent HOST .so from polluting the global symbol table.
+__attribute__((weak, visibility("hidden"))) void
+l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
+// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
+static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
+static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
+static uint64_t g_orch_args_cycle = 0;       // param copy
+static uint64_t g_orch_lookup_cycle = 0;     // tensormap lookup + dep building
+static uint64_t g_orch_insert_cycle = 0;     // tensormap insert
+static uint64_t g_orch_fanin_cycle = 0;      // fanin list + early-return check
+static uint64_t g_orch_scope_end_cycle = 0;  // scope_end overhead
+static int64_t g_orch_submit_count = 0;
+static uint32_t g_orch_submit_idx = 0;
+uint64_t g_orch_alloc_wait_cycle = 0;
+uint64_t g_orch_fanin_wait_cycle = 0;
+uint64_t g_orch_alloc_atomic_count = 0;
+uint64_t g_orch_args_atomic_count = 0;
+uint64_t g_orch_scope_end_atomic_count = 0;
+// Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what
+// the flag is for) and feeds the per-sub-step `g_orch_*_cycle` cumulatives
+// printed in the cold-path log.
+//
+// Per-submit ORCH_SUBMIT record is the only swim-lane emit on the orch
+// path — one record per submit_task() / alloc_tensors() call spanning
+// the entire [start, end] window. Per-sub-step phase records were dropped
+// in favour of the cumulatives + per-submit envelope; the dispatcher
+// already inserts one record at the end of each submit path via
+// CYCLE_COUNT_ORCH_SUBMIT_RECORD.
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = get_sys_cnt_aicpu(), _t1 = _t0; /* seed _t1: a RECORD with no prior LAP must read a defined value */ \
+    uint64_t _submit_start_ts = _t0
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
+    do {                                                                                          \
+        if (_prof_active) {                                                                       \
+            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
+        }                                                                                         \
+    } while (0)
+#elif PTO2_PROFILING
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+__attribute__((weak, visibility("hidden"))) void
+l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
+// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
+static uint32_t g_orch_submit_idx = 0;
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
+    uint64_t _submit_start_ts = _t0
+#define CYCLE_COUNT_LAP(acc) \
+    do {                     \
+    } while (0)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
+    do {                                                                                          \
+        if (_prof_active) {                                                                       \
+            _t1 = get_sys_cnt_aicpu();                                                            \
+            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
+        }                                                                                         \
+    } while (0)
+#else
+#define CYCLE_COUNT_START()
+#define CYCLE_COUNT_LAP(acc)
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)
+#endif
+
+static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) {  // returns the code left latched
+    always_assert(orch != nullptr);
+    orch->fatal = true;  // set unconditionally, even when no error code ends up being latched
+    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) {
+        return PTO2_ERROR_NONE;  // nothing to latch, or no shared-memory header to latch it in
+    }
+
+    int32_t expected = PTO2_ERROR_NONE;
+    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
+    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
+        return error_code;  // first writer wins: our code is now the latched one
+    }
+    return expected;  // CAS failed: `expected` now holds the code that was already latched
+}
+
+static void
+orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
+    int32_t latched_code = orch_mark_fatal(orch, error_code);  // marks fatal and latches the first error code
+
+#if PTO2_PROFILING
+    // Flush the current scope's peaks BEFORE the FATAL log line, so the
+    // diagnostic context (which pool/window filled up) appears right next to
+    // the failure reason. on_fatal is latched, so duplicate fatals from
+    // different layers don't print multiple stats lines.
+    scope_stats_on_fatal();
+#endif
+
+    if (fmt == nullptr || fmt[0] == '\0') {  // no message supplied: log the code(s) only
+        if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {  // a different code was latched earlier
+            unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
+        } else {
+            unified_log_error(func, "FATAL(code=%d)", error_code);
+        }
+        return;
+    }
+
+    char message[1024];  // vsnprintf truncates anything longer than this buffer
+    vsnprintf(message, sizeof(message), fmt, args);
+    if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {  // a different code was latched earlier
+        unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message);
+        return;
+    }
+    unified_log_error(func, "FATAL(code=%d): %s", error_code, message);
+}
+
+void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) {
+    auto *orch = this;
+    va_list args;
+    va_start(args, fmt);  // printf-style variadic front end; fmt may be null or empty
+    orch_report_fatal_v(orch, error_code, func, fmt, args);  // marks the orchestrator fatal and logs the message
+    va_end(args);
+}
+
+static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) {  // advances and returns the dedup epoch
+    uint32_t next = orch->fanin_seen_current_epoch + 1;
+    if (next == 0) {  // uint32_t wrap-around: stale stamps could now alias a fresh epoch
+        memset(
+            orch->fanin_seen_epoch, 0, static_cast<size_t>(orch->sm_header->ring.task_window_size) * sizeof(uint32_t)
+        );
+        next = 1;  // restart at 1 so the just-cleared entries (0) never equal a live epoch
+    }
+    orch->fanin_seen_current_epoch = next;
+    return next;
+}
+
+struct PTO2FaninBuilder {  // accumulates one submission's fanin producers: inline array first, then the spill pool
+    PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) :
+        count(0),
+        spill_start(0),
+        orch(orch),
+        seen_epoch(seen_epoch),
+        spill_pool(spill_pool) {}
+    int32_t count{0};                      // total producers recorded so far (inline + spilled)
+    int32_t spill_start{0};                // spill-pool index of the first spilled entry; set on the first spill
+    PTO2OrchestratorState *orch{nullptr};  // provides the fanin_seen_epoch table used by mark_seen
+    uint32_t seen_epoch{0};                // stamp that marks "already seen in this builder" in that table
+    PTO2FaninPool &spill_pool;             // receives entries beyond PTO2_FANIN_INLINE_CAP
+    PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP];  // not initialized by the constructor; filled as count grows
+
+    template <typename Fn>
+    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const {  // forwards fn over the recorded storage
+        return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast<Fn &&>(fn));
+    }
+
+    bool mark_seen(uint8_t prod_ring, int32_t prod_slot) {  // true iff the slot was already stamped this epoch
+        if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) {  // invalid key: "not seen", and nothing is stamped
+            return false;
+        }
+        uint32_t *seen = orch->fanin_seen_epoch;
+        uint32_t slot = static_cast<uint32_t>(prod_slot);  // NOTE(review): no upper-bound check; presumably < task_window_size
+        if (seen[slot] == seen_epoch) {
+            return true;
+        }
+        seen[slot] = seen_epoch;  // first sighting in this epoch: stamp it
+        return false;
+    }
+};
+
+static bool append_fanin_or_fail(
+    PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state,
+    PTO2TaskId producer_task_id, PTO2FaninBuilder *fanin_builder
+) {  // true: edge recorded or skipped (stale / duplicate producer); false: spill pool exhausted, fatal latched
+    // Decide-and-claim under the producer's fanout_lock. Two conditions make this
+    // resolved slot a non-dependency, and both must be checked together with the
+    // fanout_count++ so the producer cannot slip from live to consumed/reused in
+    // between:
+    //   (1) Generation mismatch — the producer was CONSUMED, its slot
+    //       reset_for_reuse'd and rebound to a newer task. The cached
+    //       owner_task_id still resolves to this slot, but it no longer holds our
+    //       producer; ++'ing it would corrupt an unrelated task.
+    //   (2) Already CONSUMED in place — finished, output ready, no real edge.
+    // In either case, adding it to the fanin and bumping fanout_count would leave
+    // a stale ++/release pair (wire_task drops the fanout edge but keeps the fanin
+    // slot, so on_task_release still release_producer()'s it) that desyncs the
+    // slot's refcount (rc != fc) and wedges in-order reclaim. Claiming a live
+    // producer under the lock pins it: fanout_count now counts us, so it cannot
+    // reach CONSUMED (rc == fc) until we release it in on_task_release, keeping the
+    // slot's generation stable until then. check_and_handle_consumed flips
+    // COMPLETED->CONSUMED under the same lock, so the check and the ++ are atomic
+    // against the consume. fanout_count is lock-protected per the
+    // PTO2TaskSlotState contract.
+    // NOTE(review): the patch description says hbg has no reclaim / COMPLETED->CONSUMED flip; re-check the above.
+    // Dedup (mark_seen) happens HERE, gated on a live producer — NOT before the
+    // gone check. mark_seen keys only on (ring, slot); a stale owner that resolves
+    // to a reused slot must not record it as seen, or a later dependency on the
+    // live generation in the same submission would hit mark_seen and be skipped
+    // without claiming it (dropped edge). Marking only when !gone keeps the dedup
+    // keyed to the live producer, and doing it before the ++ still suppresses a
+    // double-count for a producer named twice in one submission.
+    prod_state->lock_fanout();
+    bool gone = prod_state->task == nullptr || prod_state->task->task_id.local() != producer_task_id.local() ||
+                prod_state->task_state.load(std::memory_order_acquire) == PTO2_TASK_CONSUMED;
+    bool claim = !gone && !fanin_builder->mark_seen(prod_ring, prod_slot);  // short-circuit: gone slots are never stamped
+    if (claim) {
+        // Low bits hold the consumer count; bit31 is the scope ref. The consumer
+        // count must never carry into bit31 (would corrupt the scope-release
+        // flag) — true for any sane fanout (<< 2^31).
+        assert(  // plain assert: this check is compiled out when NDEBUG is defined
+            (prod_state->fanout_count & ~PTO2_FANOUT_SCOPE_BIT) < (PTO2_FANOUT_SCOPE_BIT - 1) &&
+            "fanout consumer count overflow into scope bit"
+        );
+        prod_state->fanout_count++;
+    }
+    prod_state->unlock_fanout();
+#if PTO2_ORCH_PROFILING
+    // lock + unlock always; one fanout_count store when we actually claim.
+    g_orch_args_atomic_count += claim ? 3 : 2;
+#endif
+    // gone (stale/consumed) or an already-seen duplicate live producer: no new
+    // fanin edge either way.
+    if (!claim) {
+        return true;
+    }
+
+    if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) {  // fast path: room left in the inline array
+        fanin_builder->inline_slots[fanin_builder->count++] = prod_state;
+        return true;
+    }
+
+    PTO2FaninPool &fanin_pool = fanin_builder->spill_pool;
+    if (!fanin_pool.ensure_space(orch->sm_header->ring, 1)) {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);  // NOTE: the fanout_count++ above is not rolled back
+        return false;
+    }
+    int32_t spill_idx = fanin_pool.top;  // captured before alloc() so it names the entry about to be allocated
+    PTO2FaninSpillEntry *entry = fanin_pool.alloc();
+    if (entry == nullptr) {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);  // NOTE: the fanout_count++ above is not rolled back
+        return false;
+    }
+    if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) {  // first spilled entry: remember where the spill run starts
+        fanin_builder->spill_start = spill_idx;
+    }
+    entry->slot_state = prod_state;
+    fanin_builder->count++;
+    return true;
+}
+
+static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
+
+struct PTO2PreparedTask {  // out-parameter bundle filled by prepare_task
+    PTO2TaskId task_id = PTO2TaskId::invalid();                    // built from ring 0 + alloc_result.task_id
+    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};  // raw result of task_allocator.alloc()
+    PTO2TaskDescriptor *task = nullptr;                            // &ring.task_descriptors[alloc_result.slot]
+    PTO2TaskPayload *payload = nullptr;                            // &ring.task_payloads[alloc_result.slot]
+    PTO2TaskSlotState *slot_state = nullptr;                       // &ring.get_slot_state_by_slot(alloc_result.slot)
+};
+
+static PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args) {  // packs OUTPUT tensors back to back
+    PTO2OutputLayout layout;  // presumably default-initializes total_output_size to 0 (TODO confirm)
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {  // entries for non-OUTPUT tensors are left untouched
+            continue;
+        }
+        layout.offsets[i] = layout.total_output_size;  // offset = running total before this tensor is added
+        layout.buffer_sizes[i] =
+            PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+        layout.total_output_size += layout.buffer_sizes[i];  // total grows by the aligned size
+    }
+    return layout;
+}
+
+static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) {
+    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
+
+    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];  // tasks in innermost scope
+    if (scope_task_count < allocator.window_size() - 1) {  // NOTE: limit is window_size - 1, though the log says ">="
+        return true;
+    }
+
+    int32_t active_count = allocator.active_count();  // used only by the diagnostics below
+
+    LOG_ERROR("========================================");
+    LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id);
+    LOG_ERROR("========================================");
+    LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size());
+    LOG_ERROR("  scope_depth:        %d", orch->scope_stack_top + 1);
+    LOG_ERROR("  ring_id:            %d", ring_id);
+    LOG_ERROR("  scope_task_count:   %d", scope_task_count);
+    LOG_ERROR("  active_tasks:       %d / %d", active_count, allocator.window_size());
+    LOG_ERROR("Root Cause:");
+    LOG_ERROR("  Tasks within a scope hold a fanout_count reference that is only");
+    LOG_ERROR("  released at scope_end. When scope task count >= window_size,");
+    LOG_ERROR("  no slots can be reclaimed -> deadlock.");
+    LOG_ERROR("Solution:");
+    LOG_ERROR("  1. Reduce tasks per scope (use batching/unroll)");
+    LOG_ERROR("  2. Increase task window (current: %d)", allocator.window_size());
+    LOG_ERROR("     Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
+    LOG_ERROR("     Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2>");
+    LOG_ERROR("  3. Split work across multiple scopes");
+    LOG_ERROR("========================================");
+    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);  // latch the error; the caller sees false and aborts the submit
+    return false;
+}
+
+static bool prepare_task(
+    PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask,
+    PTO2PreparedTask *out
+) {
+    uint8_t ring_id = 0;
+    auto &allocator = orch->ring.task_allocator;
+
+    if (!check_scope_can_accept_task(orch, allocator, ring_id)) {
+        return false;
+    }
+
+    out->alloc_result = allocator.alloc(total_output_size);
+    if (out->alloc_result.failed()) {
+        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
+        return false;
+    }
+
+    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
+    out->slot_state = &orch->sm_header->ring.get_slot_state_by_slot(out->alloc_result.slot);
+    out->task = &orch->sm_header->ring.task_descriptors[out->alloc_result.slot];
+    out->payload = &orch->sm_header->ring.task_payloads[out->alloc_result.slot];
+
+    out->payload->prefetch(args.tensor_count(), args.scalar_count());
+
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init() skip the O(window_size) bind loop.
+    // Both writes hit the same 64B slot_state cache line we're about to
+    // dirty below, so the extra cost is two stores on an already-hot line.
+    // Must precede the inline sched->wire_task(...) call at the end of
+    // submit_task_common, which is handed this slot_state (this runtime wires
+    // fanout inline in submit; there is no wiring-queue push).
+    out->slot_state->bind_buffers(out->payload, out->task);
+
+    // prepare_task does NO payload writes: all payload content (tensors/scalars +
+    // early-dispatch spec fields) is initialized in PTO2TaskPayload::init, the
+    // single payload-init point, which callers run before wiring the task.
+
+    // Fields expected to be pre-initialized when the slot is set up:
+    //   fanout_lock=0, fanout_count=PTO2_FANOUT_SCOPE_BIT, fanout_head=nullptr,
+    //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
+    // NOTE(review): this list previously cited reset_for_reuse() and an immutable
+    //   ring_id; the commit message says both were removed — confirm the init site.
+    // task_state is set to PENDING here as the orchestrator populates the slot
+    // (host_build_graph does not recycle slots at runtime, so there is no
+    // post-CONSUMED reset path).
+    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    int16_t block_num = args.launch_spec.block_num();
+    out->slot_state->total_required_subtasks =
+        static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
+    out->slot_state->logical_block_num = block_num;
+    out->slot_state->active_mask = active_mask;
+    // fanin_count is set by scheduler during wiring
+    scope_tasks_push(orch, out->slot_state);
+    // NOTE(review): scope_tasks_push() reports a fatal on overflow but returns void, so true is still returned.
+    return true;
+}
+
+// =============================================================================
+// Scope Management
+// =============================================================================
+
+static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
+    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
+        // scope_tasks lives in the per-Worker arena (single backing allocation),
+        // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP ==
+        // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot
+        // budget — hitting it means the task window is saturated, so no further
+        // push could succeed regardless of buffer growth. Report fatal; drop the entry.
+        orch->report_fatal(
+            PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__,
+            "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity
+        );
+        return;
+    }
+    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
+}
+
+void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
+    auto *orch = this;
+    if (orch->fatal) {
+        return;
+    }
+    // always_assert, not assert: NDEBUG strips assert, and an unchecked overflow writes past scope_begins.
+    always_assert(scope_stack_top < static_cast<int32_t>(scope_stack_capacity - 1) && "Scope stack overflow");
+    if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
+        return;
+    }
+    bool already_in_manual_scope = orch->in_manual_scope();
+    ++orch->scope_stack_top;
+    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
+    if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
+        orch->manual_begin_depth = orch->scope_stack_top;
+    }
+#if PTO2_PROFILING
+    // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the
+    // collector call: when disabled we pay nothing. Sample the current ring's
+    // task/heap start-end and tensormap usage at the scope boundary.
+    if (is_scope_stats_enabled()) {
+        uint8_t ring_id = 0;
+        auto &alloc = orch->ring.task_allocator;
+        int32_t dep_pool_tail = 0;
+        int32_t dep_pool_top = 0;
+        if (orch->scheduler) {
+            orch->scheduler->ring_sched_state.read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
+        }
+        scope_stats_begin(
+            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
+            dep_pool_top, orch->tensor_map.current_used()
+        );
+    }
+#endif
+}
+
+void PTO2OrchestratorState::end_scope() {
+    auto *orch = this;
+    if (orch->fatal) {
+        return;
+    }
+    // always_assert, not assert: NDEBUG strips assert, and an underflow would read scope_begins[-1] below.
+    always_assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
+
+    // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks
+    // via scheduler->on_scope_end, so the end record reflects the scope's
+    // occupancy at close, not the residual after teardown.
+#if PTO2_PROFILING
+    // Gate via is_scope_stats_enabled() (see begin_scope). One collector call
+    // emits the end-boundary record and tears down bookkeeping.
+    if (is_scope_stats_enabled()) {
+        uint8_t ring_id = 0;
+        auto &alloc = orch->ring.task_allocator;
+        int32_t dep_pool_tail = 0;
+        int32_t dep_pool_top = 0;
+        if (orch->scheduler) {
+            orch->scheduler->ring_sched_state.read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
+        }
+        scope_stats_end(
+            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
+            dep_pool_top, orch->tensor_map.current_used()
+        );
+    }
+#endif
+
+#if PTO2_ORCH_PROFILING
+    uint64_t _se0 = get_sys_cnt_aicpu();
+#endif
+
+    bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
+    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
+    int32_t count = orch->scope_tasks_size - begin;
+    if (ending_manual_scope) {
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    }
+
+    if (orch->scheduler && count > 0) {
+        orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
+    }
+
+    // Rewind the task buffer — these entries are no longer needed
+    orch->scope_tasks_size = begin;
+#if PTO2_ORCH_PROFILING
+    uint64_t _se1 = get_sys_cnt_aicpu();
+    g_orch_scope_end_cycle += (_se1 - _se0);
+#endif
+}
+
+// =============================================================================
+// Task Submission
+// =============================================================================
+
+// Ensure the tensormap entry pool has room for `needed` inserts before STEP 4
+// registers this task's outputs. Fast path: return true at once when the pool
+// already has room. Otherwise reclaim against the single ring's
+// last_task_alive watermark and, if still short, spin — re-reclaiming every 32
+// spins — until entries are actually freed. Every 1024 spins the loop checks
+// for a fatal latched elsewhere and a wall-clock backstop
+// (PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES, logged as ~500 ms) measured from the
+// first tick without reclaim progress; on timeout it latches
+// PTO2_ERROR_TENSORMAP_OVERFLOW. Returns false on timeout or a latched fatal.
+// NOTE(review): host_build_graph has no execution-time reclaim — confirm this wait can ever progress.
+static bool ensure_tensormap_capacity(PTO2OrchestratorState *orch, int32_t needed) {
+    PTO2TensorMap &tm = orch->tensor_map;
+    if (tm.free_entries() >= needed) {
+        return true;
+    }
+
+    int32_t alive;
+    auto read_alive = [&]() {
+        // Relaxed: a self-correcting poll re-read every reclaim tick, so a stale
+        // watermark only defers reclaim one tick and never over-frees.
+        alive = orch->sm_header->ring.fc.last_task_alive.load(std::memory_order_relaxed);
+    };
+
+    read_alive();
+    int64_t cur_alive_sum = tm.reclaim_retired_all(alive);  // kept for the deadlock diagnostic
+    int32_t prev_free = tm.free_entries();
+    if (prev_free >= needed) {
+        return true;
+    }
+
+    int spin_count = 0;
+    uint64_t block_cycle0 = 0;  // wall-clock anchor for the deadlock backstop
+    bool block_timing = false;  // false until the first no-reclaim-progress tick
+    while (tm.free_entries() < needed) {
+        spin_count++;
+
+        // Reclaim (and the watermark read it needs) is the costly part of this
+        // spin and the only path that frees entries; gate it to every 32nd spin.
+        // Cold path, but the spin itself is tight.
+        if ((spin_count & 31) == 0) {
+            read_alive();
+            cur_alive_sum = tm.reclaim_retired_all(alive);
+            int32_t cur_free = tm.free_entries();
+            if (cur_free >= needed) {
+                return true;
+            }
+            // Progress is entries actually freed, NOT watermark movement: the ring can
+            // retire zero-output tasks (count_registrable_outputs == 0), advancing
+            // last_task_alive without freeing any entry. Gating the backstop on
+            // free_entries() keeps a wedged pool from dodging the timeout while the
+            // watermark keeps moving for such tasks.
+            if (cur_free > prev_free) {
+                spin_count = 0;
+                prev_free = cur_free;
+                block_timing = false;
+            }
+        }
+
+        if ((spin_count & 1023) == 0) {
+            // A fatal latched elsewhere breaks this otherwise-unbounded spin.
+            if (orch->sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
+                return false;
+            }
+            // Absolute-time backstop, matching the task allocator: stable across
+            // chips/contention, unlike a fixed spin count. get_sys_cnt_aicpu()
+            // is an MMIO read, so sample it only once per 1024 spins.
+            uint64_t now = get_sys_cnt_aicpu();
+            if (!block_timing) {
+                block_cycle0 = now;
+                block_timing = true;
+            } else if (now - block_cycle0 >= PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES) {
+                LOG_ERROR("========================================");
+                LOG_ERROR("FATAL: TensorMap Entry Pool Deadlock Detected!");
+                LOG_ERROR("========================================");
+                LOG_ERROR("TensorMap entry pool freed no entries for ~500 ms while a task waits.");
+                LOG_ERROR("  - Pool used:   %d / %d", tm.current_used(), tm.pool_capacity());
+                LOG_ERROR("  - Needed:      %d entries", needed);
+                LOG_ERROR("  - last_task_alive: %" PRId64, cur_alive_sum);
+                LOG_ERROR("Diagnosis:");
+                LOG_ERROR("  No retiring task is freeing tensormap entries (last_task_alive may");
+                LOG_ERROR("  still move on rings with no registered outputs). Check TaskRing");
+                LOG_ERROR("  diagnostics for the stalled producer.");
+                LOG_ERROR("Solution:");
+                LOG_ERROR("  Increase PTO2_TENSORMAP_POOL_SIZE (current: %d).", tm.pool_capacity());
+                LOG_ERROR("========================================");
+                orch_mark_fatal(orch, PTO2_ERROR_TENSORMAP_OVERFLOW);
+                return false;
+            }
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
+
+// Shared body for submit_task / submit_dummy_task. Caller has already validated
+// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot
+// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin
+// computation (explicit_deps + auto), output registration, slot init, and wires
+// fanout inline via sched->wire_task (no scheduler wiring-queue push).
+static TaskOutputTensors submit_task_common(
+    PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id,
+    int32_t aiv0_kernel_id, int32_t aiv1_kernel_id
+) {
+    CYCLE_COUNT_START();
+    TaskOutputTensors result;
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) {
+        return result;
+    }
+    PTO2SchedulerState *sched = orch->scheduler;
+    PTO2RingFlowControl &fc = orch->sm_header->ring.fc;
+    PTO2TaskId task_id = prepared.task_id;
+    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+    result.set_task_id(task_id);
+
+    // dep_gen capture point: snapshot the orch submit_task inputs while the
+    // tensormap is still in its pre-lookup state for this task. Replay reads
+    // these records offline to reconstruct the complete dep graph — the sole
+    // source of truth for fanout now that the swimlane hot path no longer
+    // records it.
+    if (is_dep_gen_enabled()) {
+        const void *tensor_ptrs[MAX_TENSOR_ARGS];
+        // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record
+        // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow
+        // each tag here rather than letting the AICPU writer reinterpret a
+        // 4×-wider array as bytes — that path silently lost two of every three
+        // tags on little-endian and synthesized phantom self-edges in replay.
+        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
+        // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at
+        // MAX_TENSOR_ARGS: defensive against any future builder bypass /
+        // shared-memory bit-flip that could otherwise overrun the two
+        // MAX_TENSOR_ARGS-sized stack buffers above.
+        const int tc_raw = args.tensor_count();
+        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
+        for (int i = 0; i < tc; i++) {
+            // OUTPUT slots carry create_info (not yet a Tensor); skip them —
+            // they have no producer to look up and replay's per-tensor loop
+            // also skips OUTPUT.
+            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref();
+            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
+        }
+        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
+        dep_gen_aicpu_record_submit(
+            task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8,
+            static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()),
+            args.launch_spec.block_num(), kernel_ids_capture
+        );
+    }
+
+    PTO2FaninBuilder fanin_builder(orch, orch->ring.fanin_pool, next_fanin_seen_epoch(orch));
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    // === STEP 2: Sync TensorMap validity and optional cleanup ===
+    // Read current last_task_alive from shared memory for this ring
+    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
+
+    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
+
+    CYCLE_COUNT_LAP(g_orch_sync_cycle);
+    // Explicit deps: producers below the last_task_alive watermark are skipped; the rest become fanins.
+    for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
+        PTO2TaskId dep_task_id = args.explicit_dep(i);
+        if (!dep_task_id.is_valid()) {
+            orch->report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids"
+            );
+            return result;
+        }
+        uint8_t dep_ring_id = dep_task_id.ring();
+        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->ring;
+        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
+        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (dep_local_task_id < dep_last_task_alive) {
+            continue;
+        }
+        int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id);
+        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot);
+        if (!append_fanin_or_fail(orch, dep_ring_id, dep_slot, producer_slot_state, dep_task_id, &fanin_builder)) {
+            return result;
+        }
+    }
+
+    // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) ===
+    DepInputs dep_inputs{
+        args.tensor_count(),       args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()),
+        args.explicit_deps_data(),
+    };
+
+    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
+        uint8_t prod_ring = producer_task_id.ring();
+        PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->ring;
+        int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast<int32_t>(producer_task_id.local()));
+        PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot);
+        return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, producer_task_id, &fanin_builder);
+    };
+
+    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) {
+        return result;
+    }
+
+    CYCLE_COUNT_LAP(g_orch_lookup_cycle);
+
+    // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
+    // Reserve pool capacity for this task's inserts before registering. The pool
+    // is reclaimed as last_task_alive advances; an
+    // exhausted pool back-pressures here (and detects a wedged watermark) rather
+    // than tripping new_entry()'s hard assert mid-registration.
+    int32_t tensormap_needed = count_registrable_outputs(dep_inputs, orch->in_manual_scope());
+    if (tensormap_needed > 0 && !ensure_tensormap_capacity(orch, tensormap_needed)) {
+        return result;
+    }
+    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
+
+    CYCLE_COUNT_LAP(g_orch_insert_cycle);
+
+    // === STEP 5: Batch-write to GM (single cache line burst) ===
+    // Deferred from allocation phase to avoid scattered GM writes that get
+    // evicted by TensorMap lookup/insert cache pressure.
+    __builtin_prefetch(&task, 1, 1);
+    task.task_id = task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    // fanout_count was already incremented per live producer inside
+    // append_fanin_or_fail, atomically with the consumed/generation check under
+    // the producer's fanout_lock. Doing it there (rather than a separate pass
+    // here) is what prevents a producer from transitioning to CONSUMED between
+    // the dependency decision and the claim.
+    int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP);
+    // Store fanin metadata in payload for scheduler to iterate
+    payload.fanin_actual_count = fanin_builder.count;
+    payload.fanin_spill_start = fanin_builder.spill_start;
+    payload.fanin_spill_pool = &fanin_builder.spill_pool;
+    for (int i = 0; i < inline_count; i++) {
+        payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i];
+    }
+
+    payload.init(args, result, prepared.alloc_result, layout);
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        if (args.scalar_count() > 0) {
+            set_dump_args_task_scalar_dtypes(
+                task_id.raw, static_cast<uint32_t>(args.scalar_count()), args.scalar_dtypes()
+            );
+        }
+        // Selective vs full dump is latched at dump_args_init from DumpDataHeader
+        // (host-decided before any dispatch), so it is race-free regardless of
+        // submission order. Here we only record each marked task's arg mask and
+        // metadata flags, which selective collection consults.
+        if (args.dump_arg_mask() != 0) {
+            set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask());
+        }
+    }
+#endif
+
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+
+    // === STEP 6: wire fanout inline ===
+    // host_build_graph host-orch: the orchestrator runs to completion on the host
+    // before the device boots scheduler-only, so wire the fanout adjacency here —
+    // lock each producer, allocate dep_pool entries, and seed the ready queue —
+    // directly in submit instead of deferring it to a device-side wiring-queue
+    // drain. The dep_pool is sized for the whole graph (no reclaim during host
+    // orchestration, exactly like the task window and GM heap), so an exhausted
+    // pool latches PTO2_ERROR_DEP_POOL_OVERFLOW via dep_pool.alloc() — the same
+    // failure class as a task-window/heap overflow. The resulting
+    // fanout_head / dep-entry / ready-queue pointers are host-DDR addresses;
+    // runtime_maker relocates them to device addresses before H2D.
+    sched->wire_task(sched->ring_sched_state, &cur_slot_state, payload.fanin_actual_count);
+    if (orch->sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
+        orch->fatal = true;
+        return result;
+    }
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+    return result;
+}
+
+TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    auto *orch = this;
+
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg Detected!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("This is a bug in the orchestration code.");
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+    // === Validate submit inputs ===
+    ActiveMask active_mask = mixed_kernels.to_active_mask();
+    always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
+
+    int16_t block_num = args.launch_spec.block_num();
+    always_assert(block_num >= 1 && "block_num must be >= 1");
+
+    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
+    // it to the aiv0 slot.  This guarantees the dispatch path can always use
+    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
+    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
+    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
+    MixedKernels normalized = mixed_kernels;
+    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+    if (!has_aic && has_aiv1 && !has_aiv0) {
+        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+        active_mask = normalized.to_active_mask();
+    }
+
+    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+    if (block_num > 1 && args.launch_spec.require_sync_start()) {
+        // Deadlock check: reject block_num > total available slots of the required type.
+        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
+        // For AIV:     limit is total_aiv_count. A limit <= 0 skips the check.
+        PTO2ResourceShape shape = active_mask.to_shape();
+        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+        if (limit > 0 && block_num > limit) {
+            report_fatal(
+                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
+                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
+            );
+            return TaskOutputTensors{};
+        }
+        active_mask.set_sync_start();
+    }
+
+    return submit_task_common(
+        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
+    );
+}
+
+// Submit a dependency-only task: full dependency graph participation
+// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
+// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
+// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
+// shape as submit_task; scalars are permitted but never consumed.
+TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const L0TaskArgs &args) {
+    auto *orch = this;
+    // No-op once a fatal error has been latched.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+    // Reject Args whose construction recorded an error; latch PTO2_ERROR_INVALID_ARGS.
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+    // Empty mask and INVALID_KERNEL_ID in every slot mark the task as dependency-only.
+    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+}
+
+TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const L0TaskArgs &args) {
+    auto *orch = this;
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+    // Accept only a non-empty, scalar-free list of OUTPUT args.
+    if (args.tensor_count() <= 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+        return TaskOutputTensors{};
+    }
+    if (args.scalar_count() != 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+        return TaskOutputTensors{};
+    }
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {
+            report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
+            );
+            return TaskOutputTensors{};
+        }
+    }
+
+    CYCLE_COUNT_START();
+
+    if (args.has_error) {
+        report_fatal(
+            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
+        return TaskOutputTensors{};
+    }
+
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    task.task_id = prepared.task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    TaskOutputTensors outputs;
+    outputs.set_task_id(prepared.task_id);
+    payload.init(args, outputs, prepared.alloc_result, layout);
+    payload.fanin_actual_count = 0;
+    payload.fanin_spill_start = 0;
+    payload.fanin_spill_pool = &orch->ring.fanin_pool;
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+    // prepare_task() always sets slot_state on success, so this test is expected to be true.
+    if (prepared.slot_state != nullptr) {
+        // Hidden alloc tasks complete inline in the orchestrator before any
+        // consumer can exist, so they have no fanout to notify and no worker
+        // subtasks to retire. Running the full on_task_complete path
+        // would only pay unnecessary fanout_lock / traversal overhead here.
+        // The generic slot initialization done in prepare_task() is still
+        // required so scope_end can release the producer-side reference.
+        // NOTE(review): the old "drive the slot to CONSUMED" wording was dropped;
+        // the commit message says the COMPLETED->CONSUMED flip is removed — confirm.
+        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+    }
+    orch->inline_completed_tasks++;
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+
+    return outputs;
+}
+
+// =============================================================================
+// Flow Control
+// =============================================================================
+
+void PTO2OrchestratorState::mark_done() {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {  // r only labels the logs; orch->ring is read every pass
+        int32_t total_tasks = orch->ring.task_allocator.active_count();
+        if (total_tasks > 0) {
+            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
+        }
+        auto &fanin_pool = orch->ring.fanin_pool;
+        if (fanin_pool.top > 1) {
+            LOG_INFO_V0(
+                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
+                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
+            );
+        }
+    }
+    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);  // publish the done flag
+    orch->scope_tasks_size = 0;  // reset scope bookkeeping to the "no open scope" state
+    orch->scope_stack_top = -1;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
+    g_orch_submit_idx = 0;
+#endif
+}
+
+#if PTO2_ORCH_PROFILING
+PTO2OrchProfilingData orchestrator_get_profiling() {
+    PTO2OrchProfilingData d;
+    d.sync_cycle = g_orch_sync_cycle;
+    d.alloc_cycle = g_orch_alloc_cycle;
+    d.args_cycle = g_orch_args_cycle;
+    d.lookup_cycle = g_orch_lookup_cycle;
+    d.insert_cycle = g_orch_insert_cycle;
+    d.fanin_cycle = g_orch_fanin_cycle;
+    d.scope_end_cycle = g_orch_scope_end_cycle;
+    d.submit_count = g_orch_submit_count;
+    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
+    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
+    d.alloc_atomic_count = g_orch_alloc_atomic_count;
+    d.args_atomic_count = g_orch_args_atomic_count;
+    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
+
+    // Snapshot taken above; zero every counter (and g_orch_submit_idx) so the next call starts fresh.
+    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
+    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
+    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
+    g_orch_submit_count = 0;
+    g_orch_submit_idx = 0;
+    g_orch_alloc_wait_cycle = 0;
+    g_orch_fanin_wait_cycle = 0;
+    g_orch_alloc_atomic_count = 0;
+    g_orch_args_atomic_count = 0;
+    g_orch_scope_end_atomic_count = 0;
+    return d;
+}
+#endif
diff --git a/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_ring_buffer.cpp b/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_ring_buffer.cpp
new file mode 100644
index 000000000..c2d7e7660
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_ring_buffer.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Implementation
+ *
+ * Implements the PTO2FaninPool and PTO2DepListPool ring buffers for zero-overhead dependency management.
+ * TaskAllocator methods are defined inline in pto_ring_buffer.h.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_ring_buffer.h"
+#include <inttypes.h>
+#include <string.h>
+#include "common/unified_log.h"
+#include "scheduler/pto_scheduler.h"
+
+static void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code) {
+    // First-error-wins latch: stored only while the slot still holds PTO2_ERROR_NONE; a null slot is a no-op.
+    if (error_code_ptr != nullptr) {
+        int32_t none = PTO2_ERROR_NONE;
+        error_code_ptr->compare_exchange_strong(none, error_code, std::memory_order_acq_rel);
+    }
+}
+
+// =============================================================================
+// Fanin Spill Pool Implementation
+// =============================================================================
+void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    // Scans task ids [reclaim_task_cursor, sm_last_task_alive) and, for each payload that points at
+    // this pool, advances the tail past that task's spill range; the cursor then moves to the end.
+    if (reclaim_task_cursor >= sm_last_task_alive) return;  // nothing new to scan
+
+    for (int32_t id = reclaim_task_cursor; id < sm_last_task_alive; ++id) {
+        PTO2TaskPayload &task_payload = ring.get_payload_by_task_id(id);
+        if (task_payload.fanin_spill_pool == this) {
+            // Fanin edges in excess of PTO2_FANIN_INLINE_CAP form the spilled portion.
+            const int32_t kept_inline = std::min(task_payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
+            const int32_t spilled = task_payload.fanin_actual_count - kept_inline;
+            if (spilled > 0) {
+                advance_tail(task_payload.fanin_spill_start + spilled);
+            }
+        }
+    }
+    reclaim_task_cursor = sm_last_task_alive;
+}
+
+bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;  // fast path: enough room already
+    // Slow path: spin, reclaiming against last_task_alive, until `needed` entries fit (true) or we give up (false).
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    uint64_t block_cycle0 = 0;  // wall-clock anchor for the deadlock backstop
+    bool block_timing = false;  // false until the first no-reclaim-progress spin
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+        // Still short on space: count this spin and look for forward progress.
+        spin_count++;
+        // Progress = last_task_alive moved since the previous sample; it restarts the spin count and timeout window.
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+            block_timing = false;
+        } else if ((spin_count & 1023) == 0) {
+            // A fatal latched elsewhere breaks this otherwise-unbounded spin; the
+            // caller maps the failed ensure_space to orch_mark_fatal. Cold path.
+            if (error_code_ptr != nullptr && error_code_ptr->load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
+                return false;
+            }
+            // Absolute-time backstop, matching the task allocator: stable across
+            // chips/contention, unlike a fixed spin count. get_sys_cnt_aicpu()
+            // is an MMIO read, so sample it only once per 1024 spins.
+            uint64_t now = get_sys_cnt_aicpu();
+            if (!block_timing) {
+                block_cycle0 = now;
+                block_timing = true;
+            } else if (now - block_cycle0 >= PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES) {
+                int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+                LOG_ERROR("========================================");
+                LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
+                LOG_ERROR("========================================");
+                LOG_ERROR("Fanin spill pool cannot reclaim space after ~500 ms (no progress).");
+                LOG_ERROR(
+                    "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
+                    (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+                );
+                LOG_ERROR("  - Pool top:      %d (linear)", top);
+                LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+                LOG_ERROR("  - High water:    %d", high_water);
+                LOG_ERROR("  - Needed:        %d entries", needed);
+                LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
+                LOG_ERROR("  - current_task:    %d", current);
+                LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
+                LOG_ERROR("Diagnosis:");
+                LOG_ERROR("  last_task_alive is not advancing, so fanin spill pool tail");
+                LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
+                LOG_ERROR("Solution:");
+                LOG_ERROR(
+                    "  Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2
+                );
+                LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+                LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
+                LOG_ERROR("========================================");
+                latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);  // first-error-wins latch
+                return false;
+            }
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
+
+// =============================================================================
+// Dependency List Pool Implementation
+// =============================================================================
+void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    // Throttled: acts only once last_task_alive is PTO2_DEP_POOL_CLEANUP_INTERVAL past the previous reclaim.
+    if (sm_last_task_alive < last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL) return;
+    if (sm_last_task_alive <= 0) return;
+    // The tail moves to the dep_pool_mark stored on the slot of task id (sm_last_task_alive - 1).
+    const int32_t newest_mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
+    if (newest_mark > 0) advance_tail(newest_mark);
+    last_reclaimed = sm_last_task_alive;
+}
+
+bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;  // fast path: enough room already
+    // Slow path: spin, reclaiming against last_task_alive, until `needed` entries fit (true) or the spin limit trips (false).
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+        // Still short on space: count this spin.
+        spin_count++;
+        // NOTE(review): unlike PTO2FaninPool::ensure_space, this loop does not poll error_code_ptr for a latched fatal.
+        // Progress detection: reset spin counter if last_task_alive advances
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+        // PTO2_DEP_POOL_SPIN_LIMIT consecutive spins without progress: dump diagnostics, latch the error, give up.
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("  - Needed:        %d entries", needed);
+            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR("  - current_task:    %d", current);
+            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR("  last_task_alive is not advancing, so dep pool tail");
+            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);  // first-error-wins latch
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_runtime2.cpp b/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_runtime2.cpp
new file mode 100644
index 000000000..e22f68598
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/orchestrator_core/pto_runtime2.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Main Implementation
+ *
+ * Implements the unified runtime API that combines orchestrator and scheduler.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_runtime2.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "aicpu/device_time.h"
+#include "common/unified_log.h"
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
+// Weak fallback for HOST .so builds (never called per the original note; if it is, it returns 0 and timeouts never fire).
+// The AICPU build links the strong symbol from platform/.../device_time.cpp.
+// Hidden visibility prevents HOST .so from polluting global symbol table.
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
+
+// =============================================================================
+// Orchestration Ops Table (function-pointer dispatch for orchestration .so)
+// =============================================================================
+
+static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    return rt->orchestrator.submit_task(mixed_kernels, args);  // ops-table adapter: forwards to the orchestrator
+}
+// Ops-table adapter: forwards tensor allocation to the runtime's orchestrator.
+static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
+    return rt->orchestrator.alloc_tensors(args);
+}
+// Ops-table adapter: forwards a dummy-task submission to the runtime's orchestrator.
+static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
+    return rt->orchestrator.submit_dummy_task(args);
+}
+
+void rt_scope_begin(PTO2Runtime *rt) {
+    PTO2ScopeMode requested = rt->pending_scope_mode;  // take the mode queued for this scope ...
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;  // ... and re-arm AUTO so the queued mode applies once only
+    rt->orchestrator.begin_scope(requested);
+}
+
+void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); }  // forwards to the orchestrator's end_scope()
+// Forwards to the orchestrator's mark_done().
+void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); }
+// Ops-table query: reports the orchestrator's `fatal` flag.
+static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
+
+void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) {
+    if (fmt == nullptr || fmt[0] == '\0') {  // no message text: forward the error code and function only
+        rt->orchestrator.report_fatal(error_code, func, nullptr);
+        return;
+    }
+    char text[1024];  // expanded here (truncated to the buffer), then forwarded as a "%s" argument
+    va_list ap;
+    va_start(ap, fmt);
+    vsnprintf(text, sizeof(text), fmt, ap);
+    va_end(ap);
+    rt->orchestrator.report_fatal(error_code, func, "%s", text);
+}
+
+// Wait for all producers of this tensor to be safe for data access.
+// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers).
+// For reads: wait until each producer COMPLETED (done writing).
+// For writes: also wait until all consumers done reading
+//   (consumer low bits of fanout_refcount >= consumer count, excluding the
+//    bit31 scope reference).
+// Uses cycle-based timeout (checked every 1024 spins).
+// Returns false on timeout (sets orch.fatal).
+MAYBE_UNINITIALIZED_BEGIN
+static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) {
+    PTO2TaskId owner = tensor.owner_task_id;
+    PTO2OrchestratorState &orch = rt->orchestrator;
+    // NOTE(review): timeouts rely on get_sys_cnt_aicpu(); with the weak fallback in this file (returns 0) they cannot fire.
+    // Segmented wait: collect up to kSegmentCap producer slots, then flush by
+    // spinning on each. When the segment fills, we wait for the accumulated
+    // batch before continuing to gather more. Dedup is per-segment only; a
+    // producer that appears in two segments is waited on twice, which is
+    // idempotent (task_state is monotonic) and only adds one atomic load on
+    // the second encounter.
+    constexpr int kSegmentCap = 64;
+    const PTO2TaskSlotState *seg[kSegmentCap];
+    int seg_count = 0;
+    bool signaled = false;
+    bool failed = false;
+    // Spins until the slot's task_state reaches PTO2_TASK_COMPLETED; sets `failed` on a latched error or timeout.
+    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = 0;  // only used in the timeout message below
+        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0) {
+                // A fatal latched elsewhere (e.g. the scheduler-side wiring
+                // deadlock detector) breaks this wait; cold path only.
+                if (orch.sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
+                    failed = true;
+                    return;
+                }
+                if (get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
+                    orch.report_fatal(
+                        PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
+                        "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed",
+                        (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
+                    );
+                    failed = true;
+                    return;
+                }
+            }
+        }
+    };
+    // Spins until fanout_refcount reaches fanout_count (both with PTO2_FANOUT_SCOPE_BIT masked off); same exits.
+    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = 0;  // only used in the timeout message below
+        int32_t local_id = slot.task->task_id.local();
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while ((slot.fanout_refcount.load(std::memory_order_acquire) & ~PTO2_FANOUT_SCOPE_BIT) <
+               (slot.fanout_count & ~PTO2_FANOUT_SCOPE_BIT)) {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0) {
+                // A fatal latched elsewhere (e.g. the scheduler-side wiring
+                // deadlock detector) breaks this wait; cold path only.
+                if (orch.sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
+                    failed = true;
+                    return;
+                }
+                if (get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
+                    orch.report_fatal(
+                        PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
+                        "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done",
+                        (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
+                    );
+                    failed = true;
+                    return;
+                }
+            }
+        }
+    };
+    // Waits on every collected slot in order, then empties the segment; returns early at the first failure.
+    auto flush_segment = [&]() {
+        for (int i = 0; i < seg_count; i++) {
+            wait_one_producer(*seg[i]);
+            if (failed) return;
+            if (!wait_for_consumers) continue;
+            wait_one_consumers(*seg[i]);
+            if (failed) return;
+        }
+        seg_count = 0;
+    };
+    // Adds a slot to the segment (flushing first when it is full); the first push raises wiring.orch_needs_drain.
+    auto try_push = [&](const PTO2TaskSlotState &s) {
+        for (int j = 0; j < seg_count; j++) {
+            if (seg[j] == &s) return;  // per-segment dedup
+        }
+        if (seg_count == kSegmentCap) {
+            flush_segment();
+            if (failed) return;
+        }
+        seg[seg_count++] = &s;
+        if (!signaled) {
+            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
+            signaled = true;
+        }
+    };
+    // Gathers the owner slot plus every slot reported by tensor_map.lookup, then flushes what is left.
+    auto do_wait = [&]() {
+        // Step A: creator retention — read owner directly from tensor metadata
+        if (owner.is_valid()) {
+            auto &s = orch.sm_header->ring.get_slot_state_by_task_id(owner.local());
+            try_push(s);
+            if (failed) return;
+        }
+
+        // Step B: modifier writer lookup (OverlapMap), direct callback
+        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
+            PTO2TaskId pid = entry.producer_task_id;
+            auto &s = orch.sm_header->ring.get_slot_state_by_task_id(pid.local());
+            try_push(s);
+            return !failed;  // NOTE(review): presumably a false return stops the lookup -- confirm against lookup()
+        });
+        if (failed) return;
+        flush_segment();
+    };
+    // Run the wait, then lower orch_needs_drain again if any push raised it (success and failure alike).
+    do_wait();
+    if (signaled) {
+        orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
+    }
+    return !failed;
+}
+MAYBE_UNINITIALIZED_END
+
+uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+    // Reads the element at `indices` once wait_for_tensor_ready (producers only) succeeds; returns 0 on any failure.
+    if (tensor.buffer.addr == 0) {
+        unified_log_error(
+            __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). "
+                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+        );
+        return 0;
+    }
+    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) return 0;
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    if (elem_size > sizeof(uint64_t)) {  // the memcpy below must never write past the 8-byte result
+        unified_log_error(__FUNCTION__, "get_tensor_data: element size exceeds 8 bytes; cannot return it as uint64_t.");
+        return 0;
+    }
+    // buffer.addr is SVM-mapped into host address space at staging time (runtime_maker
+    // svm_register_via_runner), so the host orchestrator reads it directly.
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    uint64_t result = 0;  // pre-zeroed: bytes beyond elem_size stay 0
+    memcpy(&result, ptr, elem_size);
+    return result;
+}
+
+void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) {
+    if (tensor.buffer.addr == 0) {
+        unified_log_error(
+            __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). "
+                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+        );
+        return;
+    }
+    // Wait for producer + all consumers before writing (WAW + WAR safety)
+    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) return;
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    if (elem_size > sizeof(value)) {  // the memcpy below must never read past the 8-byte source value
+        unified_log_error(__FUNCTION__, "set_tensor_data: element size exceeds 8 bytes; value not written.");
+        return;
+    }
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(ptr, &value, elem_size);  // stores the first elem_size bytes of `value` at the element's address
+}
+
+// Ops-table entry that hands the call-site (file, line) captured by PTO2ScopeGuard to the
+// [ScopeStats] collector via scope_stats_set_pending_site(). The slot is always present in
+// the struct to keep the layout stable; at PTO2_PROFILING=0 we fill nullptr so the
+// orchestration .so's null-check skips it.
+#if PTO2_PROFILING
+static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
+#endif
+
+static const PTO2RuntimeOps s_runtime_ops = {  // table installed as rt->ops by runtime_finalize_after_wire()
+    .submit_task = submit_task_impl,
+    .scope_begin = rt_scope_begin,
+    .scope_end = rt_scope_end,
+    .orchestration_done = rt_orchestration_done,
+    .is_fatal = is_fatal_impl,
+    .report_fatal = rt_report_fatal,
+    .log_error = unified_log_error,  // logging slots point straight at the unified_log_* functions
+    .log_warn = unified_log_warn,
+    .log_debug = unified_log_debug,
+    .log_info_v = unified_log_info_v,
+    .get_tensor_data = get_tensor_data,
+    .set_tensor_data = set_tensor_data,
+    .alloc_tensors = alloc_tensors_impl,
+    .submit_dummy_task = submit_dummy_task_impl,
+#if PTO2_PROFILING
+    .scope_set_site = scope_set_site_impl,
+#else
+    .scope_set_site = nullptr,  // profiling off: slot stays in the struct but is left null
+#endif
+};
+
+// =============================================================================
+// Runtime Lifecycle (AICPU-only fixup)
+// =============================================================================
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
+    rt->ops = &s_runtime_ops;  // install this translation unit's ops table on the runtime
+    rt->orchestrator.total_cluster_count = aic_count;  // NOTE(review): cluster count is set from the AIC count -- confirm
+    rt->orchestrator.total_aiv_count = aiv_count;
+}
+
+void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
+    // Stores the requested mode on the runtime; a null runtime pointer is ignored.
+    if (rt == nullptr) return;
+    rt->mode = mode;
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/host_build_graph/runtime/pto2_dispatch_payload.h
new file mode 100644
index 000000000..e1bb3465e
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto2_dispatch_payload.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file pto2_dispatch_payload.h
+ * @brief Per-core dispatch payload for AICore kernel execution
+ *
+ * PTO2DispatchPayload holds the kernel function address, a per-core args[]
+ * array, and embedded SPMD context (LocalContext + GlobalContext).  AICPU
+ * maintains a static array of these (one per core).
+ *
+ * GlobalContext (sub_block_id) is initialized once at runtime startup via
+ * init_global_context() and never modified afterwards.
+ *
+ * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload()
+ * before each dispatch.  Both context struct pointers are written into the
+ * args[] suffix on every dispatch (since args[] is rebuilt entirely each time).
+ *
+ * AICore caches a pointer to its per-core slot at startup and reads from
+ * it on each dispatch.  The struct is cache-line aligned to avoid false
+ * sharing across concurrently dispatched cores.
+ *
+ * The DATA_MAIN_BASE register protocol is unchanged from the base runtime:
+ * a monotonically increasing reg_task_id signals new work to AICore.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "arg_direction.h"
+#include "intrinsic.h"
+
+/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */
+#ifndef PTO2_DISPATCH_MAX_ARGS
+#define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT)
+#endif
+// Rounds x up to the next multiple of align; the mask form is only correct when align is a power of two.
+#ifndef PTO2_ALIGN_UP
+#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1))
+#endif
+
+// Verify hardcoded indices in intrinsic.h match the computed values.
+static_assert(
+    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h"
+);
+static_assert(
+    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX,
+    "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h"
+);
+
+/**
+ * Per-core dispatch payload: function address + args[] + SPMD context.
+ *
+ * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER].
+ * AICore caches a pointer to its per-core slot at startup (via Handshake.task)
+ * and reads from it on each dispatch.
+ *
+ * The struct is cache-line aligned to prevent false sharing across
+ * concurrently dispatched cores.
+ */
+struct alignas(64) PTO2DispatchPayload {
+    uint64_t function_bin_addr;            /**< Kernel entry address in GM (set by Scheduler) */
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */
+
+    /** Per-dispatch context: block_idx and block_num.
+     *  Written by build_payload() before each dispatch.
+     *  args[SPMD_LOCAL_CONTEXT_INDEX] points here. */
+    LocalContext local_context;
+
+    /** Per-core global context: sub_block_id (AIV lane identity).
+     *  Initialized once by init_global_context() at runtime startup.
+     *  args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */
+    GlobalContext global_context;
+
+    /** Speculative early-dispatch gate. 0 = ready: AICore executes on pickup.
+     *  1 = not-ready: AICore waits until AICPU rings the doorbell
+     *  (DATA_MAIN_BASE high 32 == this dispatch's reg_task_id) before executing. */
+    volatile uint32_t not_ready;
+    uint8_t reserved_payload_abi_pad[4];  // explicit 4-byte pad after not_ready, kept as a named field
+    // Layout checks: args[] elements are 8 bytes, and the tensor+scalar part of args[] is a multiple of 64 bytes.
+    static_assert(sizeof(args[0]) == 8);
+    static_assert(
+        PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) ==
+        (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0])
+    );
+};
+// The total size is pinned: any field change that moves it away from 512 bytes fails the build.
+static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift");
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_async_kernel_api.h b/src/a2a3/runtime/host_build_graph/runtime/pto_async_kernel_api.h
new file mode 100644
index 000000000..cf6eb4790
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_async_kernel_api.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PTO_ASYNC_KERNEL_API_H
+#define PTO_ASYNC_KERNEL_API_H
+
+#include <stdint.h>
+
+#include <pto/comm/comm_types.hpp>
+#include <pto/comm/pto_comm_inst.hpp>
+
+#include "intrinsic.h"
+#include "aicore_completion_mailbox_types.h"
+#include "pto_completion_token.h"
+#include "pto_runtime_status.h"
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+#ifndef __gm__
+#define __gm__
+#endif
+
+// Public surface: get_async_ctx, async_ctx_is_deferred,
+// register_completion_condition, send_notification,
+// save_expected_notification_counter. Everything else lives in
+// pto2::detail and is reserved for backend adapters / internal use.
+namespace pto2::detail {
+
+inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
+    if (ctx.completion_count == nullptr) return;  // no completion slab attached: nothing to do
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    dcci((__gm__ int32_t *)line, SINGLE_CACHE_LINE);  // cache op on the aligned line holding the counter; NOTE(review): presumably discards a stale copy -- confirm
+#else
+    __asm__ __volatile__("" ::: "memory");  // other builds: compiler-level memory barrier only
+#endif
+}
+
+inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) {
+    // Stores error_code through completion_error_code, but only for a valid task token with an error slot.
+    if (!ctx.task_token.is_valid() || ctx.completion_error_code == nullptr) return;
+    *ctx.completion_error_code = error_code;
+}
+
+inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) {
+    if (addr == nullptr || size_bytes == 0) return;  // empty range: nothing to flush
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);  // round down
+    uintptr_t end =
+        (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) {  // one cache op per PTO2_ALIGN_SIZE line in [start, end)
+        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
+    }
+#else
+    (void)addr;  // other builds: no cache maintenance is issued
+    (void)size_bytes;
+#endif
+}
+
+inline __aicore__ void defer_flush(AsyncCtx &ctx) {
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return;  // not deferred / no slab
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uint32_t count = *ctx.completion_count;
+    if (count > ctx.completion_capacity) {  // clamp so the flushed span never exceeds the entry capacity
+        count = ctx.completion_capacity;
+    }
+    uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count));  // counter itself
+    if (ctx.completion_error_code != nullptr) {
+        flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));  // plus the error code
+    }
+    if (ctx.completion_entries != nullptr) {
+        flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));  // plus the used entries
+    }
+    defer_flush_range(ctx.completion_count, flush_bytes);  // NOTE(review): assumes counter, error code, entries are contiguous from completion_count -- confirm
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+    pipe_barrier(PIPE_ALL);  // barriers follow the flush; their exact semantics come from the toolchain intrinsics
+#else
+    (void)ctx;
+    __asm__ __volatile__("" ::: "memory");  // other builds: compiler-level memory barrier only
+#endif
+}
+
+}  // namespace pto2::detail
+
+inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
+    const uintptr_t local_ctx_addr = static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]);
+    __gm__ LocalContext *src = reinterpret_cast<__gm__ LocalContext *>(local_ctx_addr);
+    AsyncCtx out{};  // filled field by field from src->async_ctx
+    out.completion_count = src->async_ctx.completion_count;
+    out.completion_error_code = src->async_ctx.completion_error_code;
+    out.completion_entries = src->async_ctx.completion_entries;
+    out.completion_capacity = src->async_ctx.completion_capacity;
+    out.task_token.raw = src->async_ctx.task_token.raw;
+    pto2::detail::defer_load_slab(out);
+    return out;
+}
+// A context counts as deferred exactly when its task token is valid.
+inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); }
+
+// Canonical writer used by backend submit handlers: appends `token` to the AsyncCtx slab as one
+// DeferredCompletionEntry and then increments completion_count.
+// Returns true when the entry was stored. Returns false when ctx is not a deferred context or
+// lacks slab storage, and also when the slab is full -- in the full case
+// PTO2_ERROR_ASYNC_WAIT_OVERFLOW is additionally stored through ctx.completion_error_code.
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) {
+    if (ctx.task_token.is_invalid()) return false;  // not a deferred context
+    if (ctx.completion_count == nullptr || ctx.completion_entries == nullptr) return false;  // no slab storage
+    // Slab already at capacity: record the overflow when an error slot exists, then refuse the token.
+    const uint32_t used = *ctx.completion_count;
+    if (used >= ctx.completion_capacity) {
+        if (ctx.completion_error_code != nullptr) {
+            *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+        }
+        return false;
+    }
+
+    // Populate the next free entry first; the counter is bumped only after every field is written.
+    volatile __gm__ DeferredCompletionEntry *entry = &ctx.completion_entries[used];
+    entry->addr = token.addr;
+    entry->expected_value = token.expected_value;
+    entry->engine = token.engine;
+    entry->completion_type = token.completion_type;
+    entry->_pad = 0;
+    *ctx.completion_count = used + 1;
+    return true;
+}
+
+inline __aicore__ void
+send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) {
+    __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr));
+    pto::comm::Signal signal(counter);  // wrap the counter (volatile dropped, viewed as int32_t) for the comm API
+    pto::comm::TNOTIFY(signal, value, notify_op);  // value and notify_op are passed through unchanged
+}
+
+inline __aicore__ void
+save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) {
+    CompletionToken token{
+        reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0
+    };
+    (void)register_completion_condition(ctx, token);  // result deliberately ignored here
+    pto2::detail::defer_flush(ctx);  // runs whether or not the registration succeeded
+}
+
+#endif  // PTO_ASYNC_KERNEL_API_H
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_async_wait.h b/src/a2a3/runtime/host_build_graph/runtime/pto_async_wait.h
new file mode 100644
index 000000000..65608ad2f
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_async_wait.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PTO_ASYNC_WAIT_H
+#define PTO_ASYNC_WAIT_H
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+
+#include "aicpu/platform_regs.h"
+#include "backend/sdma/sdma_completion_scheduler.h"
+#include "intrinsic.h"
+#include "aicore_completion_mailbox.h"
+#include "pto_completion_token.h"
+#include "pto_runtime2_types.h"
+
+struct PTO2SchedulerState;
+struct PTO2LocalReadyBuffer;
+struct CompletionStats;
+
+inline constexpr int32_t MAX_ASYNC_WAITS = 64;
+
+// The mailbox transport (has_pending / try_push_condition /
+// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member
+// functions in aicore_completion_mailbox.h. This file only holds the
+// application layer: translating drained messages into wait-list state.
+
+inline uintptr_t mailbox_cache_line(const volatile void *addr) {
+    return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);  // round down to PTO2_ALIGN_SIZE
+}
+
+struct CompletionCondition;
+
+using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &);
+using CompletionRetireFn = void (*)(CompletionCondition &);
+
+struct CompletionBackendOps {
+    CompletionPollFn poll;      // reports PENDING / READY / FAILED for one condition
+    CompletionRetireFn retire;  // run at most once per condition when called via CompletionCondition::retire()
+};
+
+struct CompletionCondition {
+    AsyncEngine engine{ASYNC_ENGINE_SDMA};             // engine tag copied from the registering message
+    int32_t completion_type{COMPLETION_TYPE_COUNTER};  // index into the per-type ops table
+    bool satisfied{false};                             // when set, test() returns READY without polling
+    bool retired{false};                               // set by retire(); later retire() calls are no-ops
+    volatile uint32_t *counter_addr{nullptr};          // typed view of addr; set only for COMPLETION_TYPE_COUNTER
+    uint64_t addr{0};                                  // raw address handed to the backend helpers
+    uint32_t expected_value{0};                        // counter threshold, compared with >= in counter_poll_op
+
+    CompletionPollResult test() const;
+    void retire();
+};
+
+// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in
+// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin
+// glue mapping CompletionCondition.addr into the backend's raw-addr helpers.
+inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) {
+    const volatile uint32_t *counter = cond.counter_addr;
+    if (counter == nullptr) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    // READY once the observed counter has reached the expected value; otherwise still PENDING.
+    const bool reached = *counter >= cond.expected_value;
+    return {reached ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
+}
+
+inline void counter_retire_op(CompletionCondition & /*cond*/) {}  // deliberately empty: nothing to do on retire
+
+// SDMA event-record ops: both simply hand the condition's raw addr to the backend helpers.
+inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) {
+    return poll_sdma_event_record(cond.addr);
+}
+inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); }
+
+inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) {
+    // Dispatch table indexed directly by the COMPLETION_TYPE_* value; out-of-range types yield nullptr.
+    static const CompletionBackendOps kOps[] = {
+        {counter_poll_op, counter_retire_op},                      // COMPLETION_TYPE_COUNTER = 0
+        {sdma_event_record_poll_op, sdma_event_record_retire_op},  // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1
+    };
+    constexpr int kOpsCount = static_cast<int>(sizeof(kOps) / sizeof(kOps[0]));
+    return (completion_type >= 0 && completion_type < kOpsCount) ? &kOps[completion_type] : nullptr;
+}
+
+inline CompletionPollResult CompletionCondition::test() const {
+    // A condition already marked satisfied reports READY without touching the backend.
+    if (satisfied) return {CompletionPollState::READY, PTO2_ERROR_NONE};
+    const CompletionBackendOps *backend = completion_backend_ops_for(completion_type);
+    const bool pollable = backend != nullptr && backend->poll != nullptr;
+    if (!pollable) {
+        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    }
+    return backend->poll(*this);
+}
+
+inline void CompletionCondition::retire() {
+    if (retired) return;  // already retired: the backend hook is not run a second time
+    // An unknown completion type (or a missing hook) skips the hook but is still marked retired.
+    const CompletionBackendOps *backend = completion_backend_ops_for(completion_type);
+    const bool has_hook = backend != nullptr && backend->retire != nullptr;
+    if (has_hook) backend->retire(*this);
+    retired = true;
+}
+
+struct AsyncWaitEntry {
+    PTO2TaskSlotState *slot_state{nullptr};        // filled from the TASK_NORMAL_DONE message; null until then
+    PTO2TaskId task_token{PTO2TaskId::invalid()};  // key matched by AsyncWaitList::find_entry_by_token()
+    CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK];  // filled by append_condition_locked()
+    int32_t condition_count{0};           // number of valid entries in conditions[]
+    int32_t waiting_completion_count{0};  // incremented once per appended condition
+    bool normal_done{false};              // set once the TASK_NORMAL_DONE message has been drained
+};
+
+struct AsyncPollResult {
+    int32_t completed{0};                           // presumably the number of tasks completed — confirm
+    int32_t error_code{PTO2_ERROR_NONE};            // defaults to "no error"
+    PTO2TaskSlotState *failed_slot_state{nullptr};  // NOTE(review): looks like the failing task's slot — confirm
+};
+
+inline const char *async_engine_name(AsyncEngine engine) {
+    // Engines without a branch below keep the "UNKNOWN" label.
+    const char *label = "UNKNOWN";
+    if (engine == ASYNC_ENGINE_SDMA) {
+        label = "SDMA";
+    } else if (engine == ASYNC_ENGINE_ROCE) {
+        label = "ROCE";
+    } else if (engine == ASYNC_ENGINE_URMA) {
+        label = "URMA";
+    } else if (engine == ASYNC_ENGINE_CCU) {
+        label = "CCU";
+    }
+    return label;
+}
+
+struct AsyncWaitList {
+    std::atomic<int32_t> busy{0};
+    AsyncWaitEntry entries[MAX_ASYNC_WAITS];
+    int32_t count{0};
+    // Diagnostic: counts every FIN-side try_push that hit a full mailbox.
+    // Expected to stay zero on real workloads (ring is 4096 entries); a
+    // non-zero value means consumers are too slow or the ring is undersized.
+    // Read by scheduler shutdown / l2 perf summary; not on the hot path.
+    std::atomic<uint64_t> mpsc_skipped_count{0};
+
+    bool try_lock() {
+        int32_t unlocked = 0;  // the CAS succeeds only while busy still holds 0
+        return busy.compare_exchange_strong(unlocked, 1, std::memory_order_acquire, std::memory_order_relaxed);
+    }
+
+    void unlock() { busy.store(0, std::memory_order_release); }  // release store; try_lock() acquires on success
+
+    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) {
+        for (AsyncWaitEntry *it = entries; it < entries + count; ++it) {  // scan entries[0, count), first match wins
+            if (it->task_token == token) return it;
+        }
+        return nullptr;
+    }
+
+    // Captures the side-channel a scheduler-aware drain needs to complete
+    // NotDeferred tasks inline (without storing a transient entry in
+    // entries[]).
+    struct DrainCompletionSink {
+        PTO2SchedulerState *sched{nullptr};  // null => drain only materializes entries (no inline completion)
+        PTO2LocalReadyBuffer *local_bufs{nullptr};
+        PTO2TaskSlotState **deferred_release_slot_states{nullptr};  // overflow here fails inline completion
+        int32_t *deferred_release_count{nullptr};
+        int32_t deferred_release_capacity{0};  // presumably the length of deferred_release_slot_states — confirm
+        int32_t inline_completed{0};           // NOTE(review): looks like a count of inline completions — confirm
+#if PTO2_SCHED_PROFILING
+        int32_t thread_idx{0};
+#endif
+
+        bool can_inline_complete() const { return sched != nullptr; }
+    };
+
+    // Inline-complete a NotDeferred task during drain. Returns false on
+    // deferred_release_slot_states overflow.
+    bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
+
+    // Single-consumer drain: pop each published message in tail order and fold
+    // it into wait-list state. An empty sink (sched == nullptr) only materializes
+    // entries; a sched-aware sink also inline-completes NORMAL_DONEs that have no
+    // entry. Returns the number of messages popped; error_code is reset on entry.
+    int32_t drain_aicore_completion_mailbox_locked(
+        AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code
+    ) {
+        error_code = PTO2_ERROR_NONE;
+        if (aicore_mailbox == nullptr) return 0;
+
+        int32_t drained = 0;
+        AICoreCompletionMsgView msg;
+        // try_pop is the transport layer (seq-gated, in-order dequeue); this
+        // loop is the application layer (translate each message into wait-list
+        // state). try_pop returns false at the first gap or when empty.
+        while (aicore_mailbox->try_pop(msg)) {
+            drained++;  // counted even when translating the message fails below
+            if (msg.kind == MSG_KIND_CONDITION) {
+                AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
+                if (entry == nullptr) {
+                    // First message for this task — materialize the entry here.
+                    // slot_state stays null until the matching TASK_NORMAL_DONE
+                    // sentinel arrives.
+                    if (count >= MAX_ASYNC_WAITS) {
+                        error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+                        return drained;
+                    }
+                    entry = &entries[count++];
+                    entry->task_token = msg.task_token;
+                    entry->slot_state = nullptr;
+                    entry->condition_count = 0;
+                    entry->waiting_completion_count = 0;
+                    entry->normal_done = false;
+                }
+                if (!append_condition_locked(
+                        *entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type,
+                        error_code
+                    )) {
+                    return drained;
+                }
+            } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) {
+                PTO2TaskSlotState *slot_state_ptr =
+                    reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
+                AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
+                if (entry == nullptr) {
+                    // Producers strictly order: all CONDITIONs for token T are
+                    // pushed before the matching NORMAL_DONE (the acq_rel on
+                    // on_subtask_complete enforces this across producers). So
+                    // observing NORMAL_DONE first => the task registered no
+                    // conditions => NotDeferred. Complete it inline when the
+                    // sink allows; otherwise fall back to the entry-store path.
+                    if (sink.can_inline_complete()) {
+                        (void)try_inline_complete_locked(sink, *slot_state_ptr);  // NOTE(review): result dropped
+                        continue;
+                    }
+                    if (count >= MAX_ASYNC_WAITS) {
+                        error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
+                        return drained;
+                    }
+                    entry = &entries[count++];
+                    entry->task_token = msg.task_token;
+                    entry->slot_state = slot_state_ptr;
+                    entry->condition_count = 0;
+                    entry->waiting_completion_count = 0;
+                    entry->normal_done = true;
+                } else {
+                    if (entry->slot_state == nullptr) {  // e.g. entry created by an earlier CONDITION message
+                        entry->slot_state = slot_state_ptr;
+                    }
+                    entry->normal_done = true;
+                }
+            } else {
+                error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;  // unrecognized msg.kind
+                return drained;
+            }
+        }
+        return drained;
+    }
+
+    bool append_condition_locked(
+        AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type,
+        int32_t &error_code
+    ) {
+        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) {
+            error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
+            return false;
+        }
+        // Counter conditions also get a typed pointer view of addr; other types leave it null.
+        const bool is_counter = completion_type == COMPLETION_TYPE_COUNTER;
+        CompletionCondition &slot = entry.conditions[entry.condition_count++];
+        slot.engine = engine;
+        slot.completion_type = completion_type;
+        slot.satisfied = false;
+        slot.retired = false;
+        slot.addr = addr;
+        slot.counter_addr = is_counter ? reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) : nullptr;
+        slot.expected_value = expected_value;
+        entry.waiting_completion_count++;
+        return true;
+    }
+
+    template <bool Profiling>
+    AsyncPollResult poll_and_complete(
+        AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+        PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count,
+        int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+        ,
+        int thread_idx
+#endif
+    );
+};
+
+#endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_completion_token.h b/src/a2a3/runtime/host_build_graph/runtime/pto_completion_token.h
new file mode 100644
index 000000000..c5a8c345f
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_completion_token.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
+
+#include <stdint.h>
+
+#include "aicore_completion_mailbox_types.h"
+#include "pto_runtime_status.h"
+
+// CompletionToken is the runtime-internal POD that backend submit handlers
+// produce and the generic register_completion_condition() consumes. It is the
+// ABI contract for "this is one completion to wait on" — independent of which
+// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's
+// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by
+// completion_type.
+struct CompletionToken {
+    uint64_t addr;            // address the waiter polls for this completion
+    uint32_t expected_value;  // threshold used by counter-type completions
+    uint32_t engine;          // engine tag, carried as a raw integer
+    int32_t completion_type;  // selects the (poll, retire) pair in the ops table
+    uint64_t backend_cookie;  // NOTE(review): no reader visible in this header — confirm its consumers
+};
+
+enum class CompletionPollState : uint8_t {
+    PENDING = 0,  // also the default state of a CompletionPollResult
+    READY = 1,    // the completion has been observed
+    FAILED = 2,   // polling could not be performed; see the accompanying error_code
+};
+
+struct CompletionPollResult {
+    CompletionPollState state{CompletionPollState::PENDING};  // outcome of one poll
+    int32_t error_code{PTO2_ERROR_NONE};                      // PTO2_ERROR_* value; NONE by default
+};
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_constants.h b/src/a2a3/runtime/host_build_graph/runtime/pto_constants.h
new file mode 100644
index 000000000..07251cc39
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_constants.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
+
+#define PTO2_ALIGN_SIZE 64             // Cache line alignment
+#define PTO2_PACKED_OUTPUT_ALIGN 1024  // Each output in packed buffer aligned to 1024B; gap is padding
+#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1))  // align: power of two; evaluated twice
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_dep_compute.h b/src/a2a3/runtime/host_build_graph/runtime/pto_dep_compute.h
new file mode 100644
index 000000000..f8392dfbf
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_dep_compute.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file pto_dep_compute.h
+ * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay.
+ *
+ * Header-only entry points (compute_task_fanin is the only template):
+ *
+ *   compute_task_fanin     — STEP 3 in submit_task: per-tensor creator retention (Step A)
+ *                            + tensormap.lookup for INPUT/INOUT (Step B). Calls back into
+ *                            user-supplied `emit` for each producer it identifies.
+ *
+ *   register_task_outputs  — STEP 4 in submit_task: tensormap.insert for INOUT and
+ *                            OUTPUT_EXISTING tensors. No callbacks.
+ *
+ * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its
+ * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the
+ * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would
+ * require two emit semantics or a marginal behavior change in transients — not worth
+ * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own.
+ *
+ * The Emit callback contract:
+ *   bool emit(PTO2TaskId producer);
+ *     - return true to continue (whether or not the producer was actually recorded —
+ *       producer-not-alive / dedup-hit / etc. all return true silently)
+ *     - return false to signal fatal (e.g. fanin spill overflow); caller bails
+ *
+ * Performance: Emit is a template parameter, not std::function. Both runtime
+ * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge
+ * vector) instantiate at the call site and inline through. Do NOT replace with
+ * std::function — it would break the inlining and add ~5 ns/call to the orch hot path.
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
+
+#include <cstdint>
+
+#include "pto_task_id.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"  // TensorRef
+#include "tensor.h"
+
+/**
+ * View struct for inputs to compute_task_fanin / register_task_outputs.
+ *
+ * Both runtime and replay assemble one of these from their own data sources
+ * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All
+ * pointer arrays must remain valid for the duration of the call.
+ */
+struct DepInputs {
+    int32_t tensor_count;             // number of entries in tensors / arg_types
+    const TensorRef *tensors;         // length = tensor_count (union; OUTPUT slots' .ptr is unused)
+    const TensorArgType *arg_types;   // length = tensor_count
+    int32_t explicit_dep_count;       // number of entries in explicit_deps
+    const PTO2TaskId *explicit_deps;  // length = explicit_dep_count (validity checked by caller)
+};
+
+/**
+ * Compute fanin for a task being submitted (STEP 3: Step A creator retention +
+ * Step B tensormap modifier lookup).
+ *
+ * For each non-OUTPUT tensor:
+ *   - If owner_task_id is valid, emit(owner)
+ *   - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit
+ *     each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry).
+ *
+ * @return true on success (or producer-skipped-silently); false if emit signaled
+ *         fatal — caller should propagate (after any fatal bookkeeping done by emit).
+ */
+template <typename Emit>
+[[nodiscard]] inline bool
+compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) {
+    // Inside a manual scope no fanin is derived from tensors at all.
+    if (in_manual_scope) return true;
+
+    for (int32_t idx = 0; idx < inputs.tensor_count; ++idx) {
+        const TensorArgType kind = inputs.arg_types[idx];
+        // Runtime-created OUTPUT tensors have no dependencies, so they are
+        // never looked up in the TensorMap.
+        if (kind == TensorArgType::OUTPUT) continue;
+
+        const Tensor &tensor = inputs.tensors[idx].ref();
+
+        // Step A: creator retention — every existing tensor extends the
+        // lifetime of the task that created it.
+        PTO2TaskId creator = tensor.owner_task_id;
+        if (creator.is_valid() && !emit(creator)) {
+            return false;
+        }
+
+        // Step B: modifier dependency lookup. Applies to INPUT/INOUT only,
+        // and is skipped for tensors flagged manual_dep.
+        const bool is_inout = kind == TensorArgType::INOUT;
+        const bool needs_lookup = kind == TensorArgType::INPUT || is_inout;
+        if (!needs_lookup || tensor.manual_dep) {
+            continue;
+        }
+
+        // A fatal emit() inside the walk is surfaced through this flag.
+        bool emit_failed = false;
+        tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
+            const bool keep_going = emit(entry.producer_task_id);
+            if (!keep_going) {
+                emit_failed = true;
+                return false;  // abort the lookup walk
+            }
+            // INOUT + COVERED: drop the matched entry from the map.
+            const bool covered = overlap_status == OverlapStatus::COVERED;
+            if (is_inout && covered) {
+                tensor_map.remove_entry(entry);
+            }
+            return true;
+        });
+        if (emit_failed) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Register a task's outputs in the tensormap (STEP 4 in submit_task).
+ *
+ * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the
+ * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer.
+ *
+ * No-op when in_manual_scope.
+ */
+inline void
+register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) {
+    if (in_manual_scope) return;  // manual scopes register nothing
+
+    for (int32_t idx = 0; idx < inputs.tensor_count; ++idx) {
+        const TensorArgType kind = inputs.arg_types[idx];
+        // Only INOUT / OUTPUT_EXISTING arguments are registered.
+        const bool registrable = kind == TensorArgType::INOUT || kind == TensorArgType::OUTPUT_EXISTING;
+        if (!registrable) continue;
+
+        const Tensor &tensor = inputs.tensors[idx].ref();
+        if (tensor.manual_dep) continue;  // manual_dep tensors stay out of the map
+        tensor_map.insert(tensor, task_id);
+    }
+}
+
+/**
+ * Count the tensormap entries register_task_outputs() will insert for this task.
+ *
+ * Mirrors register_task_outputs()'s selection exactly (INOUT / OUTPUT_EXISTING,
+ * excluding manual_dep), so the returned value is the precise number of
+ * new_entry() calls that step makes. The orchestrator uses it to reserve pool
+ * capacity before inserting. Returns 0 in a manual scope (no registration).
+ */
+inline int32_t count_registrable_outputs(const DepInputs &inputs, bool in_manual_scope) {
+    if (in_manual_scope) return 0;  // nothing is registered in a manual scope
+
+    int32_t total = 0;
+    for (int32_t idx = 0; idx < inputs.tensor_count; ++idx) {
+        // Same selection as register_task_outputs(): INOUT / OUTPUT_EXISTING
+        // arguments whose tensor is not flagged manual_dep.
+        const TensorArgType kind = inputs.arg_types[idx];
+        const bool registrable = kind == TensorArgType::INOUT || kind == TensorArgType::OUTPUT_EXISTING;
+        if (registrable && !inputs.tensors[idx].ref().manual_dep) {
+            total++;
+        }
+    }
+    return total;
+}
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_orchestrator.h b/src/a2a3/runtime/host_build_graph/runtime/pto_orchestrator.h
new file mode 100644
index 000000000..6250f0489
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_orchestrator.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Orchestrator Interface
+ *
+ * The Orchestrator is responsible for:
+ * 1. Executing the orchestration function (Turing-complete control flow)
+ * 2. Allocating intermediate buffers from the heap
+ * 3. Submitting tasks via async InCore function calls
+ * 4. Building the dependency graph using TensorMap
+ * 5. Managing buffer scopes for lifecycle control
+ *
+ * In the host_build_graph runtime the Orchestrator runs on the host CPU only:
+ * it builds the whole task graph before the device starts, and the device then
+ * boots scheduler-only (there is no on-device orchestration path).
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef PTO_ORCHESTRATOR_H
+#define PTO_ORCHESTRATOR_H
+
+#include "common/l2_swimlane_profiling.h"
+#include "utils/device_arena.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_submit_types.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_types.h"
+
+/**
+ * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds
+ * arena offsets for every sub-region the orchestrator owns (the fanin pool and
+ * its seen-epoch array, the scope arrays, plus the nested PTO2TensorMap layout).
+ */
+struct PTO2OrchestratorLayout {
+    size_t off_fanin_pool;
+    size_t off_fanin_seen_epoch;
+    size_t off_scope_tasks;
+    size_t off_scope_begins;
+    PTO2TensorMapLayout tensor_map;  // nested layout for the orchestrator's PTO2TensorMap
+    int32_t dep_pool_capacity;
+    int32_t scope_tasks_cap;
+    uint64_t scope_stack_capacity;
+};
+
+// =============================================================================
+// Orchestrator State
+// =============================================================================
+
+/**
+ * Orchestrator state structure (private to Orchestrator)
+ *
+ * Contains all state needed for task graph construction and buffer management.
+ */
+struct PTO2OrchestratorState {
+    // === SHARED MEMORY ACCESS ===
+    PTO2SharedMemoryHeader *sm_header;
+
+    // === RING RESOURCES (single ring) ===
+    PTO2RingSet ring;
+    uint32_t *fanin_seen_epoch;
+    uint32_t fanin_seen_current_epoch{1};
+
+    // === TENSOR MAP (Private) ===
+    PTO2TensorMap tensor_map;  // Producer lookup
+
+    // === SCOPE STACK (Private) ===
+    // Single contiguous buffer of task slot-state pointers, partitioned by scope level.
+    // scope_begins[i] is the index into scope_tasks where scope i starts.
+    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
+    PTO2TaskSlotState **scope_tasks;  // Flat buffer of PTO2TaskSlotState* (all scopes concatenated)
+    int32_t scope_tasks_size;         // Number of entries currently in the buffer
+    int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
+    int32_t *scope_begins;            // scope_begins[i] = start index of scope i in scope_tasks
+    int32_t scope_stack_top;          // Current top of stack (-1 = no scope open)
+    uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
+    int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
+
+    // === SCHEDULER REFERENCE ===
+    // Note: In simulated mode, orchestrator and scheduler share address space
+    // In real mode, they communicate via shared memory only
+    PTO2SchedulerState *scheduler;  // For simulated mode only
+
+    // Total core counts set once at executor init; used for submit-time deadlock detection.
+    int32_t total_cluster_count{0};  // AIC cores = MIX clusters
+    int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
+#if PTO2_PROFILING
+    // L2 swimlane_level copied from get_l2_swimlane_level().
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // === GM HEAP (for output buffers) ===
+    void *gm_heap_base;     // Base address of GM heap
+    uint64_t gm_heap_size;  // Total size of GM heap
+
+    // === FATAL ERROR ===
+    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
+    // Cross-thread notification uses shared memory orch_error_code (atomic)
+    bool fatal;
+
+    // Hidden alloc tasks complete synchronously inside the orchestrator and
+    // therefore bypass the executor's normal worker-completion counter path.
+    // The executor adds this count into its completed_tasks_ progress counter
+    // after orchestration finishes so shutdown/profiling totals remain closed.
+    int64_t inline_completed_tasks{0};
+
+    // === STATISTICS ===
+#if PTO2_PROFILING
+    int64_t tasks_submitted;
+    int64_t buffers_allocated;
+    int64_t bytes_allocated;
+#endif
+
+    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
+
+    // === Cold-path API (defined in pto_orchestrator.cpp) ===
+
+    // Phase 1: declare every sub-region (fanin pool, scope arrays,
+    // tensor_map sub-layout) on the supplied arena. task_window_size feeds
+    // the nested tensor_map layout. Returned layout is consumed by
+    // init_data_from_layout / wire_arena_pointers.
+    static PTO2OrchestratorLayout
+    reserve_layout(DeviceArena &arena, int32_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the SM address arithmetic. Safe to call on a host
+    // arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
+    );
+
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, ring.fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
+    // Forget pointers; arena owns the backing buffers.
+    void destroy();
+    void set_scheduler(PTO2SchedulerState *scheduler);
+    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
+    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
+    void end_scope();
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    TaskOutputTensors submit_dummy_task(const L0TaskArgs &args);
+    TaskOutputTensors alloc_tensors(const L0TaskArgs &args);
+    void mark_done();
+};
+
+// =============================================================================
+// Orchestrator Profiling Data
+// =============================================================================
+
+#if PTO2_ORCH_PROFILING
+struct PTO2OrchProfilingData {
+    uint64_t sync_cycle;   // NOTE(review): *_cycle fields look like per-phase cycle totals — confirm in the .cpp
+    uint64_t alloc_cycle;  // Combined task slot + heap allocation
+    uint64_t args_cycle;
+    uint64_t lookup_cycle;
+    uint64_t insert_cycle;
+    uint64_t fanin_cycle;
+    uint64_t scope_end_cycle;
+    int64_t submit_count;  // presumably the number of profiled submit calls — confirm
+    // Wait time tracking for blocking phases
+    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
+    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
+    // Atomic operation counts per phase
+    uint64_t alloc_atomic_count;
+    uint64_t args_atomic_count;
+    uint64_t scope_end_atomic_count;
+};
+
+PTO2OrchProfilingData orchestrator_get_profiling();
+#endif
+
+#endif  // PTO_ORCHESTRATOR_H
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_ring_buffer.h b/src/a2a3/runtime/host_build_graph/runtime/pto_ring_buffer.h
new file mode 100644
index 000000000..2be9171c2
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_ring_buffer.h
@@ -0,0 +1,774 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Data Structures
+ *
+ * Implements ring buffer designs for zero-overhead memory management:
+ *
+ * 1. TaskAllocator - Unified task slot + output buffer allocation
+ *    - Combines task ring (slot allocation) and heap ring (output buffer allocation)
+ *    - Single spin-wait loop with unified back-pressure and deadlock detection
+ *    - O(1) bump allocation for both task slots and heap buffers
+ *
+ * 2. FaninPool - Fanin spill entry allocation
+ *    - Ring buffer for spilled fanin entries
+ *    - O(1) append allocation
+ *    - Implicit reclamation with task ring
+ *
+ * 3. DepListPool - Dependency list entry allocation
+ *    - Ring buffer for linked list entries
+ *    - O(1) prepend operation
+ *    - Implicit reclamation with task ring
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef PTO_RING_BUFFER_H
+#define PTO_RING_BUFFER_H
+
+#include <algorithm>
+#include <inttypes.h>
+#include <type_traits>
+
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "aicpu/device_time.h"       // get_sys_cnt_aicpu (deadlock wall-clock backstop)
+#include "common/platform_config.h"  // PLATFORM_PROF_SYS_CNT_FREQ (deadlock wall-clock)
+#include "common/unified_log.h"
+
+#if PTO2_PROFILING
+// Heap-ring wrap reporting — the allocator is the only place each individual
+// wrap is observable, so it notifies the scope_stats collector here. Gated:
+// pays nothing (no include, no call) when profiling is compiled out.
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
+// Block notification interval (in spin counts)
+#define PTO2_BLOCK_NOTIFY_INTERVAL 10000
+// Heap/task deadlock is detected structurally (head task COMPLETED + all
+// consumers released + scope still open -> only scope_end can free it, which a
+// blocked orchestrator can never reach). This wall-clock value is only a
+// backstop for the residual case the structural test can't prove locally; it is
+// an ABSOLUTE TIME (not a spin count), so it is stable across chips/contention.
+#define PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES (PLATFORM_PROF_SYS_CNT_FREQ / 2)  // 500 ms
+
+// Dep pool spin limit - if exceeded, dep pool capacity too small for workload
+#define PTO2_DEP_POOL_SPIN_LIMIT 100000
+
+// =============================================================================
+// Task Allocator (unified task slot + heap buffer allocation)
+// =============================================================================
+
+/**
+ * Unified task slot + heap buffer allocator.
+ *
+ * Since task and heap are always allocated together and the orchestrator is
+ * single-threaded, both pointers (task index, heap top) are tracked locally
+ * and published to shared memory via plain store — no fetch_add or CAS needed.
+ *
+ * The alloc() method checks both resources BEFORE committing to either,
+ * eliminating the need for rollback on partial failure.
+ */
+class PTO2TaskAllocator {
+public:
+    /**
+     * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
+     */
+    void init(
+        PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        PTO2TaskSlotState *slot_states = nullptr, int32_t initial_local_task_id = 0
+    ) {
+        descriptors_ = descriptors;
+        slot_states_ = slot_states;
+        window_size_ = window_size;
+        window_mask_ = window_size - 1;
+        current_index_ptr_ = current_index_ptr;
+        last_alive_ptr_ = last_alive_ptr;
+        heap_base_ = heap_base;
+        heap_size_ = heap_size;
+        error_code_ptr_ = error_code_ptr;
+        local_task_id_ = initial_local_task_id;
+        heap_top_ = 0;
+        heap_tail_ = 0;
+        last_alive_seen_ = 0;
+    }
+
+    /**
+     * Allocate a task slot and its associated output buffer in one call.
+     *
+     * Both task index and heap top are maintained as local counters and
+     * published to shared memory only on success. Since the orchestrator is
+     * single-threaded, no CAS or fetch_add is needed — just check-then-commit.
+     *
+     * @param output_size  Total packed output size in bytes (0 = no heap needed)
+     * @return Allocation result; {-1, -1, nullptr, nullptr} on deadlock or a latched fatal (check failed())
+     */
+    PTO2TaskAllocResult alloc(int32_t output_size) {
+        uint64_t aligned_size =
+            output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
+
+        int spin_count = 0;
+        int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        int32_t last_alive = prev_last_alive;
+        update_heap_tail(last_alive);
+        bool blocked_on_heap = false;
+        uint64_t block_cycle0 = 0;  // wall-clock anchor for the deadlock backstop
+        bool block_timing = false;  // false until the first no-reclaim-progress spin
+#if PTO2_ORCH_PROFILING
+        uint64_t wait_start = 0;
+        bool waiting = false;
+#endif
+
+        while (true) {
+            // Check both resources; commit only if both available
+            if (local_task_id_ - last_alive + 1 < window_size_) {
+                void *heap_ptr = try_bump_heap(aligned_size);
+                if (heap_ptr) {
+                    int32_t task_id = commit_task();
+#if PTO2_ORCH_PROFILING
+                    record_wait(spin_count, wait_start, waiting);
+#endif
+                    return {task_id, task_id & window_mask_, heap_ptr, static_cast<char *>(heap_ptr) + aligned_size};
+                }
+                blocked_on_heap = true;
+            } else {
+                blocked_on_heap = false;
+            }
+
+            // Spin: wait for scheduler to advance last_task_alive
+            spin_count++;
+#if PTO2_ORCH_PROFILING
+            if (!waiting) {
+                wait_start = get_sys_cnt_aicpu();
+                waiting = true;
+            }
+#endif
+            last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+            update_heap_tail(last_alive);
+            if (last_alive > prev_last_alive) {
+                // Reclaim advanced -> productive backpressure, not a deadlock.
+                spin_count = 0;
+                prev_last_alive = last_alive;
+                block_timing = false;
+            } else if ((spin_count & 1023) == 0) {
+                // A fatal latched elsewhere (e.g. the scheduler-side wiring
+                // deadlock detector) breaks this otherwise-unbounded spin; the
+                // caller maps the failed alloc to orch_mark_fatal. Polled on the
+                // cold path only -- error_code_ptr_ is orch_error_code.
+                if (error_code_ptr_ != nullptr && error_code_ptr_->load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
+                    return {-1, -1, nullptr, nullptr};
+                }
+                // Reclaim watermark is stuck. Run the deadlock checks only once per 1024
+                // spins: get_sys_cnt_aicpu() is an MMIO read and head_blocked_on_scope_end()
+                // walks the head slot, neither of which needs to fire on every hot spin (1024
+                // spins is far below the wall-clock timeout, so detection latency is unaffected).
+                // (1) Structural, immediate: if the head task is COMPLETED with every consumer
+                // released but its scope still open, only scope_end can free it and a blocked
+                // orchestrator can never call it -> provable deadlock now.
+                if (head_blocked_on_scope_end(last_alive)) {
+                    report_deadlock(output_size, blocked_on_heap, /*scope_gated=*/true);
+                    return {-1, -1, nullptr, nullptr};
+                }
+                // (2) Wall-clock backstop for the residual case the local head
+                // test can't prove (e.g. a closed sibling whose consumer is
+                // deferred). Absolute time, not a spin count.
+                uint64_t now = get_sys_cnt_aicpu();
+                if (!block_timing) {
+                    block_cycle0 = now;
+                    block_timing = true;
+                } else if (now - block_cycle0 >= PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES) {
+                    report_deadlock(output_size, blocked_on_heap, /*scope_gated=*/false);
+                    return {-1, -1, nullptr, nullptr};
+                }
+                // spin_count is a multiple of 1024 here, so `% INTERVAL == 0` would only match every
+                // lcm(1024, 10000) = 640000 spins; warn on the first gated pass past each interval instead.
+                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL < 1024) {
+                    LOG_WARN(
+                        "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d",
+                        local_task_id_ - last_alive, window_size_, heap_top_, heap_size_,
+                        blocked_on_heap ? "heap" : "task", spin_count
+                    );
+                }
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+
+    // =========================================================================
+    // State queries
+    // =========================================================================
+
+    int32_t active_count() const {
+        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        return local_task_id_ - last_alive;
+    }
+
+    // Task ring start/end: tail = oldest live task (last_task_alive), head =
+    // next task id to allocate. head - tail == active_count().
+    int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); }
+    int32_t task_head() const { return local_task_id_; }
+
+    int32_t window_size() const { return window_size_; }
+
+    uint64_t heap_available() const {
+        const uint64_t reclaim = heap_tail_;
+        if (heap_top_ < reclaim) {
+            return reclaim - heap_top_;  // wrapped: single free gap between top and tail
+        }
+        // Not wrapped: free space is split into [top, heap_size) and [0, tail); report the larger run.
+        const uint64_t room_at_end = heap_size_ - heap_top_;
+        return std::max(room_at_end, reclaim);
+    }
+
+    uint64_t heap_top() const { return heap_top_; }
+    // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is the
+    // end (next allocation). (heap_top - heap_tail) mod heap_size == heap_used_bytes().
+    uint64_t heap_tail() const { return heap_tail_; }
+    uint64_t heap_capacity() const { return heap_size_; }
+    uint64_t heap_used_bytes() const {
+        if (heap_size_ == 0) return 0;
+        return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
+    }
+
+private:
+    // --- Task Ring ---
+    PTO2TaskDescriptor *descriptors_ = nullptr;
+    // Parallel to descriptors_, indexed by task_id & window_mask_. Read-only here,
+    // used by the deadlock detector to inspect the head task's state + fanout.
+    PTO2TaskSlotState *slot_states_ = nullptr;
+    int32_t window_size_ = 0;
+    int32_t window_mask_ = 0;
+    std::atomic<int32_t> *current_index_ptr_ = nullptr;
+    std::atomic<int32_t> *last_alive_ptr_ = nullptr;
+
+    // --- Heap ---
+    void *heap_base_ = nullptr;
+    uint64_t heap_size_ = 0;
+
+    // --- Local state (single-writer, no atomics needed) ---
+    int32_t local_task_id_ = 0;    // Next task ID to allocate
+    uint64_t heap_top_ = 0;        // Current heap allocation pointer
+    uint64_t heap_tail_ = 0;       // Heap reclamation pointer (derived from consumed tasks)
+    int32_t last_alive_seen_ = 0;  // last_task_alive at last heap_tail derivation
+
+    // --- Shared ---
+    std::atomic<int32_t> *error_code_ptr_ = nullptr;
+
+    // =========================================================================
+    // Internal helpers
+    // =========================================================================
+
+    /**
+     * Commit a task slot: bump local counter and publish to shared memory.
+     * Must only be called after space check has passed.
+     */
+    int32_t commit_task() {
+        int32_t task_id = local_task_id_++;
+        current_index_ptr_->store(local_task_id_, std::memory_order_release);
+        return task_id;
+    }
+
+    /**
+     * Derive heap_tail_ from the last consumed task's packed_buffer_end.
+     *
+     * Every task has a valid packed_buffer_end (equal to packed_buffer_base
+     * for zero-size allocations), so the last consumed task always determines
+     * the correct heap_tail — no backward scan needed.
+     */
+    void update_heap_tail(int32_t last_alive) {
+        if (last_alive <= last_alive_seen_) return;
+        last_alive_seen_ = last_alive;
+
+        PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_];
+        uint64_t old_tail = heap_tail_;
+        heap_tail_ =
+            static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
+#if PTO2_PROFILING
+        // Reclaim pointer moves forward monotonically in ring order; a decrease
+        // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at
+        // most one wrap per call). Report it so scope_stats can unroll.
+        if (is_scope_stats_enabled() && heap_tail_ < old_tail) {
+            scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM);
+        }
+#else
+        (void)old_tail;
+#endif
+    }
+
+    /**
+     * Bump the heap pointer for the given allocation size.
+     * Returns the allocated pointer, or nullptr if insufficient space.
+     * When alloc_size == 0, returns current position without advancing.
+     */
+    void *try_bump_heap(uint64_t alloc_size) {
+        uint64_t top = heap_top_;
+        if (alloc_size == 0) {
+            return static_cast<char *>(heap_base_) + top;
+        }
+        uint64_t tail = heap_tail_;
+        void *result;
+
+        if (top >= tail) {
+            uint64_t space_at_end = heap_size_ - top;
+            if (space_at_end >= alloc_size) {
+                result = static_cast<char *>(heap_base_) + top;
+                heap_top_ = top + alloc_size;
+            } else if (tail > alloc_size) {
+                LOG_DEBUG(
+                    "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail,
+                    alloc_size
+                );
+                result = heap_base_;
+                heap_top_ = alloc_size;
+#if PTO2_PROFILING
+                // Allocation pointer just wrapped past heap_size_; report it so
+                // scope_stats can unroll the wrapping offset into a monotonic value.
+                // The collector attributes the wrap to the current scope's ring.
+                if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC);
+#endif
+            } else {
+                LOG_DEBUG(
+                    "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
+                    ", heap_size=%" PRIu64,
+                    top, tail, alloc_size, heap_size_
+                );
+                return nullptr;
+            }
+        } else {
+            if (tail - top > alloc_size) {
+                result = static_cast<char *>(heap_base_) + top;
+                heap_top_ = top + alloc_size;
+            } else {
+                LOG_DEBUG(
+                    "try_bump_heap failed (top<tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
+                    ", free_gap=%" PRIu64,
+                    top, tail, alloc_size, tail - top
+                );
+                return nullptr;
+            }
+        }
+
+        return result;
+    }
+
+#if PTO2_ORCH_PROFILING
+    void record_wait(int spin_count, uint64_t wait_start, bool waiting) {
+        if (waiting) {
+            extern uint64_t g_orch_alloc_wait_cycle;
+            g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start);
+        }
+        {
+            extern uint64_t g_orch_alloc_atomic_count;
+            g_orch_alloc_atomic_count += spin_count + 1;
+        }
+    }
+#endif
+
+    /**
+     * Structural deadlock test on the reclaim head.
+     *
+     * The head (oldest un-CONSUMED task, at last_task_alive) gates all
+     * reclamation. If it is COMPLETED and every consumer reference is released
+     * (low bits of fanout_refcount == consumer count) but the scope reference
+     * (bit31) is still unset, the only release left is its scope_end. Because
+     * this is evaluated while the orchestrator is blocked in alloc(), scope_end
+     * can never be reached -> provable deadlock, no timeout required.
+     *
+     * The COMPLETED guard is mandatory: a zero-consumer task has
+     * refcount == 0 == (count & ~SCOPE_BIT) from birth, before it has run.
+     */
+    bool head_blocked_on_scope_end(int32_t head_task_id) const {
+        if (slot_states_ == nullptr) return false;
+        PTO2TaskSlotState &h = slot_states_[head_task_id & window_mask_];
+        if (h.task_state.load(std::memory_order_acquire) != PTO2_TASK_COMPLETED) return false;
+        uint32_t rc = h.fanout_refcount.load(std::memory_order_acquire);
+        return rc == (h.fanout_count & ~PTO2_FANOUT_SCOPE_BIT);
+    }
+
+    /**
+     * Log deadlock diagnostics and latch PTO2_ERROR_HEAP_RING_DEADLOCK (heap_blocked) or
+     * PTO2_ERROR_FLOW_CONTROL_DEADLOCK into error_code_ptr_. scope_gated == true: the head-of-line
+     * structural test proved it (waiting only on scope_end); false: the wall-clock backstop fired.
+     */
+    void report_deadlock(int32_t requested_output_size, bool heap_blocked, bool scope_gated) {
+        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
+        int32_t active_tasks = local_task_id_ - last_alive;
+        uint64_t htail = heap_tail_;
+
+        LOG_ERROR("========================================");
+        if (heap_blocked) {
+            LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!");
+        } else {
+            LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!");
+        }
+        LOG_ERROR("========================================");
+        if (scope_gated) {
+            LOG_ERROR("Head task %d COMPLETED, all consumers released, scope still open ->", last_alive);
+            LOG_ERROR("only scope_end can free it and the orchestrator is blocked here.");
+            LOG_ERROR("Provable head-of-line deadlock.");
+        } else {
+            LOG_ERROR(
+                "No reclaim progress for ~500 ms (%" PRIu64 " cycles wall clock).",
+                (uint64_t)PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES
+            );
+        }
+        LOG_ERROR(
+            "  Task ring:  current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks,
+            window_size_, 100.0 * active_tasks / window_size_
+        );
+        LOG_ERROR(
+            "  Heap ring:  top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail,
+            heap_size_, heap_available()
+        );
+        if (heap_blocked) {
+            LOG_ERROR("  Requested:  %d bytes", requested_output_size);
+        }
+        // Head-task state dump: what the reclaim watermark is actually waiting on.
+        if (slot_states_ != nullptr) {
+            PTO2TaskSlotState &h = slot_states_[last_alive & window_mask_];
+            uint32_t fc = h.fanout_count;
+            uint32_t rc = h.fanout_refcount.load(std::memory_order_acquire);
+            LOG_ERROR(
+                "  Head task %d: state=%d, consumers=%u/%u, scope_released=%d", last_alive,
+                static_cast<int>(h.task_state.load(std::memory_order_acquire)), rc & ~PTO2_FANOUT_SCOPE_BIT,
+                fc & ~PTO2_FANOUT_SCOPE_BIT, (rc & PTO2_FANOUT_SCOPE_BIT) ? 1 : 0
+            );
+        }
+        LOG_ERROR("Solution:");
+        if (scope_gated) {
+            LOG_ERROR("  The open scope's own allocation exceeds this ring. Either:");
+            LOG_ERROR("  1. Split the scope / reduce per-scope allocation (reclaim sooner), or");
+            LOG_ERROR("  2. Size the ring >= the scope's peak live-set (heap*2 may not be enough).");
+        } else if (heap_blocked) {
+            LOG_ERROR(
+                "  Increase heap (current: %" PRIu64 "); env PTO2_RING_HEAP=<pow2> (e.g. %" PRIu64 ")", heap_size_,
+                heap_size_ * 2
+            );
+        } else {
+            LOG_ERROR(
+                "  Increase task window (current: %d); env PTO2_RING_TASK_WINDOW=<pow2> (e.g. %d)", window_size_,
+                window_size_ * 2  // doubling keeps the hint a power of two, unlike active_tasks * 2
+            );
+        }
+        LOG_ERROR("========================================");
+        if (error_code_ptr_) {
+            int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
+            error_code_ptr_->store(code, std::memory_order_release);
+        }
+    }
+};
+
+// =============================================================================
+// Fanin Spill Pool
+// =============================================================================
+
+/**
+ * Fanin spill pool structure
+ *
+ * True ring buffer for allocating spilled fanin entries.
+ * Entries are reclaimed when their consumer tasks become CONSUMED.
+ *
+ * Linear counters (top, tail) grow monotonically; the physical index
+ * is obtained via modulo: base[linear_index % capacity].
+ */
+struct PTO2FaninPool {
+    PTO2FaninSpillEntry *base;       // Pool base address
+    int32_t capacity;                // Total number of entries
+    int32_t top;                     // Linear next-allocation counter (starts from 1)
+    int32_t tail;                    // Linear first-alive counter (entries before this are dead)
+    int32_t high_water;              // Peak concurrent usage (top - tail)
+    int32_t reclaim_task_cursor{0};  // Last task id scanned for reclaim on this pool
+
+    std::atomic<int32_t> *error_code_ptr = nullptr;
+
+    void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
+        base = in_base;
+        capacity = in_capacity;
+        top = 1;
+        tail = 1;
+        high_water = 0;
+        reclaim_task_cursor = 0;
+        base[0].slot_state = nullptr;
+        error_code_ptr = in_error_code_ptr;
+    }
+
+    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
+
+    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
+
+    PTO2FaninSpillEntry *alloc() {
+        int32_t used = top - tail;
+        if (used >= capacity) {
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Fanin Spill Pool Overflow!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity);
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
+            LOG_ERROR("========================================");
+            if (error_code_ptr) {
+                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
+            }
+            return nullptr;
+        }
+        int32_t idx = top % capacity;
+        top++;
+        used++;
+        if (used > high_water) high_water = used;
+        return &base[idx];
+    }
+
+    void advance_tail(int32_t new_tail) {
+        if (new_tail > tail) {
+            tail = new_tail;
+        }
+    }
+
+    int32_t used() const { return top - tail; }
+
+    int32_t available() const { return capacity - used(); }
+};
+
+template <typename Fn>
+using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
+
+template <typename Fn>
+using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
+
+template <typename InlineSlots, typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_storage(
+    InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn
+) {
+    using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
+    static_assert(
+        std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>,
+        "fanin callback must return void or bool"
+    );
+
+    if constexpr (std::is_void_v<FaninCallbackResult>) {
+        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
+        for (int32_t i = 0; i < inline_count; i++) {
+            fn(inline_slot_states[i]);
+        }
+
+        int32_t spill_count = fanin_count - inline_count;
+        if (spill_count <= 0) {
+            return;
+        }
+
+        int32_t start_idx = spill_start % spill_pool.capacity;
+        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
+        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
+        for (int32_t i = 0; i < first_count; i++) {
+            fn(first[i].slot_state);
+        }
+
+        int32_t second_count = spill_count - first_count;
+        for (int32_t i = 0; i < second_count; i++) {
+            fn(spill_pool.base[i].slot_state);
+        }
+        return;
+    } else {
+        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
+        for (int32_t i = 0; i < inline_count; i++) {
+            if (!fn(inline_slot_states[i])) {
+                return false;
+            }
+        }
+
+        int32_t spill_count = fanin_count - inline_count;
+        if (spill_count <= 0) {
+            return true;
+        }
+
+        int32_t start_idx = spill_start % spill_pool.capacity;
+        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
+        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
+        for (int32_t i = 0; i < first_count; i++) {
+            if (!fn(first[i].slot_state)) {
+                return false;
+            }
+        }
+
+        int32_t second_count = spill_count - first_count;
+        for (int32_t i = 0; i < second_count; i++) {
+            if (!fn(spill_pool.base[i].slot_state)) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
+template <typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) {
+    return for_each_fanin_storage(
+        payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start,
+        *payload.fanin_spill_pool, static_cast<Fn &&>(fn)
+    );
+}
+
+// =============================================================================
+// Dependency List Pool
+// =============================================================================
+
+/**
+ * Dependency list pool structure
+ *
+ * True ring buffer for allocating linked list entries.
+ * Entries are reclaimed when their producer tasks become CONSUMED,
+ * as tracked by the orchestrator via dep_pool_mark per task.
+ *
+ * Linear counters (top, tail) grow monotonically; the physical index
+ * is obtained via modulo: base[linear_index % capacity].
+ */
+struct PTO2DepListPool {
+    PTO2DepListEntry *base;     // Pool base address
+    int32_t capacity;           // Total number of entries
+    int32_t top;                // Linear next-allocation counter (starts from 1)
+    int32_t tail;               // Linear first-alive counter (entries before this are dead)
+    int32_t high_water;         // Peak concurrent usage (top - tail)
+    int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation
+
+    // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
+    std::atomic<int32_t> *error_code_ptr = nullptr;
+
+    /**
+     * Initialize the dependency list pool: reset the counters and null out entry 0.
+     * @param in_base            Pool base address (entry 0 is written by this call)
+     * @param in_capacity        Total number of entries
+     * @param in_error_code_ptr  Stored as error_code_ptr; alloc() latches PTO2_ERROR_DEP_POOL_OVERFLOW through it
+     */
+    void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
+        base = in_base;
+        capacity = in_capacity;
+        top = 1;   // Start from 1, 0 means NULL/empty
+        tail = 1;  // Match initial top (no reclaimable entries yet)
+        high_water = 0;
+        last_reclaimed = 0;
+
+        // Initialize entry 0 as NULL marker
+        base[0].slot_state = nullptr;
+        base[0].next = nullptr;
+
+        error_code_ptr = in_error_code_ptr;
+    }
+
+    /**
+     * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
+     * Safe to call multiple times — only advances tail forward.
+     *
+     * @param ring             Ring header (for reading slot dep_pool_mark)
+     * @param sm_last_task_alive Current last_task_alive from shared memory
+     */
+    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
+
+    /**
+     * Ensure dep pool for a specific ring has at least `needed` entries available.
+     * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
+     */
+    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
+
+    /**
+     * Allocate a single entry from the pool (single-thread per pool instance)
+     *
+     * @return Pointer to allocated entry, or nullptr on fatal error
+     */
+    PTO2DepListEntry *alloc() {
+        int32_t used = top - tail;
+        if (used >= capacity) {
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Overflow!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity);
+            LOG_ERROR("  - Pool top:      %d (linear)", top);
+            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
+            LOG_ERROR("  - High water:    %d", high_water);
+            LOG_ERROR("Solution:");
+            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
+            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
+            LOG_ERROR("========================================");
+            if (error_code_ptr) {
+                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
+            }
+            return nullptr;
+        }
+        int32_t idx = top % capacity;
+        top++;
+        used++;
+        if (used > high_water) high_water = used;
+        return &base[idx];
+    }
+
+    /**
+     * Advance the tail pointer, reclaiming dead entries.
+     * Called by the orchestrator based on last_task_alive advancement.
+     */
+    void advance_tail(int32_t new_tail) {
+        if (new_tail > tail) {
+            tail = new_tail;
+        }
+    }
+
+    /**
+     * Prepend a task slot state to a dependency list.
+     *
+     * O(1) operation: allocates a new entry via alloc() and links it to the current head.
+     *
+     * @param cur         Current list head entry (presumably nullptr for an empty list — confirm with callers)
+     * @param slot_state  Task slot state stored in the new entry
+     * @return New head entry, or nullptr if alloc() failed
+     */
+    PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) {
+        PTO2DepListEntry *new_entry = alloc();
+        if (!new_entry) return nullptr;
+        new_entry->slot_state = slot_state;
+        new_entry->next = cur;
+        return new_entry;
+    }
+
+    int32_t used() const { return top - tail; }
+
+    int32_t available() const { return capacity - used(); }
+};
+
+// =============================================================================
+// Ring Set (single-ring aggregate)
+// =============================================================================
+
+/**
+ * Groups the PTO2TaskAllocator and the PTO2FaninPool of one ring into a single unit.
+ * host_build_graph keeps a single ring (PTO2_MAX_RING_DEPTH == 1) for all scope depths.
+ */
+struct PTO2RingSet {
+    PTO2TaskAllocator task_allocator;
+    PTO2FaninPool fanin_pool;
+};
+
+#endif  // PTO_RING_BUFFER_H
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2.h
new file mode 100644
index 000000000..993cf20de
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Main Interface
+ *
+ * This is the main header for the PTO Runtime2 system.
+ * It provides a unified API for task graph construction and execution.
+ *
+ * Key Features:
+ * - Ring buffer based memory management (zero allocation overhead)
+ * - Lazy invalidation TensorMap for dependency discovery
+ * - Scope-based buffer lifecycle management
+ * - Per-task spinlocks for concurrent fanout updates
+ * - Orchestrator-Scheduler decoupling via shared memory
+ *
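+ * Usage:
+ *   1. Create runtime: runtime_reserve_layout -> runtime_init_data_from_layout -> runtime_wire_arena_pointers
+ *   2. Build task graph in orchestration function:
+ *      - rt_scope_begin() / rt_scope_end()
+ *      - submit_task() (through the PTO2RuntimeOps table)
+ *   3. Mark orchestration complete: rt_orchestration_done()
+ *   4. Destroy runtime: runtime_destroy()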
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+#include "pto_submit_types.h"
+#include "pto_shared_memory.h"
+#include "pto_ring_buffer.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_orchestrator.h"
+#include "aicore_completion_mailbox.h"
+
+// =============================================================================
+// Runtime Context
+// =============================================================================
+
+/**
+ * Runtime execution mode
+ */
+enum PTO2RuntimeMode {
+    PTO2_MODE_EXECUTE = 0,    // Execute tasks on workers
+    PTO2_MODE_SIMULATE = 1,   // Simulate task execution with cycle counting
+    PTO2_MODE_GRAPH_ONLY = 2  // Build graph only, no execution
+};
+
+/**
+ * Function-pointer ops table for runtime operations.
+ *
+ * The orchestration .so calls runtime functions through this table
+ * (via pto_orchestration_api.h inline wrappers), so it has zero link
+ * dependencies on runtime .cpp files.
+ */
+typedef struct PTO2Runtime PTO2Runtime;  // forward declare for ops signatures
+
+struct PTO2RuntimeOps {
+    TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    void (*scope_begin)(PTO2Runtime *rt);
+    void (*scope_end)(PTO2Runtime *rt);
+    void (*orchestration_done)(PTO2Runtime *rt);
+    bool (*is_fatal)(PTO2Runtime *rt);
+    void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+    // Logging (populated by runtime, called by orchestration)
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+
+    // Cross-layer data access (orchestration reads/writes tensor values via runtime)
+    // Placed after logging to avoid shifting hot-path field offsets.
+    uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+    void (*set_tensor_data)(
+        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
+    );
+    TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
+    TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
+    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
+    // collector. Always present in the struct to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
+};
+
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
+/**
+ * PTO Runtime2 context
+ *
+ * Contains all state for orchestration and scheduling.
+ * In simulated mode, runs in single process with shared address space.
+ */
+struct PTO2Runtime {
+    // Ops table (first field — used by orchestration .so via function pointers)
+    const PTO2RuntimeOps *ops;
+    PTO2ScopeMode pending_scope_mode;
+
+    // Components
+    PTO2SharedMemoryHandle *sm_handle;
+    PTO2OrchestratorState orchestrator;
+    PTO2SchedulerState scheduler;
+    AICoreCompletionMailbox *aicore_mailbox;
+
+    // GM Heap for output buffers
+    void *gm_heap;
+    uint64_t gm_heap_size;
+    bool gm_heap_owned;  // True if we allocated it
+
+    // Mode
+    PTO2RuntimeMode mode;
+
+    // Statistics
+    int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Populated on host by
+    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
+    // aicpu_executor.cpp.
+    PTO2RuntimeArenaLayout prebuilt_layout;
+};
+
+// =============================================================================
+// Runtime Lifecycle API
+// =============================================================================
+
+/**
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
+ *
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
+ *
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
+);
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+);
+
+/**
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, ring.fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so we never call arena.release()
+ * here — the destructor only forgets sub-structure pointers (idempotent
+ * cleanup).
+ */
+void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
+
+/**
+ * Set execution mode
+ */
+void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode);
+
+// =============================================================================
+// Orchestration API (called by orchestration function)
+// =============================================================================
+
+/**
+ * Begin a new scope
+ *
+ * All tasks submitted within this scope will have their lifetime
+ * bounded by the scope. When scope_end() is called, the scope
+ * releases its reference to all enclosed tasks.
+ */
+void rt_scope_begin(PTO2Runtime *rt);
+
+/**
+ * End current scope
+ *
+ * Releases scope reference for all tasks submitted since scope_begin().
+ * Tasks whose refcount reaches zero will have their buffers released.
+ */
+void rt_scope_end(PTO2Runtime *rt);
+
+/**
+ * Mark orchestration as complete
+ *
+ * Signals that no more tasks will be submitted.
+ */
+void rt_orchestration_done(PTO2Runtime *rt);
+
+/**
+ * Enter fatal state explicitly from orchestration.
+ */
+void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
+
+/**
+ * Cross-layer data access: read a tensor value by waiting for its producer.
+ */
+uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+
+/**
+ * Cross-layer data access: write a value to a tensor at given indices.
+ * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap.
+ * See set_tensor_data in pto_orchestration_api.h for full documentation.
+ */
+void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
+
+/**
+ * Slim config struct exported by orchestration .so via aicpu_orchestration_config().
+ * Shared definition with pto_orchestration_api.h (same layout, guarded).
+ */
+#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
+#define PTO2_ORCHESTRATION_CONFIG_DEFINED
+struct PTO2OrchestrationConfig {
+    int expected_arg_count;
+};
+#endif
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
index 82bb7c193..1363daaa4 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -9,18 +9,536 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
-#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
+/**
+ * PTO Runtime2 - Core Type Definitions
+ *
+ * This header defines all fundamental types used by the PTO Runtime2 system:
+ * - Configuration constants
+ * - Worker types and task states
+ * - Tensor regions and task parameters
+ * - Task descriptors with fanin/fanout tracking
+ * - Dependency list entries
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <atomic>
 
 #include "profiling_config.h"
+#include "pto_constants.h"
+#include "pto_runtime_status.h"
+// NOTE (host_build_graph divergence from tensormap_and_ringbuffer): the
+// pto2_dispatch_payload.h include is intentionally dropped here. This header is
+// pulled in by the platform's tensor_dump.h via a hardcoded
+// "host_build_graph/runtime/pto_runtime2_types.h" path, and pto2_dispatch_payload.h
+// uses #pragma once (path-keyed), so leaving it in double-defines PTO2DispatchPayload
+// against tensormap_and_ringbuffer's copy inside the shared host-dispatcher TU.
+// pto_runtime2_types.h never references PTO2DispatchPayload itself; consumers that
+// need it include it via runtime.h directly.
+#include "aicore_completion_mailbox.h"
+#include "pto_submit_types.h"
+#include "pto_task_id.h"
+#include "pto_types.h"
+
+// Spin-wait hint for AICPU threads.  On real hardware the AICPU has dedicated
+// ARM A55 cores — no OS yield is needed, so the hint is a no-op.  In simulation
+// all threads share host CPU cores, so we yield to prevent starvation.
+// This header is also compiled into the Host .so (for struct definitions only),
+// where the hint is never called — the fallback no-op keeps Host builds clean.
+#if __has_include("spin_hint.h")
+#include "spin_hint.h"
+#else
+#define SPIN_WAIT_HINT() ((void)0)
+#endif
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+#include "aicpu/device_time.h"
+#endif
 
 // =============================================================================
-// Tensor Dump Configuration
+// Configuration Constants
 // =============================================================================
 
-// Tensor dump uses these defaults to size its selective mask table so task-id
-// ring/slot lookup stays aligned with PTO2 task id layout.
+// Task management
+// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
+// Actual window size is passed at runtime to runtime_reserve_layout().
+// Use pto2_task_slot(sched, task_id) for slot calculation.
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
-#define PTO2_MAX_RING_DEPTH 4        // Number of task-id ring layers
 
-#endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
+// Single ring. host_build_graph is host-orch: the whole graph is built on the
+// host, fits one ring, and the device runs it once without reclaim (see stages
+// 1-2 — execution-time recycle removed). The multi-ring design existed only to
+// let inner scopes reclaim independently under small rings; with no reclaim and
+// a whole-graph-resident ring, per-depth isolation is moot, so all scope depths
+// map to the single ring 0.
+#define PTO2_MAX_RING_DEPTH 1
+
+// Memory pools (total = value, single ring)
+#define PTO2_HEAP_SIZE (256 * 1024 * 1024)  // 256MB
+#define PTO2_DEP_LIST_POOL_SIZE 16384       // Per-ring dependency list pool entries
+#define PTO2_TENSORMAP_POOL_SIZE (65536)    // TensorMap entry pool
+#define PTO2_TENSORMAP_NUM_BUCKETS 4096     // Power of 2 for fast hash (4096×8B=32KB fits L1)
+
+// Scope management
+#define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
+// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot
+// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot
+// is in flight, no more tasks can ever be pushed regardless of buffer size.
+// scope_tasks_push fatals on overflow rather than growing the arena-owned
+// buffer (which would be UB on the arena's malloc'd backing).
+#define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH)
+
+// Ready queue
+#define PTO2_READY_QUEUE_SIZE 65536  // Per-shape queue size
+
+// Cross-thread early-dispatch work queue (power of two)
+#define PTO2_EARLY_DISPATCH_QUEUE_SIZE 64
+
+// Wiring queue
+#define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size. NOTE(review): name misspells WIRING; rename with all users
+
+// Fanin storage
+#define PTO2_FANIN_INLINE_CAP 64
+
+// TensorMap cleanup interval
+#define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
+#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64   // Cleanup every N retired tasks
+
+// get_tensor_data/set_tensor_data spin wait timeout in cycles.
+// ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based).
+constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL;
+
+// =============================================================================
+// Task States
+// =============================================================================
+
+/**
+ * Task state enumeration
+ *
+ * State transitions:
+ *   PENDING -> COMPLETED -> CONSUMED   (NOTE(review): host_build_graph reportedly drops the CONSUMED flip — confirm)
+ *
+ * The slot stays in PENDING from submit through "ready in queue" and "running
+ * on a worker"; readiness and running-vs-idle are derived from fanin_refcount
+ * and per-core running_slot_state respectively, not from task_state itself.
+ *
+ * Conditions:
+ *   PENDING->COMPLETED:   all subtasks finish (set by scheduler) or task is a
+ *                         hidden alloc completed inline by the orchestrator
+ *   COMPLETED->CONSUMED:  fanout_refcount == fanout_count && state == COMPLETED
+ */
+typedef enum {
+    PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
+    PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
+    PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
+} PTO2TaskState;
+
+/**
+ * Result of a unified task allocation.
+ */
+struct PTO2TaskAllocResult {
+    int32_t task_id;    // Absolute task ID (not wrapped)
+    int32_t slot;       // task_id & (window_size - 1)
+    void *packed_base;  // Heap allocation result (nullptr if failure)
+    void *packed_end;   // packed_base + aligned output_size
+
+    bool failed() const { return task_id < 0; }
+};
+
+struct PTO2OutputLayout {
+    uint64_t offsets[MAX_TENSOR_ARGS] = {};
+    uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {};
+    int32_t total_output_size = 0;
+};
+
+// =============================================================================
+// Dependency List Entry
+// =============================================================================
+
+/**
+ * Fanin spill entry
+ * Stored in the dedicated fanin spill ring buffer.
+ */
+struct PTO2TaskSlotState;  // Forward declaration
+struct PTO2FaninPool;      // Forward declaration
+struct PTO2FaninSpillEntry {
+    PTO2TaskSlotState *slot_state;
+};
+static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(uintptr_t));
+
+/**
+ * Dependency list entry (singly-linked list node)
+ * Stored in DepListPool ring buffer.
+ */
+struct PTO2DepListEntry {
+    PTO2TaskSlotState *slot_state;  // Consumer slot state (direct pointer)
+    PTO2DepListEntry *next;         // next entry
+};
+
+// =============================================================================
+// Task Descriptor
+// =============================================================================
+
+/**
+ * Task descriptor structure (shared memory)
+ *
+ * Stored in the TaskDescriptor ring buffer in shared memory.
+ * Contains static identification and buffer pointers only.
+ * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState.
+ *
+ * Fields set by Orchestrator at submission, read by Scheduler for dispatch.
+ */
+struct PTO2TaskDescriptor {
+    // Mixed-task identification (encodes ring_id in upper 32 bits)
+    PTO2TaskId task_id;  // raw: (ring_id << 32) | local_id
+
+    // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive)
+    int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT];
+
+    // Packed output buffer (all outputs packed into single contiguous buffer)
+    void *packed_buffer_base;  // Start of packed buffer in GM Heap
+    void *packed_buffer_end;   // End of packed buffer (for heap reclamation)
+};
+
+// =============================================================================
+// Per-Slot Scheduling State
+// =============================================================================
+
+/**
+ * Task payload data (cold path - only accessed during orchestration and dispatch)
+ *
+ * Layout: metadata + inline fanin packed in the first 9 cache lines, followed
+ * by bulk tensor and scalar data. Small fanins stay fully inline; larger
+ * fanins spill into a per-ring ring buffer slice.
+ */
+// Speculative early-dispatch claim states for PTO2TaskPayload::spec_state.
+enum PTO2SpecState : uint8_t {
+    PTO2_SPEC_NONE = 0,       // not pre-staged
+    PTO2_SPEC_STAGING = 1,    // Hook 1 claimed it; staging in progress
+    PTO2_SPEC_STAGED = 2,     // staged on a core, gated; staged_* fields valid
+    PTO2_SPEC_DISPATCHED = 3  // routed via the normal dispatch path (no pre-stage)
+};
+
+// A pre-staged consumer occupies one core per gated subtask block. WHICH cores
+// it occupies is recorded as a bitmask (staged_core_mask, 1 bit per global
+// core_id); the completion-path release iterates the set bits and rings each
+// core's doorbell from the scheduler's per-core doorbell table. Bounded by the
+// chip's core count (RUNTIME_MAX_WORKER = 72; no two-level pre-dispatch means
+// gated cores in flight <= core count), NOT by block_num — so a wide SPMD
+// consumer can pre-stage all its idle cores. 2 words = 128 bits >= 72.
+inline constexpr int PTO2_SPEC_CORE_MASK_WORDS = 2;
+
+struct PTO2TaskPayload {
+    // === Cache lines 0-8 (576B) — metadata + inline fanin ===
+    int32_t tensor_count{0};
+    int32_t scalar_count{0};
+    int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
+    int32_t fanin_spill_start{0};   // Linear start index in fanin spill pool (0 = no spill)
+    PTO2FaninPool *fanin_spill_pool{nullptr};
+    PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
+    // Speculative early-dispatch metadata (AICPU-side only). Ordered by descending
+    // alignment (8B mask, 4B fanin, then 1B flags) so the block packs with no
+    // internal padding. Kept here after the fanin array (not moved up front): on
+    // cache line 8 it shares only with the rarely-touched fanin tail, whereas in
+    // line 0 the spec atomics (written during staging) would false-share with
+    // tensor_count/scalar_count (read by build_payload at dispatch). Fits in the 40B
+    // between the fanin array (offset 536) and the 64B-aligned tensors[] (offset
+    // 576), so sizeof and tensors[] are unchanged.
+    //
+    // Bitmask of global core_ids this consumer is pre-staged (gated) on. Set with
+    // atomic fetch_or by concurrent stagers; read by release. (Re)initialized in
+    // PTO2TaskPayload::init before the slot can be staged again.
+    std::atomic<uint64_t> staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]{};
+    // Early-dispatch CANDIDATE detection (event-driven, dual of fanin_refcount):
+    // seeded at wiring with producers already complete, then a flagged producer's
+    // DISPATCH bumps each consumer's dispatch_fanin. dispatch_fanin ==
+    // fanin_actual_count  <=>  every producer is flagged-and-dispatched or was
+    // pre-completed  =>  this task is an early-dispatch candidate (push early_dispatch_queue).
+    std::atomic<int32_t> dispatch_fanin{0};  // CONSUMER side: flagged-dispatched + pre-completed producers
+    bool allow_early_resolve{false};         // codegen hint copied from Arg in PTO2TaskPayload::init
+    // Lock-free claim state shared by the stagers (Hook 1, possibly several AICPU
+    // threads concurrently) and the completion-path release: 0=NONE, 1=STAGING,
+    // 3=DISPATCHED (2=STAGED is unused now). STAGING is the STABLE gated state —
+    // many threads stage blocks concurrently while it holds, each claiming a block
+    // via the atomic next_block_idx and OR-ing its cores into staged_core_mask.
+    // Release does STAGING->DISPATCHED then rings the mask; a thread that stages a
+    // block AFTER release flipped DISPATCHED rings that block's doorbell itself
+    // (self-ring), so no doorbell is ever missed.
+    std::atomic<uint8_t> spec_state{0};
+    std::atomic<uint8_t> dispatch_propagated{0};  // PRODUCER side: once-guard for fanout propagation
+    std::atomic<uint8_t> spec_chain_active{0};    // inherited early-dispatch flag (auto-chain past codegen flag)
+    uint8_t spec_chain_depth{0};                  // auto-chain depth; inherited = parent+1, capped
+    // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) ===
+    Tensor tensors[MAX_TENSOR_ARGS];
+    // === Cache lines 73-74 (128B) — scalars ===
+    uint64_t scalars[MAX_SCALAR_ARGS];
+
+    // Layout verification (size checks that don't need offsetof).
+    static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines");
+    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)");
+
+    /**
+     * Prefetch (for write) the regions init() is about to fill so the stores land
+     * in warm cache. tensor_count/scalar_count come from the Arg — the payload's
+     * own counts are not set until init(). Warms the early-dispatch spec block at
+     * offset 536 (cache line 8) too. A member fn lowers to the same prefetch
+     * instructions as a free function (`this` is just a register), no cache impact.
+     */
+    void prefetch(int32_t tensor_count, int32_t scalar_count) const {
+        for (int32_t i = 0; i < tensor_count; i++) {
+            __builtin_prefetch(&tensors[i], 1, 3);
+            __builtin_prefetch(reinterpret_cast<const char *>(&tensors[i]) + 64, 1, 3);
+        }
+        for (int32_t i = 0; i < scalar_count; i += 8) {
+            __builtin_prefetch(&scalars[i], 1, 3);
+        }
+        __builtin_prefetch(this, 1, 3);
+        __builtin_prefetch(reinterpret_cast<const char *>(this) + 64, 1, 3);
+        __builtin_prefetch(reinterpret_cast<const char *>(this) + 128, 1, 3);
+        __builtin_prefetch(reinterpret_cast<const char *>(this) + 512, 1, 3);  // spec fields (cache line 8)
+    }
+
+    /**
+     * Initialize payload: copy tensors, store scalars, reset early-dispatch metadata.
+     * For each tensor arg, the source is determined by its TensorArgType tag:
+     * - OUTPUT -> built from create_info() at alloc_result.packed_base + layout.offsets[i]
+     * - otherwise -> copied from args.tensor(i).ref()
+     *
+     * @param args    Task arguments (tensors + scalars)
+     * @param result  Receives each OUTPUT tensor via materialize_output(); supplies owner_task_id
+     * @param alloc_result, layout  packed_base plus per-arg offsets/buffer_sizes locating OUTPUT buffers
+     */
+    void init(
+        const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout
+    ) {
+        tensor_count = args.tensor_count();
+        scalar_count = args.scalar_count();
+
+        // OUTPUT buffers are addressed by layout.offsets[i] (arg position); no running output index is needed.
+        for (int32_t i = 0; i < args.tensor_count(); i++) {
+            if (args.tag(i) != TensorArgType::OUTPUT) {
+                tensors[i].copy(args.tensor(i).ref());
+            } else {
+                init_tensor_from_create_info(
+                    tensors[i], args.tensor(i).create_info(),
+                    reinterpret_cast<void *>(reinterpret_cast<char *>(alloc_result.packed_base) + layout.offsets[i]),
+                    layout.buffer_sizes[i]
+                );
+                tensors[i].owner_task_id = result.task_id();
+                result.materialize_output(tensors[i]);
+            }
+        }
+        // Round up to cache line boundary. Both arrays are 128B so no overrun.
+        // Eliminates branches; extra bytes within the same CL have zero additional cost.
+        memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64));
+
+        // Speculative early-dispatch metadata — the single init point for these
+        // fields. reset_for_reuse MUST NOT touch the payload (it runs at slot
+        // init and would pull this cold cache line across structures);
+        // prepare_task only allocates/binds. prefetch() warms this
+        // line (offset 512) so these writes land in warm cache.
+        //
+        // spec_state / staged_core_mask / dispatch_fanin / spec_chain_* are all
+        // CONSUMER-side: a task with allow_early_resolve == false still has them
+        // touched when one of ITS producers is flagged (propagate_dispatch_fanin
+        // bumps dispatch_fanin and may CAS spec_state / set the auto-chain flag on
+        // any consumer, independent of the consumer's own hint). So they MUST be
+        // zeroed here unconditionally — no per-task allow_early_resolve gating.
+        allow_early_resolve = args.allow_early_resolve();
+        spec_state.store(PTO2_SPEC_NONE, std::memory_order_relaxed);
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
+            staged_core_mask[w].store(0, std::memory_order_relaxed);
+        dispatch_fanin.store(0, std::memory_order_relaxed);
+        dispatch_propagated.store(0, std::memory_order_relaxed);
+        spec_chain_active.store(0, std::memory_order_relaxed);
+        spec_chain_depth = 0;
+    }
+};
+
+// PTO2TaskPayload layout verification (offsetof requires complete type).
+static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift");
+static_assert(
+    offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata"
+);
+static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)");
+static_assert(
+    offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor),
+    "scalars must immediately follow tensors"
+);
+static_assert(
+    sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t),
+    "PTO2TaskPayload size must stay on the baseline cache-line footprint"
+);
+
+/**
+ * Per-task slot scheduling state (scheduler-private, NOT in shared memory)
+ *
+ * Consolidates all hot-path scheduling fields into a single cache-friendly
+ * structure (declared alignas(64), so every slot starts on a cache-line boundary).
+ * Accessing any field of a task's slot state brings the related fields along with it.
+ *
+ * Concurrency notes:
+ * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
+ * - fanin_count set once at submission, read-only after (hot path for ready check)
+ * - task_state, fanin_refcount, fanout_refcount updated atomically
+ */
+
+// fanout_count / fanout_refcount bit encoding (both uint32):
+//   bits [30:0] = consumer references (count: # consumers; refcount: # released)
+//   bit  [31]   = the owning scope's reference (PTO2_FANOUT_SCOPE_BIT)
+// fanout_count is seeded to PTO2_FANOUT_SCOPE_BIT and ++'d per consumer, so it
+// ends as (SCOPE_BIT | num_consumers). release adds 1 (consumer completion) or
+// SCOPE_BIT (scope_end). CONSUMED iff fanout_refcount == fanout_count (every
+// consumer released AND scope bit set). Keeping the scope ref in a distinct bit
+// (rather than folding scope + consumers into one count) lets a consumer reach
+// fanout_refcount == (fanout_count & ~PTO2_FANOUT_SCOPE_BIT) while the scope bit
+// is still unset -- i.e. "all consumers done but scope still open" stays
+// distinguishable from "fully consumed". The heap/task deadlock detector keys
+// off exactly that complement: that condition with state==COMPLETED means the
+// head can only be released by scope_end, which a blocked orchestrator can
+// never reach -> provable deadlock.
+static constexpr uint32_t PTO2_FANOUT_SCOPE_BIT = 0x80000000u;
+
+struct alignas(64) PTO2TaskSlotState {
+    // Fanout lock + list (accessed together under lock in on_task_complete)
+    std::atomic<int32_t> fanout_lock;  // Per-task spinlock (0=unlocked, 1=locked)
+    uint32_t fanout_count;             // SCOPE_BIT (owning scope) | number of consumers
+
+    PTO2DepListEntry *fanout_head;  // Pointer to first fanout entry (nullptr = empty)
+
+    // Task state (completion, consumed check, ready check)
+    std::atomic<PTO2TaskState> task_state;  // PENDING/COMPLETED/CONSUMED
+
+    // Fanin (accessed together in release_fanin_and_check_ready)
+    std::atomic<int32_t> fanin_refcount;  // Dynamic: counts completed producers
+    int32_t fanin_count;                  // Number of producer dependencies (set once by wiring)
+
+    // Fanout refcount (read alongside fanout_count by consumer-wait checks)
+    std::atomic<uint32_t> fanout_refcount;  // Dynamic: low bits = released consumers, bit31 = scope released
+
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
+    PTO2TaskPayload *payload;
+    PTO2TaskDescriptor *task;
+
+    // --- Set per-submit (depend on task inputs) ---
+    ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
+    // Set by any subtask FIN that pushed deferred-completion CONDITIONs to
+    // the runtime mailbox; read by the last subtask FIN to decide whether
+    // the task needs MPSC-deferred completion or can complete inline on this
+    // thread. The write is sequenced before on_subtask_complete's acq_rel
+    // fetch_add and the read after, so all earlier subtasks' writes are visible
+    // to the last subtask.
+    std::atomic<bool> any_subtask_deferred{false};
+    uint8_t _async_pad{0};
+    int32_t dep_pool_mark{0};  // Dep pool top after wiring (thread-0-only)
+
+    std::atomic<int16_t> completed_subtasks{0};  // Each core completion increments by 1
+    int16_t total_required_subtasks{0};          // = logical_block_num * popcount(active_mask)
+    int16_t logical_block_num{1};                // Total logical blocks (set by orchestrator)
+    // Next block to dispatch. Atomic so concurrent speculative stagers can each
+    // claim a distinct block via CAS; normal dispatch (ready-queue serialized)
+    // uses plain relaxed load/store. The two phases never overlap in time (staging
+    // happens before release; normal dispatch of the remainder happens after).
+    std::atomic<int16_t> next_block_idx{0};
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
+        payload = p;
+        task = t;
+    }
+
+    /**
+     * Reset dynamic scheduling fields to their pristine values.
+     * In host_build_graph this runs once per slot at init (pto_shared_memory.cpp)
+     * to zero the scheduling state before the host orchestrator populates it —
+     * there is no execution-time slot recycle (whole-graph-resident, no reclaim),
+     * so unlike the device-orch path this is not re-invoked after CONSUMED.
+     *
+     * Skips payload and task: they are re-bound by bind_buffers() on every submit.
+     * Skips task_state: the orchestrator sets it to PENDING when it populates
+     * the slot.
+     */
+    void reset_for_reuse() {
+        fanout_lock.store(0, std::memory_order_relaxed);
+        fanout_count = PTO2_FANOUT_SCOPE_BIT;  // bit31 = owning-scope ref; consumers ++ into low bits
+        fanout_head = nullptr;
+        fanin_refcount.store(0, std::memory_order_relaxed);
+        fanout_refcount.store(0, std::memory_order_relaxed);
+        completed_subtasks.store(0, std::memory_order_relaxed);
+        next_block_idx.store(0, std::memory_order_relaxed);
+        any_subtask_deferred.store(false, std::memory_order_relaxed);
+        // Note: payload spec fields (spec_state / staged_core_mask / dispatch_fanin /
+        // spec_chain_*) are NOT reset here — this method skips the payload by
+        // contract. They are (re)initialized in PTO2TaskPayload::init on every
+        // submit, before the slot becomes visible to the scheduler.
+    }
+
+    // === Per-task fanout spinlock ===
+    //
+    // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST
+    // be held whenever reading or writing fanout_head / fanout_count, because
+    // the orchestrator adds consumers concurrently with the scheduler
+    // traversing the list after task completion.
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+
+        for (;;) {
+            while (fanout_lock.load(std::memory_order_acquire) != 0) {
+                contended = true;
+                atomic_ops++;
+                SPIN_WAIT_HINT();
+            }
+            int32_t expected = 0;
+            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                atomic_ops++;
+                atomic_count += atomic_ops;
+                if (contended) {
+                    wait_cycle += (get_sys_cnt_aicpu() - t0);
+                }
+                return;
+            }
+            contended = true;
+            atomic_ops++;
+        }
+    }
+#endif
+
+    void lock_fanout() {
+        for (;;) {
+            while (fanout_lock.load(std::memory_order_acquire) != 0) {
+                SPIN_WAIT_HINT();
+            }
+            int32_t expected = 0;
+            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+                return;
+            }
+        }
+    }
+
+    void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); }
+};
+
+static_assert(sizeof(PTO2TaskSlotState) == 64);
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_shared_memory.h b/src/a2a3/runtime/host_build_graph/runtime/pto_shared_memory.h
new file mode 100644
index 000000000..55a6a7048
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_shared_memory.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Layout
+ *
+ * Defines the shared memory structure for Orchestrator-Scheduler communication.
+ *
+ * Memory Layout (single ring):
+ *   +---------------------------+
+ *   | SharedMemoryHeader        |  (flow control + sync)
+ *   +---------------------------+
+ *   | TaskDescriptor[]          |
+ *   | TaskPayload[]             |
+ *   | TaskSlotState[]           |
+ *   +---------------------------+
+ *
+ * Design principles:
+ * - Only data needed for Orchestrator<->Scheduler communication is here
+ * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory
+ * - Flow control via atomic counters/flags (no locks needed for single-word R/W)
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+
+// =============================================================================
+// Shared Memory Header
+// =============================================================================
+
+struct PTO2SharedMemoryHandle;
+
+/**
+ * Per-ring flow control state in shared memory.
+ * Written/read by Orchestrator and Scheduler for synchronization.
+ */
+struct alignas(64) PTO2RingFlowControl {
+    // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
+    alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
+
+    // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
+    alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
+
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
+    void init() {
+        current_task_index.store(0, std::memory_order_relaxed);
+        last_task_alive.store(0, std::memory_order_relaxed);
+    }
+
+    bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const;
+};
+
+static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)");
+
+/**
+ * Per-ring shared memory header section.
+ *
+ * Groups flow-control, layout info, and per-ring data pointers for a single ring.
+ * Pointers are host-side only (set by setup_pointers, invalid on device).
+ */
+struct alignas(64) PTO2SharedMemoryRingHeader {
+    PTO2RingFlowControl fc;
+
+    // Layout metadata (set once at init)
+    uint64_t task_window_size;
+    int32_t task_window_mask;
+    uint64_t heap_size;
+    uint64_t task_descriptors_offset;  // Offset from SM base, in bytes
+
+    // Per-ring data pointers (host-side, set by setup_pointers)
+    PTO2TaskDescriptor *task_descriptors;
+    PTO2TaskPayload *task_payloads;
+    PTO2TaskSlotState *slot_states;
+
+    int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; }
+
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; }
+
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) {
+        return task_descriptors[get_slot_by_task_id(local_id)];
+    }
+
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; }
+
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; }
+
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; }
+
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) {
+        return slot_states[get_slot_by_task_id(local_id)];
+    }
+};
+
+/**
+ * Shared memory header structure
+ *
+ * Contains per-ring flow control and global layout information.
+ */
+struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
+    // === RING FLOW CONTROL + LAYOUT INFO (single ring, set once at init) ===
+    PTO2SharedMemoryRingHeader ring;
+
+    // === GLOBAL FIELDS ===
+    std::atomic<int32_t> orchestrator_done;  // Flag: orchestration complete
+
+    // Total shared memory size (for validation)
+    uint64_t total_size;
+
+    // Graph output for copy-back (set by orchestrator when using packed buffer)
+    // Host finalize copies from this address instead of dev_ptr when non-zero
+    std::atomic<uint64_t> graph_output_ptr;   // Address where final output was written (packed buffer)
+    std::atomic<uint64_t> graph_output_size;  // Size in bytes
+
+    // === ERROR REPORTING ===
+
+    // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host)
+    // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host.
+    std::atomic<int32_t> orch_error_code;
+
+    // Scheduler error state (Scheduler → Host, independent of orchestrator)
+    // Written by scheduler threads on timeout; read by orchestrator and host.
+    std::atomic<uint32_t> sched_error_bitmap;  // Bit X set = thread X had error
+    std::atomic<int32_t> sched_error_code;     // Last scheduler error code (last-writer-wins)
+    std::atomic<int32_t> sched_error_thread;   // Thread index of last error writer
+};
+
+static_assert(
+    (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096),
+    "PTO2SharedMemoryHeader should be reasonably sized"
+);
+
+// =============================================================================
+// Shared Memory Handle
+// =============================================================================
+
+/**
+ * Handle for shared memory lifecycle management (create/destroy).
+ * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly.
+ */
+struct PTO2SharedMemoryHandle {
+    void *sm_base;     // Base address of shared memory
+    uint64_t sm_size;  // Total size of shared memory
+
+    PTO2SharedMemoryHeader *header;
+
+    // Ownership flag
+    bool is_owner;  // True if this handle allocated the memory
+
+    // === Static helpers ===
+
+    static uint64_t calculate_size(uint64_t task_window_size);
+    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+
+    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
+    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
+    // arena is otherwise empty (the call performs the single commit). All
+    // memory is owned by the arena — caller must not call destroy().
+    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);
+
+    // === Instance methods ===
+
+    // In-place init for caller-provided wrapper storage (e.g. a region carved
+    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
+    // init_header. Returns false when `sm_size` is too small for the requested
+    // `task_window_size`.
+    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);
+    bool init_per_ring(
+        void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Attach to an ALREADY-populated shared memory region: point the handle and
+    // every ring header's data pointers (descriptors / payloads / slot_states)
+    // at `sm_base`, but do NOT reset the flow-control counters / slot states.
+    // Used by host_build_graph host-orch, where the host orchestrator populated
+    // the SM and H2D'd it; the device must re-point at its own SM base without
+    // wiping the contents (unlike init_per_ring, which also resets the header).
+    bool attach_populated(void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+
+    void destroy();
+    void print_layout();
+    bool validate();
+
+private:
+    void init_header(uint64_t task_window_size, uint64_t heap_size);
+    void init_header_per_ring(
+        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+    );
+    void setup_pointers(uint64_t task_window_size);
+    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+};
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
+// own setup_pointers), so values are guaranteed consistent across sides.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, ring)
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Byte offsets (from the SM base) of the ring's three segments. The layout is:
+// header, then descriptors -> payloads -> slot_states, every segment
+// PTO2_ALIGN_UP-padded.
+struct PTO2RingSegmentOffsets {
+    uint64_t descriptors;
+    uint64_t payloads;
+    uint64_t slot_states;
+    uint64_t end;  // offset just past slot_states (total SM size)
+};
+
+// Single source of truth for the SM segment layout. Returns offsets (not
+// pointers), so it serves BOTH the host-side pointer setup (`setup_pointers`,
+// which adds `sm_base`) and the device-address helpers below (which add
+// `sm_dev_base`). Adding or reordering a segment is a one-line edit here; every
+// consumer follows automatically, so the layout walk can never silently
+// disagree across call sites.
+inline PTO2RingSegmentOffsets ring_segment_offsets(uint64_t task_window_size) noexcept {
+    uint64_t off = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    PTO2RingSegmentOffsets o{};
+    o.descriptors = off;
+    off += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+    o.payloads = off;
+    off += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+    o.slot_states = off;
+    off += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    o.end = off;
+    return o;
+}
+
+// Device address of the task_descriptors array.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(void *sm_dev_base, uint64_t task_window_size) noexcept {
+    return reinterpret_cast<PTO2TaskDescriptor *>(
+        static_cast<char *>(sm_dev_base) + ring_segment_offsets(task_window_size).descriptors
+    );
+}
+
+// Device address of the slot_states array (used by the allocator's deadlock
+// detector to inspect the head task's state/fanout).
+inline PTO2TaskSlotState *ring_slot_states_addr(void *sm_dev_base, uint64_t task_window_size) noexcept {
+    return reinterpret_cast<PTO2TaskSlotState *>(
+        static_cast<char *>(sm_dev_base) + ring_segment_offsets(task_window_size).slot_states
+    );
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_submit_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_submit_types.h
new file mode 100644
index 000000000..21c77fce2
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_submit_types.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Submit Types - Shared submit-contract definitions
+ *
+ * Header-only definitions shared by orchestration-facing and runtime-facing
+ * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h).
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+inline constexpr int32_t INVALID_KERNEL_ID = -1;
+
+/**
+ * Subtask slot count: AIC, AIV0, AIV1
+ */
+inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
+
+/**
+ * Subtask slot indices
+ */
+enum class PTO2SubtaskSlot : uint8_t {
+    AIC = 0,
+    AIV0 = 1,
+    AIV1 = 2,
+};
+
+/**
+ * Subtask mask bits (for ActiveMask)
+ */
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);         // 0x1
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);        // 0x2
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);        // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3);  // 0x8: all blocks must launch atomically
+
+/**
+ * Resource shape — classifies a MixedKernels into one of 3 scheduling buckets.
+ *
+ * Multi-subtask tasks (2+ active slots) are all scheduled as MIX. Dispatch
+ * chooses one cluster, then uses active_mask to decide which cores in that
+ * cluster must be placed together: all used cores idle -> running placement;
+ * all used cores already running with free pending slots -> pending placement;
+ * mixed used-core state is rejected and retried later.
+ *
+ * DUMMY is a synthetic shape for dep-only tasks (no AICore dispatch). Tasks
+ * with an empty core_mask route to a dedicated DUMMY ready queue and are
+ * completed inline by the scheduler dispatch loop, bypassing core allocation.
+ */
+enum class PTO2ResourceShape : uint8_t {
+    AIC = 0,    // Single AIC
+    AIV = 1,    // Single AIV
+    MIX = 2,    // Full cluster (dispatch uses active_mask)
+    DUMMY = 3,  // Dependency-only (no AICore dispatch)
+};
+
+// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not
+// allocate a per-shape ready_queue entry / local buffer — it lives in a
+// dedicated queue inside PTO2SchedulerState.
+inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
+
+/**
+ * One-byte mask: bits 0-2 mark active subtask slots, bit 3 is the SYNC_START flag.
+ */
+class ActiveMask {
+public:
+    constexpr ActiveMask() = default;
+    constexpr explicit ActiveMask(uint8_t raw) :
+        raw_(raw) {}
+
+    uint8_t raw() const { return raw_; }
+
+    bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0; }
+
+    uint8_t core_mask() const { return raw_ & 0x07u; }  // subtask bits 0-2 only; the SYNC_START flag is dropped
+
+    bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; }
+
+    PTO2ResourceShape to_shape() const {
+        uint8_t cmask = core_mask();
+        if (cmask == 0) return PTO2ResourceShape::DUMMY;  // no core bits set: dependency-only task
+        int bit_count = __builtin_popcount(cmask);
+        if (bit_count >= 2) return PTO2ResourceShape::MIX;  // two or more slots always schedule as MIX
+        if (cmask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
+        return PTO2ResourceShape::AIV;  // exactly one AIV bit (AIV0 or AIV1) remains
+    }
+
+    void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; }
+
+    bool operator==(ActiveMask other) const { return raw_ == other.raw_; }
+    bool operator!=(ActiveMask other) const { return raw_ != other.raw_; }
+
+    ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); }
+    ActiveMask &operator|=(ActiveMask other) {
+        raw_ |= other.raw_;
+        return *this;
+    }
+
+    ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); }
+
+    bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; }
+
+    explicit operator bool() const { return raw_ != 0; }  // true if any bit is set, flag bit included
+
+private:
+    uint8_t raw_{0};
+};
+
+static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte");
+
+/**
+ * Mixed-task submit contract.
+ *
+ * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive).
+ * At least one slot must be valid.
+ */
+struct MixedKernels {
+    int32_t aic_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
+    int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
+
+    ActiveMask to_active_mask() const {
+        uint8_t mask = 0;
+        if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
+        if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
+        if (aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1;
+        return ActiveMask(mask);
+    }
+};
+
+/**
+ * SPMD launch parameters carried inside Arg.
+ *
+ * Controls how many logical blocks (SPMD dimension) a single task
+ * is expanded into at dispatch time.  Each block receives a unique
+ * block_idx in [0, block_num) via the per-dispatch LocalContext.
+ */
+class PTO2LaunchSpec {
+public:
+    constexpr PTO2LaunchSpec() = default;
+
+    int16_t block_num() const { return block_num_; }
+    void set_block_num(int16_t n) { block_num_ = n; }
+
+    bool require_sync_start() const { return require_sync_start_; }
+    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+
+private:
+    int16_t block_num_{1};
+    bool require_sync_start_{false};
+};
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_tensormap.h b/src/a2a3/runtime/host_build_graph/runtime/pto_tensormap.h
new file mode 100644
index 000000000..aa4a2e1d9
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_tensormap.h
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - TensorMap Interface
+ *
+ * TensorMap provides producer lookup for dependency discovery:
+ * - Maps Tensor -> producer task ID
+ * - Used by pto_submit_task() to find dependencies
+ *
+ * Key design features:
+ * 1. Ring buffer pool for entries (no malloc/free)
+ * 2. Lazy invalidation (entries become stale when producer retires)
+ * 3. Per-task entry tracking for efficient cleanup
+ * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions
+ *
+ * Hash table with chaining:
+ * - buckets[] array of head offsets
+ * - Entries linked via next_in_bucket
+ * - Insert at head (newest first) for sorted chains
+ *
+ * CRITICAL: Hash only by base_ptr
+ * ==============================
+ * For overlap detection to work, ALL sub-regions of the same base tensor
+ * MUST be in the SAME hash bucket. This allows lookup to compare all
+ * potentially overlapping regions.
+ *
+ * Overlap detection: Two regions create a dependency if:
+ *   1. Same base_ptr (raw tensor pointer)
+ *   2. Byte ranges [offset, offset+size) intersect
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include "common.h"
+#include "profiling_config.h"
+#include "utils/device_arena.h"
+#include "pto_runtime2_types.h"
+#include "tensor.h"
+
+// Overlap geometry types. Relocated here from tensor.h: they are used only by
+// the runtime's overlap-detection / dependency machinery, not by the
+// wire/host-facing Tensor definition.
+enum class OverlapStatus {
+    NO_OVERLAP,  // the two regions do not intersect
+    COVERED,     // the probing input fully contains the entry in every dimension
+    OTHER,       // anything else, including conservative "cannot prove it" answers
+};
+
+struct Segment {
+    uint64_t begin;  // inclusive lower bound
+    uint64_t end;    // exclusive upper bound
+
+    bool line_segment_intersection(const Segment &other) const { return !(end <= other.begin || other.end <= begin); }
+    bool contains(const Segment &other) const { return !(other.begin < begin || end < other.end); }
+};
+
+/**
+ * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
+ * region offsets returned by DeviceArena::reserve() so init_data_from_layout()
+ * and wire_arena_pointers() can address the matching regions after the arena is committed.
+ *
+ * All offsets are relative to the arena's base.
+ */
+struct PTO2TensorMapLayout {
+    size_t off_buckets;           // arena offset of the buckets region
+    size_t off_entry_pool;        // arena offset of the entry_pool region
+    size_t off_free_entry_list;   // arena offset of the free_entry_list region
+    size_t off_task_entry_heads;  // arena offset of the task_entry_heads region
+    int32_t num_buckets;          // presumably the sizes passed to reserve_layout() — confirm in the .cpp
+    int32_t pool_size;
+    int32_t task_window_size;
+};
+
+// TensorMap Lookup Profiling (must precede inline lookup/insert methods).
+#if PTO2_TENSORMAP_PROFILING
+extern uint64_t g_lookup_chain_total;     // sum of chain entries walked over all lookup() calls
+extern uint64_t g_lookup_count;           // number of lookup() calls
+extern int32_t g_lookup_chain_max;        // longest chain walked by a single lookup()
+extern uint64_t g_lookup_overlap_checks;  // check_overlap() invocations made by lookup()
+extern uint64_t g_lookup_overlap_hits;    // of those, results other than NO_OVERLAP
+extern uint64_t g_insert_count;           // number of link_entry() calls
+#endif
+
+// =============================================================================
+// TensorMap Structure
+// =============================================================================
+
+/**
+ * TensorMap entry structure — cache-line optimized for lookup
+ *
+ * Cache line 1 (64B, lookup hot path) mirrors Tensor cache line 1 byte-for-byte
+ * from byte 16 onward, so that `memcpy(this, &tensor, 64)` populates everything
+ * we need for overlap checks. Bytes [0, 16) carry entry-only fields (hash
+ * bucket head + chain pointer) that overlap Tensor::buffer (addr in [0, 8) is
+ * the hash key, size in [8, 16) is unused by the entry — we repurpose it for
+ * `next_in_bucket`).
+ *
+ *   buffer_addr / next_in_bucket / producer_task_id   — chain traversal + match
+ *   start_offset                                       — overlap byte range begin
+ *   version, ndims, dtype, manual_dep, is_contiguous   — overlap fast path
+ *   shapes[5]                                          — overlap comparison (line 1)
+ *
+ * Cache line 2 (64B, slow-path / non-contiguous overlap):
+ *   prev_in_bucket / next_in_task / prev_in_task       — chain manipulation
+ *   bucket_index                                       — bookkeeping
+ *   extent_elem_cache                                  — overlap byte range end
+ *   strides[5]                                        — axis-layout match + offset decomposition (L2 overlap)
+ *
+ * When both entry & probe are `is_contiguous && start_offset == 0`, the overlap
+ * check derives `extent_elem = prod(shapes)` from cache line 1 alone.
+ *
+ * Entry size: 128B (2 cache lines), matches Tensor.
+ */
+struct alignas(64) PTO2TensorMapEntry {
+    // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 ===
+    uint64_t buffer_addr;                // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
+    PTO2TensorMapEntry *next_in_bucket;  // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
+    PTO2TaskId producer_task_id;         // 8B [16,24):  mirrors Tensor::owner_task_id slot
+    uint64_t start_offset;               // 8B [24,32):  mirrors Tensor::start_offset (element offset)
+    int32_t version;                     // 4B [32,36):  mirrors Tensor::version
+    uint32_t ndims;                      // 4B [36,40):  mirrors Tensor::ndims
+    DataType dtype;                      // 1B [40,41):  mirrors Tensor::dtype
+    bool manual_dep;                     // 1B [41,42):  mirrors Tensor::manual_dep
+    bool is_contiguous;                  // 1B [42,43):  mirrors Tensor::is_contiguous
+    uint8_t __padding1__;                // 1B [43,44):  mirrors Tensor padding
+    uint32_t shapes[MAX_TENSOR_DIMS];    // 20B [44,64): mirrors Tensor::shapes
+
+    // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data ===
+    PTO2TensorMapEntry *prev_in_bucket;  // 8B [64, 72)
+    PTO2TensorMapEntry *next_in_task;    // 8B [72, 80)
+    PTO2TensorMapEntry *prev_in_task;    // 8B [80, 88)
+    int32_t bucket_index;                // 4B [88, 92): -1 when unlinked
+    uint32_t __padding2__;               // 4B [92, 96)
+    uint64_t extent_elem_cache;          // 8B [96,104): non-contiguous extent (mirrors Tensor)
+    uint32_t strides[MAX_TENSOR_DIMS];   // 20B [104,124): element strides, mirrors Tensor::strides
+    uint8_t __padding3__[4];             // 4B [124,128)
+
+    /**
+     * Populate this entry's overlap fields from a producer Tensor.
+     *
+     * The first 64 bytes of the Tensor are copied verbatim, which fills
+     * buffer_addr, producer_task_id, start_offset, version, ndims, dtype,
+     * manual_dep, is_contiguous and shapes[] in one go. The same copy drops
+     * Tensor::buffer.size into next_in_bucket (bytes [8,16)); link_entry()
+     * rewrites that field right afterwards, so the stale value is never used.
+     *
+     * strides[] and extent_elem_cache live in the second cache line. For a
+     * canonical source (is_contiguous && start_offset == 0) they are rebuilt
+     * from shapes[], leaving the Tensor's second line untouched; any other
+     * source has them read from the Tensor directly.
+     */
+    void copy_from_tensor(const Tensor &tensor) {
+        memcpy(this, &tensor, 64);
+        const bool canonical = tensor.is_contiguous && tensor.start_offset == 0;
+        if (!canonical) {
+            // Arbitrary view: take extent and strides from the tensor's second line.
+            extent_elem_cache = tensor.extent_elem_cache;
+            for (uint32_t d = 0; d < tensor.ndims; ++d) strides[d] = tensor.strides[d];
+            return;
+        }
+        // Canonical view: one innermost-first pass gives row-major strides and numel.
+        uint64_t elem_count = 1;
+        uint32_t running_stride = 1;
+        for (int32_t d = static_cast<int32_t>(tensor.ndims); d-- > 0;) {
+            strides[d] = running_stride;
+            running_stride *= tensor.shapes[d];
+            elem_count *= tensor.shapes[d];
+        }
+        extent_elem_cache = elem_count;
+    }
+
+    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) {
+        // Seed the first cache line from the create-info, then set the buffer
+        // address supplied by the caller.
+        memcpy(this, &tensor_create_info, 64);
+        buffer_addr = addr;
+        // Create-info outputs are contiguous with start_offset == 0, so one
+        // innermost-first pass gives the row-major strides and prod(shapes).
+        uint64_t elem_count = 1;
+        uint32_t running_stride = 1;
+        for (int32_t d = static_cast<int32_t>(tensor_create_info.ndims); d-- > 0;) {
+            strides[d] = running_stride;
+            running_stride *= tensor_create_info.shapes[d];
+            elem_count *= tensor_create_info.shapes[d];
+        }
+        extent_elem_cache = elem_count;
+    }
+
+    /**
+     * Number of elements this entry spans, used as the length of its range.
+     * A contiguous entry multiplies shapes[0..ndims) (first cache line only);
+     * otherwise the value stored in extent_elem_cache is returned.
+     */
+    uint64_t effective_extent_elem() const {
+        if (!is_contiguous) return extent_elem_cache;
+        // Contiguous: the extent is simply prod(shapes).
+        uint64_t product = 1;
+        for (uint32_t d = 0; d < ndims; ++d) {
+            product *= shapes[d];
+        }
+        return product;
+    }
+
+    /**
+     * Check overlap between input tensor and this entry (the producer output).
+     *
+     * Three-level cascade:
+     *   L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP. Offsets are
+     *        element counts; they are scaled to bytes when the dtypes differ.
+     *   L2 — O(ndims) hyper-rectangle precise check, eligible only when both
+     *        sides share the same canonical row-major axis layout (same
+     *        dtype/ndims/strides[], stride descends as integer multiples,
+     *        start_offset decomposes cleanly under the reference shape).
+     *        Yields NO_OVERLAP / COVERED / OTHER per-dim.
+     *   L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice
+     *        with step, etc): conservative OTHER (exact enumeration is a follow-up).
+     *
+     * COVERED is returned when `input` completely contains `entry` per-dim
+     * — dep_compute uses this to retire the now-redundant entry.
+     */
+    OverlapStatus check_overlap(const Tensor &input) const {
+        debug_assert(input.buffer.addr == buffer_addr);
+        debug_assert(input.version >= version);
+        if (input.version > version) {
+            return OverlapStatus::OTHER;
+        }
+
+        // -------- L1: byte-range intersection (O(1) fast reject) --------
+        // start_offset / extent count elements of each side's own dtype. Equal
+        // dtypes compare directly (scale 1); different dtypes are different
+        // units, so both ranges are scaled to bytes before intersecting.
+        uint64_t in_scale = 1;
+        uint64_t ent_scale = 1;
+        if (input.dtype != dtype) {
+            in_scale = get_element_size(input.dtype);
+            ent_scale = get_element_size(dtype);
+            if (in_scale == 0 || ent_scale == 0) return OverlapStatus::OTHER;
+        }
+        const uint64_t in_begin = input.start_offset * in_scale;
+        const uint64_t in_end = (input.start_offset + input.extent_elem()) * in_scale;
+        const uint64_t ent_begin = start_offset * ent_scale;
+        const uint64_t ent_end = (start_offset + effective_extent_elem()) * ent_scale;
+        Segment in_range{in_begin, in_end};
+        Segment ent_range{ent_begin, ent_end};
+        if (!in_range.line_segment_intersection(ent_range)) {
+            return OverlapStatus::NO_OVERLAP;
+        }
+
+        // -------- L2 prereqs: same axis layout? --------
+        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) {
+            return OverlapStatus::OTHER;
+        }
+        for (uint32_t i = 0; i < ndims; i++) {
+            if (input.strides[i] != strides[i]) return OverlapStatus::OTHER;
+        }
+        // strides[ndims-1] must be 1 and each strides[i-1] an integer multiple
+        // of strides[i] for the row-major reference-shape derivation below.
+        // This rejects slice-with-step and scrambled axis orders. (strides are
+        // uint32_t, > 0 by construction-time invariant, so no sign check.)
+        if (strides[ndims - 1] != 1) return OverlapStatus::OTHER;
+        for (uint32_t i = 1; i < ndims; i++) {
+            if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER;
+        }
+
+        // Derive reference shape A from stride. By construction stride is
+        // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So
+        //   A[i] = strides[i-1] / strides[i]   for i >= 1
+        //   A[0] = (buffer.size / dtype_bytes) / strides[0]
+        // buffer.size is read from input: the entry shares the same buffer
+        // (debug-asserted at the top), so it is not mirrored into the entry.
+        //
+        // Allocators may over-allocate buffer.size (alignment, slot rounding),
+        // which makes ref_shapes[0] generously large. That is fine: ref_shapes
+        // is only an upper bound for the in-bounds checks below, so padding can
+        // never cause a false NO_OVERLAP nor a false COVERED.
+        uint32_t ref_shapes[MAX_TENSOR_DIMS] = {};
+        for (uint32_t i = 1; i < ndims; i++) {
+            ref_shapes[i] = strides[i - 1] / strides[i];
+        }
+        const uint64_t elem_size = get_element_size(dtype);
+        if (elem_size == 0) return OverlapStatus::OTHER;
+        const uint64_t numel_storage = input.buffer.size / elem_size;
+        const uint32_t stride0 = strides[0];  // > 0 by Tensor invariant
+        if (numel_storage % stride0 != 0) return OverlapStatus::OTHER;
+        ref_shapes[0] = static_cast<uint32_t>(numel_storage / stride0);
+
+        // Decompose start_offset into row-major per-axis offsets: since
+        // strides[i] = prod(ref_shapes[i+1..]), one division per axis suffices.
+        uint32_t in_offsets[MAX_TENSOR_DIMS] = {};
+        uint32_t ent_offsets[MAX_TENSOR_DIMS] = {};
+        uint64_t in_remain = input.start_offset;
+        uint64_t ent_remain = start_offset;
+        for (uint32_t i = 0; i < ndims; i++) {
+            const uint32_t s = strides[i];
+            in_offsets[i] = static_cast<uint32_t>(in_remain / s);
+            ent_offsets[i] = static_cast<uint32_t>(ent_remain / s);
+            in_remain %= s;
+            ent_remain %= s;
+        }
+        if (in_remain != 0 || ent_remain != 0) return OverlapStatus::OTHER;
+
+        // Validate that each side fits within ref_shapes (defense in depth —
+        // a well-formed view always satisfies this).
+        for (uint32_t i = 0; i < ndims; i++) {
+            if (static_cast<uint64_t>(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
+            if (static_cast<uint64_t>(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
+        }
+
+        // -------- L2 core: per-dim line-segment intersection --------
+        bool input_contains_entry = true;
+        for (uint32_t i = 0; i < ndims; i++) {
+            Segment in_seg{in_offsets[i], static_cast<uint64_t>(in_offsets[i]) + input.shapes[i]};
+            Segment ent_seg{ent_offsets[i], static_cast<uint64_t>(ent_offsets[i]) + shapes[i]};
+            if (!in_seg.line_segment_intersection(ent_seg)) {
+                return OverlapStatus::NO_OVERLAP;
+            }
+            if (!in_seg.contains(ent_seg)) {
+                input_contains_entry = false;
+            }
+        }
+        return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER;
+    }
+};
+
+static_assert(sizeof(PTO2TensorMapEntry) == 128, "TensorMapEntry must be exactly 2 cache lines (128 bytes)");
+static_assert(offsetof(PTO2TensorMapEntry, buffer_addr) == offsetof(Tensor, buffer.addr));
+static_assert(offsetof(PTO2TensorMapEntry, producer_task_id) == offsetof(Tensor, owner_task_id));
+static_assert(offsetof(PTO2TensorMapEntry, start_offset) == offsetof(Tensor, start_offset));
+static_assert(offsetof(PTO2TensorMapEntry, version) == offsetof(Tensor, version));
+static_assert(offsetof(PTO2TensorMapEntry, ndims) == offsetof(Tensor, ndims));
+static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype));
+static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes));
+static_assert(
+    offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "prev_in_bucket must start cache line 2 (offset 64)"
+);
+
+// =============================================================================
+// TensorMap Hash Table
+// =============================================================================
+
+/**
+ * TensorMap structure
+ *
+ * Hash table over a fixed entry pool, with lazy invalidation of retired producers.
+ */
+struct PTO2TensorMap {
+    // Hash table buckets (fixed size, power of 2)
+    PTO2TensorMapEntry **buckets;  // Array of bucket-chain head pointers (nullptr = empty bucket)
+    int32_t num_buckets;           // Must be power of 2 (hash() takes its log2 via ctz)
+
+    // Entry pool: new_entry() reuses freed entries first, else takes the next unused slot
+    PTO2TensorMapEntry *entry_pool;        // Backing array of entries
+    PTO2TensorMapEntry **free_entry_list;  // Stack of pointers to freed entries
+    int32_t pool_size;                     // Total pool capacity
+    int32_t next_entry_idx;                // Index of the next never-used slot in entry_pool
+    int32_t free_num;                      // Number of pointers currently on free_entry_list
+
+    // Per-task entry tracking (for efficient bucket cleanup)
+    // Indexed by [local_id & (task_window_size - 1)]
+    PTO2TensorMapEntry **task_entry_heads;
+    int32_t task_window_size;  // Task window size (for slot masking)
+
+    // Validity threshold (for lazy invalidation), cached from shared memory.
+    int32_t last_task_alive_cached;
+
+    // Cleanup progress: watermark up to which cleanup_retired has already run
+    int32_t last_cleanup{};
+
+    uint32_t get_task_local_id_slot(uint32_t task_local_id) const { return task_local_id & (task_window_size - 1); }
+
+    // Accessors read by scope_stats_collector. Declared unconditionally so the
+    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
+    // setter symbols must export for host dlsym; the probe call sites that use
+    // these accessors stay gated by PTO2_PROFILING).
+    int32_t current_used() const { return next_entry_idx - free_num; }   // entries handed out and not yet freed
+    int32_t pool_capacity() const { return pool_size; }                  // total entries in entry_pool
+    int32_t free_entries() const { return pool_size - current_used(); }  // entries still available to new_entry()
+
+    // Refreshes the validity threshold and, when the supplied watermark is
+    // ahead of last_cleanup, frees the entries of every task in between and
+    // moves last_cleanup up to it. A watermark that is not ahead of
+    // last_cleanup is skipped, so repeated calls never free an entry twice.
+    // The return value is the watermark itself: the orchestrator's
+    // back-pressure loop watches it as its monotone progress signal.
+    int64_t reclaim_retired_all(int32_t sm_last_task_alive) {
+        sync_validity(sm_last_task_alive);
+        if (last_cleanup < sm_last_task_alive) {
+            cleanup_retired(last_cleanup, sm_last_task_alive);
+            last_cleanup = sm_last_task_alive;
+        }
+        return static_cast<int64_t>(sm_last_task_alive);
+    }
+
+    // Returns an unlinked entry without filling in any of its attributes.
+    PTO2TensorMapEntry *new_entry() {
+        PTO2TensorMapEntry *slot = nullptr;
+        if (free_num <= 0) {
+            always_assert(next_entry_idx < pool_size);  // pool exhausted otherwise
+            slot = &entry_pool[next_entry_idx++];       // next never-used slot
+        } else {
+            slot = free_entry_list[--free_num];  // reuse the most recently freed entry
+        }
+        debug_assert(slot->bucket_index == -1);
+        return slot;
+    }
+
+    void free_entry(PTO2TensorMapEntry &entry) {
+        always_assert(entry.bucket_index != -1);  // must still be in a bucket
+
+        // Splice the entry out of its doubly linked bucket chain in O(1).
+        PTO2TensorMapEntry *const before = entry.prev_in_bucket;
+        PTO2TensorMapEntry *const after = entry.next_in_bucket;
+        if (before != nullptr) {
+            before->next_in_bucket = after;
+        } else {
+            // Entry was the chain head: the bucket now starts at its successor.
+            buckets[entry.bucket_index] = after;
+        }
+        if (after != nullptr) {
+            after->prev_in_bucket = before;
+        }
+
+        // Reset the entry's own links (task-chain neighbours are not touched), then recycle it.
+        entry.bucket_index = -1;
+        entry.next_in_bucket = nullptr;
+        entry.prev_in_bucket = nullptr;
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+        free_entry_list[free_num++] = &entry;
+    }
+
+    // =============================================================================
+    // TensorMap API
+    // =============================================================================
+
+    /**
+     * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring
+     * task_entry_heads) on the supplied arena. Records the resulting offsets in
+     * the returned layout descriptor. Must be called before the arena is
+     * committed.
+     */
+    static PTO2TensorMapLayout
+    reserve_layout(DeviceArena &arena, int32_t num_buckets, int32_t pool_size, int32_t task_window_size);
+
+    /**
+     * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS,
+     * PTO2_TENSORMAP_POOL_SIZE).
+     */
+    static PTO2TensorMapLayout reserve_layout_default(DeviceArena &arena, int32_t task_window_size);
+
+    /**
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
+     */
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
+     */
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Tear down state. Does not free memory — the arena owns the backing
+     * buffer. Pointers are set to nullptr so accidental reuse traps.
+     */
+    void destroy();
+
+    /**
+     * Refresh the cached lazy-invalidation threshold.
+     * entry_valid() compares producer ids against the value stored here.
+     *
+     * @param last_task_alive  Current value from shared memory
+     */
+    void sync_validity(int32_t last_task_alive) { last_task_alive_cached = last_task_alive; }
+
+    /**
+     * Lookup producer for a tensor region
+     *
+     * Walks the bucket chain for tensor.buffer.addr and invokes the callback
+     * for each valid same-address entry whose check_overlap() is not NO_OVERLAP.
+     * Stale entries are skipped one by one; they never end the walk early.
+     *
+     * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should
+     * return true to continue iteration, false to stop early. It is safe for
+     * the callback to call remove_entry() on the current entry: next_in_bucket
+     * is latched before invocation.
+     *
+     * @param tensor    Tensor to look up
+     * @param on_match  Callback invoked for each overlapping entry
+     */
+    template <typename Fn>
+    void lookup(const Tensor &tensor, Fn &&on_match) {
+        uint32_t bucket_index = hash(tensor.buffer.addr);
+        PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
+
+#if PTO2_TENSORMAP_PROFILING
+        g_lookup_count++;
+        int32_t chain_len = 0;
+#endif
+
+        while (cur_entry != nullptr) {
+            PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;  // latched: on_match may remove cur_entry
+
+#if PTO2_TENSORMAP_PROFILING
+            chain_len++;
+#endif
+            // Skip stale entries one at a time; the chain is not truncated here.
+            // NOTE(review): the original rationale cited entries from different
+            // rings being interleaved — confirm whether that still applies.
+            if (!entry_valid(*cur_entry)) {
+                cur_entry = next_entry;
+                continue;
+            }
+
+            // Different buffers can land in the same bucket, so the base address
+            // is compared first; same-address entries then get the real overlap
+            // test (sub-regions of one buffer always share a bucket).
+            if (tensor.buffer.addr == cur_entry->buffer_addr) {
+#if PTO2_TENSORMAP_PROFILING
+                g_lookup_overlap_checks++;
+#endif
+                auto overlap_status = cur_entry->check_overlap(tensor);
+                if (overlap_status != OverlapStatus::NO_OVERLAP) {
+#if PTO2_TENSORMAP_PROFILING
+                    g_lookup_overlap_hits++;
+#endif
+                    if (!on_match(*cur_entry, overlap_status)) {
+#if PTO2_TENSORMAP_PROFILING
+                        g_lookup_chain_total += chain_len;
+                        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
+#endif
+                        return;
+                    }
+                }
+            }
+
+            // Move to next entry
+            cur_entry = next_entry;
+        }
+#if PTO2_TENSORMAP_PROFILING
+        g_lookup_chain_total += chain_len;
+        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
+#endif
+    }
+
+    /**
+     * Record a tensor as the output of a producer task.
+     *
+     * Takes an entry from new_entry() (which asserts if the pool is exhausted),
+     * fills it from the tensor and links it at the head of its bucket chain.
+     *
+     * @param tensor            Tensor produced
+     * @param producer_task_id  Task ID of producer
+     */
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id) {
+        PTO2TensorMapEntry &fresh = *new_entry();
+        fresh.copy_from_tensor(tensor);
+        link_entry(&fresh, tensor.buffer.addr, producer_task_id);
+    }
+
+    /**
+     * Free the entries of every retired task.
+     *
+     * Covers the task ids in [old_last_task_alive, new_last_task_alive):
+     * each one's per-task chain is unlinked from the buckets and recycled.
+     *
+     * @param old_last_task_alive  Previous threshold
+     * @param new_last_task_alive  New threshold
+     */
+    void cleanup_retired(int32_t old_last_task_alive, int32_t new_last_task_alive) {
+        // Walk every task id in [old_last_task_alive, new_last_task_alive).
+        const int32_t slot_mask = task_window_size - 1;
+        for (int32_t retired = old_last_task_alive; retired < new_last_task_alive; ++retired) {
+            PTO2TensorMapEntry *&slot_head = task_entry_heads[retired & slot_mask];
+
+            // Free the whole per-task chain. free_entry() wipes next_in_task,
+            // so the successor is fetched before each call.
+            for (PTO2TensorMapEntry *node = slot_head, *succ = nullptr; node != nullptr; node = succ) {
+                succ = node->next_in_task;
+                // Every entry on this chain is expected to belong to the retiring task.
+                debug_assert(node->producer_task_id == PTO2TaskId::make(0, static_cast<uint32_t>(retired)));
+                free_entry(*node);
+            }
+
+            // Leave the slot empty for the task that will reuse it later.
+            slot_head = nullptr;
+        }
+    }
+
+    // =============================================================================
+    // Internal Helpers (exposed for testing)
+    // =============================================================================
+
+    /**
+     * Bucket index for a tensor base address (num_buckets: power of 2, >= 1).
+     * Multiplicative golden-ratio hash: the product's high bits mix ALL input
+     * bits, so aligned addresses still spread evenly; the top
+     * log2(num_buckets) bits are returned.
+     */
+    uint32_t hash(uint64_t key) {
+        key *= 0x9E3779B97F4A7C15ULL;
+        const int bucket_bits = __builtin_ctz(num_buckets);
+        // One bucket -> zero index bits; shifting a 64-bit value by 64 is undefined.
+        return bucket_bits == 0 ? 0u : static_cast<uint32_t>(key >> (64 - bucket_bits));
+    }
+
+    /**
+     * Attach an already-populated entry at the head of its bucket and task chains.
+     */
+    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
+#if PTO2_TENSORMAP_PROFILING
+        g_insert_count++;
+#endif
+        const uint32_t bucket = hash(addr);
+        const int32_t slot = producer_task_id.local() & (task_window_size - 1);
+        entry->producer_task_id = producer_task_id;
+        entry->bucket_index = bucket;
+
+        // Push onto the front of the bucket chain.
+        PTO2TensorMapEntry *const old_bucket_head = buckets[bucket];
+        entry->prev_in_bucket = nullptr;
+        entry->next_in_bucket = old_bucket_head;
+        if (old_bucket_head != nullptr) {
+            old_bucket_head->prev_in_bucket = entry;
+        }
+        buckets[bucket] = entry;
+
+        // Push onto the front of the producer task's entry list.
+        PTO2TensorMapEntry *const old_task_head = task_entry_heads[slot];
+        entry->prev_in_task = nullptr;
+        entry->next_in_task = old_task_head;
+        if (old_task_head != nullptr) {
+            old_task_head->prev_in_task = entry;
+        }
+        task_entry_heads[slot] = entry;
+    }
+
+    /**
+     * True while the entry's producer id is at or beyond the cached last_task_alive.
+     */
+    bool entry_valid(const PTO2TensorMapEntry &entry) const {
+        return !(static_cast<int32_t>(entry.producer_task_id.local()) < last_task_alive_cached);
+    }
+
+    void remove_entry(PTO2TensorMapEntry &entry) {
+        remove_from_task(entry);  // first: it asserts the entry is still in a bucket
+        free_entry(entry);        // then unlink from the bucket chain and recycle the entry
+    }
+
+    /**
+     * Detach the entry from its producer's per-task list in O(1) using the
+     * prev/next links. Within this header it is called by remove_entry().
+     */
+    void remove_from_task(PTO2TensorMapEntry &entry) {
+        always_assert(entry.bucket_index != -1);  // must still be in a bucket
+        PTO2TensorMapEntry *const before = entry.prev_in_task;
+        PTO2TensorMapEntry *const after = entry.next_in_task;
+
+        if (before != nullptr) {
+            before->next_in_task = after;
+        } else {
+            // Entry heads its task list: repoint the slot's head at the successor.
+            const int32_t owner_id = static_cast<int32_t>(entry.producer_task_id.local());
+            task_entry_heads[owner_id & (task_window_size - 1)] = after;
+        }
+        if (after != nullptr) {
+            after->prev_in_task = before;
+        }
+
+        // Clear the entry's own task links.
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+    }
+
+    // =============================================================================
+    // Debug Utilities
+    // =============================================================================
+
+    /**
+     * Print TensorMap statistics
+     */
+    void print_stats();
+
+    /**
+     * Get count of valid entries
+     */
+    int32_t valid_count();
+
+    // =============================================================================
+    // TensorMap Synchronization
+    // =============================================================================
+
+    /**
+     * Sync TensorMap validity threshold from shared memory
+     *
+     * Called periodically to refresh the lazy invalidation threshold.
+     * Also triggers cleanup if threshold has advanced significantly.
+     */
+    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive);
+};
+
+#if PTO2_TENSORMAP_PROFILING
+struct PTO2TensorMapProfilingData {  // NOTE(review): fields appear to mirror the g_* counters — confirm in the .cpp
+    uint64_t lookup_chain_total;
+    uint64_t lookup_count;
+    int32_t lookup_chain_max;
+    uint64_t overlap_checks;
+    uint64_t overlap_hits;
+    uint64_t insert_count;
+};
+
+PTO2TensorMapProfilingData pto2_tensormap_get_profiling();  // returns the profiling data by value
+#endif
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_types.h
new file mode 100644
index 000000000..65d593a49
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_types.h
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Orchestration Build Graph Types - Data structures for orchestration runtime extensions
+ *
+ * Standalone header defining orchestration-specific types for:
+ * - TaskOutputTensors: Return value from submit containing materialized output Tensors
+ * - Arg: Aggregated argument container for pto_submit_task API
+ *
+ * Tensor descriptor types (Tensor, PTOBufferHandle) are defined in tensor.h;
+ * the runtime-only TensorCreateInfo is defined in tensor_create_info.h.
+ *
+ * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h
+ * without type conflicts (Handshake, TensorPair, HostApi).
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+#include "aicpu/dump_arg_selection.h"
+#include "data_type.h"
+#include "profiling_config.h"
+#include "pto_submit_types.h"
+#include "task_args.h"
+#include "tensor.h"
+#include "tensor_create_info.h"  // runtime-only TensorCreateInfo + materialization helpers
+
+typedef enum {
+    ASYNC_ENGINE_SDMA = 0,
+    ASYNC_ENGINE_ROCE = 1,
+    ASYNC_ENGINE_URMA = 2,
+    ASYNC_ENGINE_CCU = 3,
+    NUM_ASYNC_ENGINES = 4,
+} AsyncEngine;
+
+enum class CompletionType : int32_t {
+    COUNTER = 0,
+};
+
+// =============================================================================
+// Task Output Tensors (return value from submit)
+// =============================================================================
+
+enum class PTO2ScopeMode : uint8_t {
+    AUTO = 0,
+    MANUAL = 1,
+};
+
+/**
+ * TaskOutputTensors — returned by submit, holds materialized output Tensors.
+ *
+ * Only runtime-created outputs are stored here, indexed in add_output order.
+ *
+ * The underlying storage is uninitialized; only output_count elements are
+ * valid after submit returns.  This avoids default-constructing Tensor[]
+ * on the hot path (2 KB of unnecessary zeroing per submit).
+ *
+ * Users must hold a named TaskOutputTensors variable and borrow via get_ref();
+ * binding get_ref() on an rvalue is compile-time rejected to prevent dangling.
+ *
+ * LIFETIME — single-scope only:
+ *   Internally this class stores pointers into the submitting task's payload
+ *   (PTO2TaskPayload::tensors[]), which lives in a ring-buffer slot. After
+ *   scope_end the slot becomes eligible for reuse, and a later submit will
+ *   overwrite the same Tensor storage in place. Therefore the
+ *   TaskOutputTensors instance, the const Tensor& returned by get_ref(), and
+ *   any pointer derived from either MUST NOT outlive the PTO2_SCOPE in which
+ *   submit was called — do not move/copy them to outer-scope variables, do
+ *   not capture references by std::reference_wrapper or raw pointers across
+ *   scope boundaries.
+ *
+ *   This invariant is intentionally not enforced at runtime: a reused slot
+ *   simply carries a different but valid owner_task_id, so checking
+ *   owner_task_id cannot distinguish "still mine" from "silently aliased to
+ *   an unrelated task". Misuse manifests as a wrong-tensor read with no
+ *   diagnostic.
+ */
+class TaskOutputTensors {
+public:
+    TaskOutputTensors() :
+        task_id_(PTO2TaskId::invalid()),
+        output_count_(0) {}
+
+    bool empty() const { return output_count_ == 0; }
+    uint32_t size() const { return output_count_; }
+
+    /// Borrow a materialized output tensor by index (lvalue only).
+    const Tensor &get_ref(uint32_t index) const & {
+        always_assert(index < output_count_);
+        return *tensors_[index];
+    }
+    const Tensor &get_ref(uint32_t index) const && = delete;
+
+    /// Runtime-internal: append one materialized output Tensor.
+    void materialize_output(const Tensor &tensor) {
+        always_assert(output_count_ < MAX_TENSOR_ARGS);
+        tensors_[output_count_++] = &tensor;
+    }
+
+    void set_task_id(PTO2TaskId id) { task_id_ = id; }
+
+    PTO2TaskId task_id() const { return task_id_; }
+
+private:
+    PTO2TaskId task_id_;
+    uint32_t output_count_;
+    // Upper bound: a task cannot have more outputs than total tensor args
+    // (every OUTPUT/OUTPUT_EXISTING slot is one of the Arg's tensor slots).
+    const Tensor *tensors_[MAX_TENSOR_ARGS];
+};
+
+using TaskSubmitResult = TaskOutputTensors;
+
+// =============================================================================
+// Argument Types (for pto_submit_task API)
+// =============================================================================
+
+// TensorArgType is defined in tensor.h (included via task_args.h above)
+
+/**
+ * Tagged reference to a single Arg slot — either a Tensor* or a
+ * TensorCreateInfo*. The active member is determined by the slot's
+ * TensorArgType tag (OUTPUT → create_info, else → tensor pointer).
+ *
+ * Minimal-permission: the union members are private; content is set only via
+ * operator=(ptr) and read via ref()/create_info(). Copy/move are deleted — a
+ * TensorRef is written in place inside an Arg's slot array, never passed by
+ * value.
+ */
+class TensorRef {
+    union {
+        const Tensor *ptr_;
+        const TensorCreateInfo *create_info_;
+    };
+
+public:
+    TensorRef() :
+        ptr_(nullptr) {}
+    TensorRef(const TensorRef &) = delete;
+    TensorRef(TensorRef &&) = delete;
+    TensorRef &operator=(const TensorRef &) = delete;
+    TensorRef &operator=(TensorRef &&) = delete;
+
+    TensorRef &operator=(const Tensor *p) {
+        ptr_ = p;
+        return *this;
+    }
+    TensorRef &operator=(const TensorCreateInfo *ci) {
+        create_info_ = ci;
+        return *this;
+    }
+
+    const Tensor &ref() const { return *ptr_; }
+    const TensorCreateInfo &create_info() const { return *create_info_; }
+    bool refers_to(const Tensor *t) const { return ptr_ == t; }
+    bool refers_to(const TensorCreateInfo *ci) const { return create_info_ == ci; }
+};
+
+/**
+ * Aggregated argument container for pto_submit_task
+ *
+ * Inherits storage from TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>.
+ * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo*)
+ * discriminated by the corresponding tag().
+ * Tensors are dispatched first in kernel args, followed by scalars.
+ *
+ * Output arguments follow two distinct ownership models:
+ * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer
+ *   and materializes a new Tensor, returned via TaskOutputTensors.
+ * - add_output(const Tensor&) / add_inout(const Tensor&): OUTPUT_EXISTING / INOUT — write into an existing Tensor.
+ *
+ * Example:
+ *   Tensor x = make_tensor_external(dev_a, shapes, 2);
+ *   TensorCreateInfo ci(shapes, 2);  // must outlive submit
+ *   Arg args;
+ *   args.add_input(x);
+ *   args.add_output(ci);
+ *   args.add_scalar(some_value);
+ *   TaskOutputTensors outs = rt_submit_aic_task(kernel_id, args);
+ *   const Tensor& y = outs.get_ref(0);
+ */
+template <size_t MaxT, size_t MaxS>
+struct Arg : TaskArgsTpl<TensorRef, uint64_t, MaxT, MaxS, TensorArgType> {
+    using Base = TaskArgsTpl<TensorRef, uint64_t, MaxT, MaxS, TensorArgType>;
+    // Make dependent-base members visible for unqualified use (two-phase lookup
+    // does not search a dependent base in a class template).
+    using Base::scalar_count_;
+    using Base::scalars_;
+    using Base::tags_;
+    using Base::tensor_count_;
+    using Base::tensors_;
+
+    // Minimal-permission: an Arg is built in place and consumed by reference;
+    // it is never copied/moved (it is a large object, and its TensorRef slots
+    // are non-copyable by design).
+    Arg() = default;
+    Arg(const Arg &) = delete;
+    Arg(Arg &&) = delete;
+    Arg &operator=(const Arg &) = delete;
+    Arg &operator=(Arg &&) = delete;
+
+    bool has_error{false};
+    const char *error_msg{nullptr};
+    PTO2LaunchSpec launch_spec;  // SPMD launch parameters (block_num, etc.)
+
+    // Speculative early-dispatch hint (codegen-author set, off by default). When
+    // true, the scheduler may stage this task on an idle core before its producer
+    // finishes, gating execution on the DATA_MAIN_BASE doorbell — only safe when
+    // the author knows the task's data dependencies allow it. Read in-process by
+    // the runtime; never crosses the wire format.
+    bool allow_early_resolve_{false};
+    void set_allow_early_resolve(bool v = true) { allow_early_resolve_ = v; }
+    bool allow_early_resolve() const { return allow_early_resolve_; }
+
+    void clear() {
+        Base::clear();
+#if PTO2_PROFILING
+        dump_arg_selection_.clear();
+#endif
+        explicit_deps_ = nullptr;
+        explicit_dep_count_ = 0;
+        allow_early_resolve_ = false;
+    }
+
+    void reset() {
+        clear();
+        has_error = false;
+        error_msg = nullptr;
+    }
+
+    void set_error(const char *msg) {
+        if (!has_error) {
+            has_error = true;
+            error_msg = msg;
+        }
+    }
+
+    template <typename... Args>
+    void dump(Args &&...args) {
+#if PTO2_PROFILING
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "dump: temporaries are not allowed — pass tensors/scalars already added to this Arg"
+        );
+        static_assert(
+            (is_supported_dump_arg_v<Args> && ...),
+            "dump: all arguments must be Tensor, TensorCreateInfo, or scalar lvalues"
+        );
+        if constexpr (sizeof...(Args) == 0) {
+            mark_all_dump_args();
+        } else {
+            (mark_dump_arg(args), ...);
+        }
+#else
+        ((void)args, ...);
+#endif
+    }
+
+#if PTO2_PROFILING
+    uint64_t dump_arg_mask() const { return dump_arg_selection_.dump_arg_mask(); }
+    uint64_t dump_arg_index_ambiguous_mask() const { return dump_arg_selection_.dump_arg_index_ambiguous_mask(); }
+#else
+    uint64_t dump_arg_mask() const { return 0; }
+    uint64_t dump_arg_index_ambiguous_mask() const { return 0; }
+#endif
+
+    template <typename... Args>
+    void add_input(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) {
+            return;
+        }
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INPUT, tensor_count_++), ...);
+    }
+
+    /// Batch add outputs — all Tensor or all TensorCreateInfo:
+    ///   add_output(ci1, ci2)         — runtime allocates buffers (OUTPUT)
+    ///   add_output(t1, t2)           — write-only existing tensors (OUTPUT_EXISTING)
+    template <typename... Args>
+    void add_output(Args &&...args) {
+        assert_add_tensor_args<true, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) return;
+        if constexpr ((std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...)) {
+            ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT, tensor_count_++), ...);
+        } else {
+            ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING, tensor_count_++),
+             ...);
+        }
+    }
+
+    template <typename... Args>
+    void add_inout(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) {
+            return;
+        }
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INOUT, tensor_count_++), ...);
+    }
+
+    /// No-dependency existing tensor: skips OverlapMap lookup, depends on creator only.
+    template <typename... Args>
+    void add_no_dep(Args &&...args) {
+        assert_add_tensor_args<false, Args...>();
+        if (!check_add_tensor_capacity(static_cast<int32_t>(sizeof...(Args)))) return;
+        ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::NO_DEP, tensor_count_++), ...);
+    }
+
+    /**
+     * Attach an explicit dependency array. The Arg stores (ptr, count) without
+     * copying — the caller's array must outlive the submit (same lifetime rule
+     * as add_input/add_output, which also store pointers).
+     *
+     * count == 0 is a valid "set empty" — it clears any previously stored deps
+     * and returns. This lets callers that build the dep set conditionally pass
+     * the result through unguarded, including in the no-dep branch:
+     *   PTO2TaskId deps[3];
+     *   uint32_t n = 0;
+     *   if (have_prev) deps[n++] = prev;
+     *   if (is_last)   deps[n++] = alloc;
+     *   args.set_dependencies(deps, n);    // safe even if n == 0
+     *
+     * For count > 0, the call is single-shot: a second non-empty call after
+     * deps are already set will fail with set_error(). Use count == 0 first
+     * if you need to re-set.
+     */
+    void set_dependencies(const PTO2TaskId *deps, uint32_t count) {
+        if (count == 0) {
+            explicit_deps_ = nullptr;
+            explicit_dep_count_ = 0;
+            return;
+        }
+        if (deps == nullptr) {
+            set_error("set_dependencies: deps must not be null when count > 0");
+            return;
+        }
+        if (explicit_deps_ != nullptr) {
+            set_error("set_dependencies: may be called at most once per Arg");
+            return;
+        }
+        explicit_deps_ = deps;
+        explicit_dep_count_ = count;
+    }
+
+    uint32_t explicit_dep_count() const { return explicit_dep_count_; }
+
+    PTO2TaskId explicit_dep(uint32_t index) const {
+        always_assert(index < explicit_dep_count_);
+        return explicit_deps_[index];
+    }
+
+    const PTO2TaskId *explicit_deps_data() const { return explicit_deps_; }
+
+    /**
+     * Add scalar values. Types are deduced per argument; each value is
+     * bit-cast to uint64_t for storage. Mixed types are allowed:
+     *
+     *   args.add_scalar(uint64_val);                  // single
+     *   args.add_scalar(3.14f, int32_t(42), 7u);     // mixed batch
+     */
+    template <typename... Args>
+    void add_scalar(Args &&...args) {
+        static_assert(sizeof...(Args) >= 1, "add_scalar: at least one argument required");
+        static_assert((is_supported_scalar_arg_v<Args> && ...), "add_scalar: all types must be arithmetic or enum");
+        if (scalar_count_ + sizeof...(Args) > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        (add_scalar_one(std::forward<Args>(args)), ...);
+    }
+
+    void add_scalars(const uint64_t *values, int count) {
+        if (count < 0 || scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t));
+#if PTO2_PROFILING
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
+#endif
+        scalar_count_ += count;
+    }
+
+    /**
+     * Zero-extend int32 bit patterns into uint64 scalar slots.
+     * Negative values are treated as their unsigned 32-bit representation
+     * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF).
+     * Uses NEON to process 4 elements per iteration on aarch64.
+     */
+    void add_scalars_i32(const int32_t *values, int count) {
+        if (count < 0 || scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        uint64_t *dst = &scalars_[scalar_count_];
+#if defined(__aarch64__)
+        int i = 0;
+        for (; i + 4 <= count; i += 4) {
+            uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t *>(values + i));
+            uint64x2_t lo = vmovl_u32(vget_low_u32(v));
+            uint64x2_t hi = vmovl_u32(vget_high_u32(v));
+            vst1q_u64(dst + i, lo);
+            vst1q_u64(dst + i + 2, hi);
+        }
+        for (; i < count; i++) {
+            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
+        }
+#else
+        for (int i = 0; i < count; i++) {
+            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
+        }
+#endif
+#if PTO2_PROFILING
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
+#endif
+        scalar_count_ += count;
+    }
+
+    /**
+     * Copy scalars from another Arg's scalar array.
+     * Useful when multiple tasks share the same scalar data (e.g., block indices).
+     */
+    void copy_scalars_from(const Arg &src, int src_offset, int count) {
+        if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) {
+            set_error("Source scalar range out of bounds in copy_scalars_from");
+            return;
+        }
+        if (scalar_count_ + count > MaxS) {
+            set_error(scalar_cap_msg());
+            return;
+        }
+        memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t));
+#if PTO2_PROFILING
+        dump_arg_selection_.copy_scalar_dtypes_from(src.dump_arg_selection_, scalar_count_, src_offset, count);
+#endif
+        scalar_count_ += count;
+    }
+
+#if PTO2_PROFILING
+    const uint8_t *scalar_dtypes() const { return dump_arg_selection_.scalar_dtypes(); }
+#else
+    const uint8_t *scalar_dtypes() const { return nullptr; }
+#endif
+
+private:
+    // Dump-arg selection and scalar dtype/source metadata; present only when PTO2_PROFILING is enabled.
+#if PTO2_PROFILING
+    DumpArgSelection dump_arg_selection_;
+#endif
+    const PTO2TaskId *explicit_deps_{nullptr};  // caller-owned array; lifetime must extend through submit
+    uint32_t explicit_dep_count_{0};            // number of entries in explicit_deps_
+#if PTO2_PROFILING
+    template <typename T>
+    static constexpr bool is_supported_dump_arg_v =
+        std::is_same_v<std::decay_t<T>, Tensor> || std::is_same_v<std::decay_t<T>, TensorCreateInfo> ||
+        is_supported_scalar_arg_v<T>;
+#endif
+
+    // Capacity-overflow messages — spell the actual limit (MaxS/MaxT, whatever
+    // the instantiation is) into the text via std::to_string. Built once into a
+    // function-local static so set_error() can hold the const char* safely.
+    static const char *scalar_cap_msg() {
+        static const std::string msg = "Too many scalar args (max " + std::to_string(MaxS) + ")";
+        return msg.c_str();
+    }
+    static const char *tensor_cap_msg() {
+        static const std::string msg = "Too many tensor args (max " + std::to_string(MaxT) + ")";
+        return msg.c_str();
+    }
+
+    template <typename T>
+    void add_scalar_one(T &&value) {
+        scalars_[scalar_count_] = to_u64(value);
+#if PTO2_PROFILING
+        uintptr_t scalar_source_ptr = 0;
+        if constexpr (std::is_lvalue_reference_v<T>) {
+            scalar_source_ptr = reinterpret_cast<uintptr_t>(&value);
+        }
+        dump_arg_selection_.record_scalar_source(
+            scalar_count_, scalar_source_ptr, dtype_of<std::remove_cv_t<std::remove_reference_t<T>>>()
+        );
+#endif
+        scalar_count_++;
+    }
+
+#if PTO2_PROFILING
+    // No-arg dump(): mark every arg already added to this Arg.
+    void mark_all_dump_args() {
+        if (tensor_count_ == 0 && scalar_count_ == 0) {
+            set_error("dump: no arguments added to this Arg");
+            return;
+        }
+        dump_arg_selection_.mark_all(tensor_count_, scalar_count_);
+    }
+
+    void mark_dump_arg(const Tensor &tensor) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].refers_to(&tensor)) {
+                dump_arg_selection_.mark_index(i);
+                return;
+            }
+        }
+        set_error("dump: tensor is not part of this Arg");
+    }
+
+    void mark_dump_arg(const TensorCreateInfo &create_info) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].refers_to(&create_info)) {
+                dump_arg_selection_.mark_index(i);
+                return;
+            }
+        }
+        set_error("dump: TensorCreateInfo is not part of this Arg");
+    }
+
+    template <typename T>
+    std::enable_if_t<is_supported_scalar_arg_v<T>, void> mark_dump_arg(const T &scalar) {
+        uintptr_t ptr = reinterpret_cast<uintptr_t>(&scalar);
+        if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) {
+            return;
+        }
+        set_error("dump: scalar is not part of this Arg");
+    }
+#endif
+
+    // Compile-time validation: arg count, value category (reject temporaries —
+    // a stored &arg would dangle after the call), and element type. Driven
+    // purely by Args, with no runtime state.
+    template <bool is_output, typename... Args>
+    static void assert_add_tensor_args() {
+        static_assert(sizeof...(Args) >= 1, "at least one argument required");
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "temporaries are not allowed — stored pointers would dangle after the call"
+        );
+        if constexpr (is_output) {
+            static_assert(
+                (std::is_same_v<std::decay_t<Args>, Tensor> && ...) ||
+                    (std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...),
+                "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)"
+            );
+        } else {
+            static_assert((std::is_same_v<std::decay_t<Args>, Tensor> && ...), "all arguments must be Tensor");
+        }
+    }
+
+    // Runtime validation: tensor-before-scalar ordering + slot capacity. Records
+    // an error and returns false on violation.
+    bool check_add_tensor_capacity(int32_t count) {
+        if (scalar_count_ != 0) {
+            set_error(
+                "add_input/add_output/add_inout called after add_scalar: "
+                "all tensors must be added before any scalars"
+            );
+            return false;
+        }
+        if (tensor_count_ + count > static_cast<int32_t>(MaxT)) {
+            set_error(tensor_cap_msg());
+            return false;
+        }
+        return true;
+    }
+};
+
+// =============================================================================
+// Task-args layer aliases
+// =============================================================================
+//
+// L0TaskArgs — core-level container used to build and submit tasks inside
+//   orchestration (small, stack-friendly).
+using L0TaskArgs = Arg<MAX_TENSOR_ARGS, MAX_SCALAR_ARGS>;
+
+// L2TaskArgs — chip-level entry-arg holding the orchestration entry's
+// already-allocated inputs (capacity matches ChipStorageTaskArgs).
+// aicpu_orchestration_entry/config receive a const L2TaskArgs&.
+struct L2TaskArgs : Arg<CHIP_MAX_TENSOR_ARGS, CHIP_MAX_SCALAR_ARGS> {
+    // Build from the executor's ChipStorageTaskArgs: each input becomes a
+    // TensorRef pointing at src's Tensor, so `src` must outlive this (on the
+    // executor path src is runtime->orch_args_storage_, alive for the whole run).
+    void create_from_chip_args(const ChipStorageTaskArgs &src) {
+        reset();
+        for (int32_t i = 0; i < src.tensor_count(); ++i) {
+            // Entry inputs are external submit-time tensors; the entry binds them
+            // by const Tensor& (replacing from_tensor_arg's old version/manual_dep
+            // reset), so this invariant is what keeps that binding behavior-preserving.
+            const Tensor &t = src.tensor(i);
+            debug_assert(!t.manual_dep && t.version == 0);
+            add_input(t);
+        }
+        for (int32_t i = 0; i < src.scalar_count(); ++i) {
+            add_scalar(src.scalar(i));
+        }
+    }
+};
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp
deleted file mode 100644
index 057ee43cc..000000000
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Runtime Class - Implementation
- *
- * Task dependency management with circular ready queue.
- * Follows patterns from pto_runtime.c for consistency.
- */
-
-#include "runtime.h"
-
-// =============================================================================
-// Constructor
-// =============================================================================
-
-Runtime::Runtime() {
-    // NOTE: host_api is initialized in InitRuntime() (host-only code)
-    // because the CApi functions don't exist when compiled for device.
-
-    memset(workers, 0, sizeof(workers));
-
-    // Initialize task array (cannot use memset with atomic members)
-    for (int i = 0; i < RUNTIME_MAX_TASKS; i++) {
-        tasks[i].task_id = 0;
-        tasks[i].func_id = 0;
-        tasks[i].num_args = 0;
-        tasks[i].function_bin_addr = 0;
-        tasks[i].core_type = CoreType::AIV;  // Default to AIV
-        tasks[i].fanin = 0;
-        tasks[i].fanout_count = 0;
-        tasks[i].start_time = 0;
-        tasks[i].end_time = 0;
-        memset(tasks[i].args, 0, sizeof(tasks[i].args));
-        memset(tasks[i].fanout, 0, sizeof(tasks[i].fanout));
-    }
-    next_task_id = 0;
-    initial_ready_count = 0;
-    worker_count = 0;
-    aicpu_thread_num = 1;
-    memset(aicpu_allowed_cpus, 0, sizeof(aicpu_allowed_cpus));
-    aicpu_allowed_cpu_count = 0;
-    aicpu_launch_count = 0;
-    tensor_info_storage_ = nullptr;
-    tensor_info_storage_bytes_ = 0;
-    tensor_allocation_storage_ = nullptr;
-    tensor_allocation_storage_bytes_ = 0;
-    tensor_allocation_count_ = 0;
-
-    // Initialize function address mapping
-    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
-        func_id_to_addr_[i] = 0;
-    }
-    memset(tensor_info_offsets_, 0, sizeof(tensor_info_offsets_));
-    memset(tensor_info_counts_, 0, sizeof(tensor_info_counts_));
-}
-
-// =============================================================================
-// Task Management
-// =============================================================================
-
-int Runtime::add_task(uint64_t *args, int num_args, int func_id, CoreType core_type) {
-    // Check bounds
-    if (next_task_id >= RUNTIME_MAX_TASKS) {
-        LOG_ERROR("[Runtime] Task table full (max=%d)", RUNTIME_MAX_TASKS);
-        return -1;
-    }
-
-    if (num_args > RUNTIME_MAX_ARGS) {
-        LOG_ERROR("[Runtime] Too many args (%d > %d)", num_args, RUNTIME_MAX_ARGS);
-        return -1;
-    }
-
-    // Allocate task
-    int task_id = next_task_id++;
-    Task *task = &tasks[task_id];
-
-    // Initialize task fields
-    task->task_id = task_id;
-    task->func_id = func_id;
-    task->num_args = num_args;
-    if (args && num_args > 0) {
-        memcpy(task->args, args, num_args * sizeof(uint64_t));
-    }
-    task->function_bin_addr = 0;  // Will be set by host before copying to device
-    task->core_type = core_type;  // Set core type
-    task->fanin = 0;
-    task->fanout_count = 0;
-    memset(task->fanout, 0, sizeof(task->fanout));
-
-    return task_id;
-}
-
-void Runtime::add_successor(int from_task, int to_task) {
-    // Validate task IDs
-    if (from_task < 0 || from_task >= next_task_id) {
-        LOG_ERROR("[Runtime] Invalid from_task ID %d", from_task);
-        return;
-    }
-
-    if (to_task < 0 || to_task >= next_task_id) {
-        LOG_ERROR("[Runtime] Invalid to_task ID %d", to_task);
-        return;
-    }
-
-    Task *from = &tasks[from_task];
-    Task *to = &tasks[to_task];
-
-    // Add to_task to from_task's fanout
-    if (from->fanout_count >= RUNTIME_MAX_FANOUT) {
-        LOG_ERROR("[Runtime] Fanout overflow for task %d (max=%d)", from_task, RUNTIME_MAX_FANOUT);
-        return;
-    }
-
-    from->fanout[from->fanout_count++] = to_task;
-    to->fanin++;
-}
-
-// =============================================================================
-// Query Methods
-// =============================================================================
-
-Task *Runtime::get_task(int task_id) {
-    if (task_id < 0 || task_id >= next_task_id) {
-        return nullptr;
-    }
-    return &tasks[task_id];
-}
-
-int Runtime::get_task_count() const { return next_task_id; }
-
-int Runtime::get_initial_ready_tasks(int *ready_tasks) {
-    initial_ready_count = 0;
-    for (int i = 0; i < next_task_id; i++) {
-        if (tasks[i].fanin == 0) {
-            initial_ready_tasks[initial_ready_count] = i;
-            if (ready_tasks != nullptr) {
-                ready_tasks[initial_ready_count] = i;
-            }
-            initial_ready_count++;
-        }
-    }
-    return initial_ready_count;
-}
-
-// =============================================================================
-// Utility Methods
-// =============================================================================
-
-void Runtime::print_runtime() const {
-    LOG_DEBUG("\n================================================================================");
-    LOG_DEBUG("[Runtime] Task Runtime Status");
-    LOG_DEBUG("========================================================================");
-    LOG_DEBUG("  Total tasks: %d", next_task_id);
-
-    // Print initially ready tasks
-    LOG_DEBUG("\nInitially Ready Tasks (fanin==0):");
-    LOG_DEBUG("----------------------------------------------------------------------");
-
-    // Build ready tasks string
-    char ready_tasks_str[1024] = "  ";
-    int offset = 2;
-    int ready_count = 0;
-    for (int i = 0; i < next_task_id && offset < 1000; i++) {
-        if (tasks[i].fanin.load() == 0) {
-            if (ready_count > 0) {
-                offset += snprintf(ready_tasks_str + offset, sizeof(ready_tasks_str) - offset, ", ");
-            }
-            offset += snprintf(ready_tasks_str + offset, sizeof(ready_tasks_str) - offset, "%d", i);
-            ready_count++;
-        }
-    }
-    if (ready_count == 0) {
-        snprintf(ready_tasks_str, sizeof(ready_tasks_str), "  (none)");
-    }
-    LOG_DEBUG("%s", ready_tasks_str);
-    LOG_DEBUG("  Count: %d", ready_count);
-
-    LOG_DEBUG("\nTask Table:");
-    LOG_DEBUG("----------------------------------------------------------------------");
-
-    for (int i = 0; i < next_task_id; i++) {
-        const Task *t = &tasks[i];
-
-        // Build fanout string
-        char fanout_str[512];
-        int fo_offset = 0;
-        for (int j = 0; j < t->fanout_count && fo_offset < 500; j++) {
-            fo_offset += snprintf(
-                fanout_str + fo_offset, sizeof(fanout_str) - fo_offset, "%d%s", t->fanout[j],
-                j < t->fanout_count - 1 ? "," : ""
-            );
-        }
-
-        LOG_DEBUG(
-            "  Task %d: func_id=%d, fanin=%d, fanout=%d, args=%d [%s]", i, t->func_id, t->fanin.load(), t->fanout_count,
-            t->num_args, fanout_str
-        );
-    }
-
-    LOG_DEBUG("========================================================================");
-}
-
-// host_build_graph uploads a variable-length prefix of the Runtime object:
-// everything up to and including the populated task slots. tasks[] is the last
-// device-read member, so this ships every device-read field while truncating the
-// unpopulated task tail and the host-only members declared after tasks[]. See
-// the static_asserts in runtime.h.
-size_t runtime_device_copy_size(const Runtime &rt) {
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-    return offsetof(Runtime, tasks) + static_cast<size_t>(rt.get_task_count()) * sizeof(Task);
-#pragma GCC diagnostic pop
-}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 7bc1f438e..500315d8e 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -8,34 +8,32 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-
 /**
- * Runtime Class - Task Dependency Runtime Management
- *
- * This is a simplified, standalone runtime class for managing task
- * dependencies. Tasks are stored in a fixed-size array with compile-time
- * configurable bounds. Each task has:
- * - Unique ID (array index)
- * - Arguments (uint64_t array)
- * - Fanin (predecessor count)
- * - Fanout (array of successor task IDs)
+ * Runtime Class - Device Execution and Handshake Control
  *
- * The runtime maintains a ready queue for tasks with fanin == 0.
+ * This class manages device-side execution through the AICPU-AICore
+ * handshake protocol. Task graph construction is handled by PTO2Runtime;
+ * this class only handles:
+ * - Handshake buffers for AICPU-AICore communication
+ * - Execution parameters (block_dim, aicpu_thread_num)
+ * - Tensor pair management for host-device memory tracking
+ * - Device orchestration state (gm_sm_ptr_, orch_args_)
+ * - Function address mapping (func_id_to_addr_)
  *
- * Based on patterns from pto_runtime.h/c but simplified for educational
- * and lightweight scheduling use cases.
+ * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler.
+ * At dispatch time, build_payload() copies tensor pointers and scalars from
+ * the task payload into the per-core args[], populates SPMD context, then
+ * signals AICore via DATA_MAIN_BASE.
  */
 
-#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_
-#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
 
 #include <stdbool.h>
-#include <stddef.h>  // for offsetof (layout static_asserts)
 #include <stdint.h>
 #include <stdio.h>   // for fprintf, printf
 #include <string.h>  // for memset
 
-#include <atomic>
 #include <vector>
 
 #include "common/core_type.h"
@@ -43,35 +41,21 @@
 #include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "aicpu/platform_aicpu_affinity.h"  // MAX_GATE_THREADS (aicpu_allowed_cpus bound)
-#include "pto_runtime2_types.h"
-#include "tensor_info.h"
-
-// Logging macros using unified logging interface
-#include "common/unified_log.h"
+#include "pto2_dispatch_payload.h"
+#include "task_args.h"
 
 // =============================================================================
 // Configuration Macros
 // =============================================================================
 
-#ifndef RUNTIME_MAX_TASKS
-#define RUNTIME_MAX_TASKS 131072
-#endif
-
-#ifndef RUNTIME_MAX_ARGS
-#define RUNTIME_MAX_ARGS 16
-#endif
-
-#ifndef RUNTIME_MAX_FANOUT
-#define RUNTIME_MAX_FANOUT 128
-#endif
-
-#ifndef RUNTIME_MAX_WORKER
-#define RUNTIME_MAX_WORKER PLATFORM_MAX_CORES_PER_THREAD
-#endif
-
-#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_ARGS 128
+#define RUNTIME_MAX_WORKER 72  // 24 AIC + 48 AIV cores
 #define RUNTIME_MAX_FUNC_ID 1024
-#endif
+#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 4MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
+
+// Default ready queue shards: one shard per worker thread (total minus orchestrator)
+constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
 
 // =============================================================================
 // Data Structures
@@ -86,8 +70,8 @@
  * Protocol State Machine:
  * 1. Initialization: AICPU sets aicpu_ready=1
  * 2. Acknowledgment: AICore sets aicore_done=core_id+1
- * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE with the task_id after publishing Task*
- * 4. Task Execution: AICore reads the task and executes
+ * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload
+ * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes
  * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion
  * 6. Shutdown: AICPU sets control=1, AICore exits
  *
@@ -105,14 +89,13 @@
  * Field Access Patterns:
  * - aicpu_ready: Written by AICPU, read by AICore
  * - aicore_done: Written by AICore, read by AICPU
- * - task: Written by AICPU, read by AICore (0 = no task assigned)
+ * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchPayload*)
  * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV)
- * - physical_core_id: Written by AICPU, read by AICore (physical core ID)
  */
 struct Handshake {
     volatile uint32_t aicpu_ready;        // AICPU ready signal: 0=not ready, 1=ready
     volatile uint32_t aicore_done;        // AICore ready signal: 0=not ready, core_id+1=ready
-    volatile uint64_t task;               // Task pointer: 0=no task, non-zero=Task* address
+    volatile uint64_t task;               // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused
     volatile CoreType core_type;          // Core type: CoreType::AIC or CoreType::AIV
     volatile uint32_t physical_core_id;   // Physical core ID
     volatile uint32_t aicpu_regs_ready;   // AICPU register init done: 0=pending, 1=done
@@ -127,117 +110,113 @@ struct TensorPair {
     void *host_ptr;
     void *dev_ptr;
     size_t size;
+    // false for read-only INPUT tensors: they are never written by the kernel,
+    // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown
+    // keep the safe default of copying back.
+    bool needs_copy_back = true;
 };
 
 /**
- * Task entry in the runtime
+ * Host API function pointers for device memory operations live in the shared
+ * common/host_api.h (included at the top of this header) so the field set
+ * stays identical across runtime variants (tensormap_and_ringbuffer /
+ * host_build_graph) and arches; the platform layer builds one const table and
+ * passes it by address. hbg leaves the trb-only fields (prebuilt-arena cache)
+ * unset — see host_api.h.
+ */
+
+/**
+ * Task structure - Compatibility stub for platform layer
  *
- * Each task has a unique ID (its index in the task array), arguments,
- * and dependency information (fanin/fanout).
+ * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
+ * This stub exists only for API compatibility with device_runner.cpp.
+ * Since get_task_count() returns 0, this struct is never actually used.
  */
-typedef struct {
-    int task_id;                      // Unique task identifier
-    int func_id;                      // Function identifier
-    uint64_t args[RUNTIME_MAX_ARGS];  // Task arguments
-    int num_args;                     // Number of valid arguments
-
-    // Runtime function pointer address (NEW)
-    // This is the GM address where the kernel binary resides
-    // It's cast to a function pointer at runtime: (KernelFunc)function_bin_addr
-    uint64_t function_bin_addr;  // Address of kernel in device GM memory
-
-    // Core type specification
-    // Specifies which core type this task should run on
-    CoreType core_type;  // CoreType::AIC or CoreType::AIV
-
-    // Dependency tracking (using PTO runtime terminology)
-    std::atomic<int> fanin;          // Number of predecessors (dependencies)
-    int fanout[RUNTIME_MAX_FANOUT];  // Successor task IDs
-    int fanout_count;                // Number of successors
-
-    // DFX-specific fields
-    uint64_t start_time;  // Start time of the task
-    uint64_t end_time;    // End time of the task
-} Task;
+struct Task {
+    int func_id;
+    uint64_t function_bin_addr;
+};
 
 // =============================================================================
 // Runtime Class
 // =============================================================================
 
 /**
- * Runtime class for task dependency management
- *
- * Maintains a fixed-size array of tasks and uses a Queue for ready tasks.
- * Tasks are allocated monotonically and never reused within the same
- * runtime instance.
+ * Runtime class for device execution and handshake control
  *
- * Dependencies are managed manually via add_successor().
+ * This class manages AICPU-AICore communication through handshake buffers.
+ * Task graph construction is handled by PTO2Runtime; this class only handles
+ * execution control and device orchestration state.
  */
 class Runtime {
 public:
-    // ===================== Device-read prefix =====================
-    // Everything from here through `tasks[]` is uploaded to the device. The
-    // ordering is load-bearing: AICore reads `workers[]` at offset 0 and
-    // `tasks[i]` by offset, and `tasks[]` is the LAST device-read member so a
-    // variable-length H2D copy of `offsetof(tasks) + get_task_count()*sizeof(Task)`
-    // (see runtime_device_copy_size) keeps every preceding field at its fixed
-    // offset while shipping only the populated task slots. All fields device
-    // code may read must therefore precede `tasks[]`; the offsetof static_asserts
-    // after the class enforce this. These are public (not hidden behind the
-    // accessors) because they form the host/device ABI, mirroring trb's
-    // DeviceRuntimeLaunchDesc.
+    // Handshake buffers for AICPU-AICore communication
     Handshake workers[RUNTIME_MAX_WORKER];  // Worker (AICore) handshake buffers
     int worker_count;                       // Number of active workers
 
-    // Total AICPU threads launched on this run. host_build_graph has no
-    // orchestrator/scheduler split — every thread dispatches tasks in
-    // round-robin across the assigned cores. See AicpuExecutor::init.
+    // Execution parameters for AICPU scheduling.
+    //
+    // aicpu_thread_num is the *total* AICPU thread count launched on this run
+    // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
+    // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
+    // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
+    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
     int aicpu_thread_num;
+    int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
-    // Filter-style affinity gate input (a2a3 onboard). Host fills these before
-    // launch from AICPU OCCUPY; the device gate keeps threads whose
-    // sched_getcpu() lands on one of the cpu_ids. Read device-side in kernel.cpp,
-    // so it must precede `tasks[]`.
+    // Filter-style affinity gate input (a2a3 onboard). Host fills these
+    // before launch from AICPU OCCUPY, and the device gate keeps threads whose
+    // sched_getcpu() lands on one of the cpu_ids. The array position is the
+    // deterministic exec_idx used by AicpuExecutor for sched/orch role
+    // assignment; the highest active index is the orchestrator slot.
     int32_t aicpu_allowed_cpus[MAX_GATE_THREADS];
     int32_t aicpu_allowed_cpu_count;
     int32_t aicpu_launch_count;
 
-    // Next available task ID. Device-read via get_task_count() (AICPU task loop
-    // bound), so it lives in the prefix.
-    int next_task_id;
-
-    // Function address mapping (for API compatibility with rt2). Device-read
-    // under PTO2_PROFILING (dump-args path), so it lives in the prefix.
+    // PTO2 integration: kernel_id -> GM function_bin_addr mapping
+    // NOTE: Made public for direct access from aicore code
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
-    // Tensor info metadata for tensor dump. Device-read via get_tensor_info()
-    // under PTO2_PROFILING, so it lives in the prefix.
-    void *tensor_info_storage_;
-    uint64_t tensor_info_storage_bytes_;
-    uint32_t tensor_info_offsets_[RUNTIME_MAX_TASKS];
-    uint16_t tensor_info_counts_[RUNTIME_MAX_TASKS];
-
-    // Device allocation ranges used to recover tensor buffer addresses from
-    // task.args[]. Device-read via is_tensor_buffer_addr() under PTO2_PROFILING,
-    // so it lives in the prefix.
-    void *tensor_allocation_storage_;
-    uint64_t tensor_allocation_storage_bytes_;
-    uint32_t tensor_allocation_count_;
-
-    // Task storage — LAST device-read member. The variable-length H2D copy stops
-    // at offsetof(tasks) + get_task_count()*sizeof(Task); the unpopulated tail is
-    // never read device-side (get_task() bounds-checks task_id < next_task_id,
-    // and AICore only touches AICPU-dispatched ids).
-    Task tasks[RUNTIME_MAX_TASKS];  // Fixed-size task array
+    // Orchestrator-to-scheduler transition control
+    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
+    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
+    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
+    bool orch_to_sched;
 
-private:
-    // ===================== Host-only tail =====================
-    // Never crosses to the device (physically after `tasks[]`, excluded from the
-    // H2D copy). See also active_callable_id_ / tensor_pairs_ declared below.
+    // Total tasks submitted by the host orchestrator — handed to the scheduler
+    // (SchedulerContext::on_orchestration_done) in place of latching the SM ring
+    // head on device. host_build_graph builds the whole graph on the host, so
+    // the boot thread reads this instead of counting SM ring heads.
+    int32_t host_total_tasks;
 
-    // Initial ready tasks (computed once, read-only after)
-    int initial_ready_tasks[RUNTIME_MAX_TASKS];
-    int initial_ready_count;
+private:
+    // Kernel binary tracking for cleanup
+    int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID];
+    int registered_kernel_count_;
+
+    void *gm_sm_ptr_;                        // GM pointer to PTO2 shared memory (device)
+    void *gm_heap_ptr_;                      // GM heap for orchestrator output buffers (device)
+    void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
+    ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
+
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; AICPU reads them in the boot path to skip
+    // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer
+    // (already populated by runtime_init_data_from_layout + wire on host).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
+    // Orchestration metadata set by the platform host (DeviceRunner) when
+    // registering a callable. host_build_graph runs the orchestrator on the
+    // host, so the device side no longer reads the SO bytes / symbol names —
+    // but the platform registration path still writes them through these
+    // setters (shared with tensormap_and_ringbuffer), so the fields and their
+    // setters are part of the platform↔runtime ABI and must stay.
+    uint64_t dev_orch_so_addr_;
+    uint64_t dev_orch_so_size_;
+    int32_t active_callable_id_;
+    char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+    char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
 
 public:
     /**
@@ -246,12 +225,12 @@ class Runtime {
     Runtime();
 
     // =========================================================================
-    // Accessors for the launch/affinity fields.
+    // Accessors for the execution-parameter fields
     //
-    // Mirror the trb Runtime's accessor surface (which forwards into its `dev`
-    // sub-struct) so the shared platform layer compiles against either variant.
-    // host_build_graph keeps these fields as flat public members, so the
-    // accessors just return them; layout and sizeof are unchanged.
+    // These exist with identical signatures on the tensormap_and_ringbuffer
+    // Runtime so the shared platform layer (device_runner*.cpp, kernel.cpp) can
+    // compile against either variant. hbg stores the fields flat (trb keeps
+    // them in a `dev` sub-struct); the accessors hide that difference.
     // =========================================================================
 
     int get_worker_count() const { return worker_count; }
@@ -267,229 +246,80 @@ class Runtime {
     size_t aicpu_allowed_cpus_capacity() const { return sizeof(aicpu_allowed_cpus) / sizeof(aicpu_allowed_cpus[0]); }
 
     // =========================================================================
-    // Task Management
-    // =========================================================================
-
-    /**
-     * Allocate a new task with the given arguments
-     *
-     * @param args      Array of uint64_t arguments
-     * @param num_args  Number of arguments (must be <= RUNTIME_MAX_ARGS)
-     * @param func_id   Function identifier
-     * @param core_type Core type for this task (CoreType::AIC or CoreType::AIV)
-     * @return Task ID (>= 0) on success, -1 on failure
-     */
-    int add_task(uint64_t *args, int num_args, int func_id, CoreType core_type = CoreType::AIC);
-
-    /**
-     * Add a dependency edge: from_task -> to_task
-     *
-     * This adds to_task to from_task's fanout array and increments
-     * to_task's fanin counter.
-     *
-     * @param from_task  Producer task ID
-     * @param to_task    Consumer task ID (depends on from_task)
-     */
-    void add_successor(int from_task, int to_task);
-
-    // =========================================================================
-    // Query Methods
+    // Performance Profiling
     // =========================================================================
 
-    /**
-     * Get a pointer to a task by ID
-     *
-     * @param task_id  Task ID to query
-     * @return Pointer to task, or nullptr if invalid ID
-     */
-    Task *get_task(int task_id);
-
-    /**
-     * Get the total number of tasks in the runtime
-     *
-     * @return Total task count
-     */
-    int get_task_count() const;
-
-    /**
-     * Get initially ready tasks (fanin == 0) as entry point for execution
-     *
-     * This scans all tasks and populates the provided array with task IDs
-     * that have no dependencies (fanin == 0). The runtime can use this
-     * as the starting point for task scheduling.
-     *
-     * @param ready_tasks  Array to populate with ready task IDs (can be
-     * nullptr)
-     * @return Number of initially ready tasks
-     */
-    int get_initial_ready_tasks(int *ready_tasks);
-
     // =========================================================================
-    // Utility Methods
+    // Shared-memory / orchestration argument plumbing
     // =========================================================================
 
+    void *get_gm_sm_ptr() const;
+    void *get_gm_heap_ptr() const;
+    const ChipStorageTaskArgs &get_orch_args() const;
+    void set_gm_sm_ptr(void *p);
+    void set_gm_heap(void *p);
+    void set_slot_states_ptr(void *p);
+    void set_orch_args(const ChipStorageTaskArgs &args);
+
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
+    // Orchestration metadata written by the platform host (DeviceRunner) at
+    // callable registration. Shared ABI with tensormap_and_ringbuffer; the
+    // host_build_graph device side no longer reads them (host-orch builds the
+    // graph on the host), so only the setters and the get_active_callable_id()
+    // getter that the platform reads are exposed.
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
+    void set_active_callable_id(int32_t callable_id);
+    int32_t get_active_callable_id() const;
+    void set_device_orch_func_name(const char *name);
+    void set_device_orch_config_name(const char *name);
+
+    uint64_t get_function_bin_addr(int func_id) const;
+    void set_function_bin_addr(int func_id, uint64_t addr);
     /**
-     * Print the runtime structure to stdout
-     *
-     * Shows task table with fanin/fanout information.
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_callable_to_runtime so prepared kernel
+     * binaries are not freed by validate_runtime_impl across runs.
      */
-    void print_runtime() const;
-
-    // =========================================================================
-    // Tensor Info Metadata
-    // =========================================================================
+    void replay_function_bin_addr(int func_id, uint64_t addr);
 
-    void set_tensor_info_storage(void *ptr, uint64_t bytes) {
-        tensor_info_storage_ = ptr;
-        tensor_info_storage_bytes_ = bytes;
-    }
-
-    void clear_tensor_info_storage() {
-        tensor_info_storage_ = nullptr;
-        tensor_info_storage_bytes_ = 0;
-    }
-
-    void set_tensor_info_range(int task_id, uint32_t offset, uint16_t count) {
-        if (task_id < 0 || task_id >= RUNTIME_MAX_TASKS) return;
-        tensor_info_offsets_[task_id] = offset;
-        tensor_info_counts_[task_id] = count;
-    }
-
-    const TensorInfo *get_tensor_info(int task_id, int *count) const {
-        if (count != nullptr) {
-            *count = 0;
-        }
-        if (task_id < 0 || task_id >= RUNTIME_MAX_TASKS || tensor_info_storage_ == nullptr) {
-            return nullptr;
-        }
-        uint16_t tensor_info_count = tensor_info_counts_[task_id];
-        if (tensor_info_count == 0) {
-            return nullptr;
-        }
-        if (count != nullptr) {
-            *count = static_cast<int>(tensor_info_count);
-        }
-        const TensorInfo *base = reinterpret_cast<const TensorInfo *>(tensor_info_storage_);
-        return base + tensor_info_offsets_[task_id];
-    }
-
-    void *get_tensor_info_storage() const { return tensor_info_storage_; }
-
-    uint64_t get_tensor_info_storage_bytes() const { return tensor_info_storage_bytes_; }
-
-    void set_tensor_allocation_storage(void *ptr, uint32_t count, uint64_t bytes) {
-        tensor_allocation_storage_ = ptr;
-        tensor_allocation_count_ = count;
-        tensor_allocation_storage_bytes_ = bytes;
-    }
-
-    void clear_tensor_allocation_storage() {
-        tensor_allocation_storage_ = nullptr;
-        tensor_allocation_count_ = 0;
-        tensor_allocation_storage_bytes_ = 0;
-    }
-
-    bool is_tensor_buffer_addr(uint64_t addr) const {
-        if (tensor_allocation_storage_ == nullptr || tensor_allocation_count_ == 0) {
-            return false;
-        }
-        const TensorAllocationInfo *allocations =
-            reinterpret_cast<const TensorAllocationInfo *>(tensor_allocation_storage_);
-        for (uint32_t i = 0; i < tensor_allocation_count_; i++) {
-            if (allocations[i].contains(addr)) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    void *get_tensor_allocation_storage() const { return tensor_allocation_storage_; }
-
-    uint64_t get_tensor_allocation_storage_bytes() const { return tensor_allocation_storage_bytes_; }
-
-    // =========================================================================
-    // Device Orchestration (stub for API compatibility)
-    // =========================================================================
-
-    /**
-     * Set PTO2 shared memory pointer (stub for host_build_graph).
-     * This is a no-op for host orchestration; only used by rt2.
-     */
-    void set_gm_sm_ptr(void *) { /* no-op */ }
-
-    /**
-     * Get function binary address by func_id.
-     * Used by platform layer to resolve kernel addresses.
-     */
-    uint64_t get_function_bin_addr(int func_id) const {
-        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
-        return func_id_to_addr_[func_id];
-    }
-
-    /**
-     * Replay a previously-uploaded kernel address onto a fresh Runtime.
-     * Used by DeviceRunner::bind_callable_to_runtime to rebind prepared
-     * kernel binaries onto the runtime before each simpler_run invocation.
-     */
-    void replay_function_bin_addr(int func_id, uint64_t addr) {
-        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
-        func_id_to_addr_[func_id] = addr;
-    }
+    int get_registered_kernel_count() const;
+    int get_registered_kernel_func_id(int index) const;
+    void clear_registered_kernels();
 
     // =========================================================================
-    // Host-only state (not copied to device)
+    // Deprecated API (for platform compatibility, always returns 0/nullptr)
+    // Task graph is now managed by PTO2Runtime, not Runtime
     // =========================================================================
 
-    // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads
-    // `active_callable_id_`; the field exists for parity with the shared
-    // platform layer (DeviceRunner stamps it on every run via
-    // set_active_callable_id).
-    int32_t active_callable_id_{-1};
+    /** @deprecated Task count is now in PTO2 shared memory */
+    int get_task_count() const { return 0; }
 
-    void set_active_callable_id(int32_t callable_id) { active_callable_id_ = callable_id; }
-    int32_t get_active_callable_id() const { return active_callable_id_; }
+    /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */
+    Task *get_task(int) { return nullptr; }
 
     // Host-side tensor ledger for D2H copy-back at finalize. Populated by
-    // runtime_maker.cpp from orch_args at bind time; iterated in
-    // validate_runtime_impl. Not read by AICPU/AICore and — being after
-    // `tasks[]` in the host-only tail — is not uploaded to the device at all.
-    // No fixed cap.
+    // runtime_maker.cpp from orch_args at bind time, then iterated in
+    // validate_runtime_impl. Not read by AICPU/AICore — the device-side
+    // Runtime image carries the std::vector control block as harmless
+    // garbage that is never dereferenced there. No fixed cap — grows with
+    // the chip-level entry-tensor count.
     std::vector<TensorPair> tensor_pairs_;
 };
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-// Layout invariants for the variable-length H2D copy (runtime_device_copy_size).
-// workers[] must be first (AICore reads runtime->workers[block_idx] at offset 0),
-// and tasks[] must be the highest-offset device-read member so copying
-// offsetof(tasks) + n*sizeof(Task) covers every field the device reads while
-// truncating only the unpopulated task tail. The host-only tail
-// (initial_ready_*, registered_*, active_callable_id_, tensor_pairs_) sits after
-// tasks[] and is excluded from the copy. (offsetof over Runtime is technically
-// non-standard-layout due to the std::vector tail; the pragma silences that —
-// the device-read prefix is all standard-layout scalars/arrays.)
-static_assert(offsetof(Runtime, workers) == 0, "workers[] must be first: AICore reads offset 0");
-static_assert(
-    offsetof(Runtime, tasks) > offsetof(Runtime, aicpu_allowed_cpus),
-    "tasks[] must follow the affinity gate array (device-read at fixed offset)"
-);
-static_assert(
-    offsetof(Runtime, tasks) > offsetof(Runtime, func_id_to_addr_),
-    "tasks[] must follow func_id_to_addr_ (device-read under profiling)"
-);
-static_assert(
-    offsetof(Runtime, tasks) > offsetof(Runtime, tensor_allocation_count_),
-    "tasks[] must be the last device-read member (all profiling metadata precedes it)"
-);
-#pragma GCC diagnostic pop
-
-// Number of bytes of the Runtime image copied to the device. host_build_graph
-// uploads a variable-length prefix: everything from offset 0 up to and including
-// the populated task slots, i.e. offsetof(Runtime, tasks) + n*sizeof(Task).
-// tasks[] is the last device-read member (static_asserts above), so this ships
-// every device-read field while truncating the unpopulated task tail and the
-// host-only members after tasks[]. Mirrors the trb declaration so the shared
-// device_runner_helpers.cpp copy path is runtime-agnostic.
+// Number of bytes of the Runtime image that must be copied to the device.
+// host_build_graph returns sizeof(Runtime) (its device image is the whole
+// object); trb returns sizeof(DeviceRuntimeLaunchDesc). Defined per-runtime so
+// the shared device_runner_helpers.cpp copy path stays runtime-agnostic.
 size_t runtime_device_copy_size(const Runtime &rt);
 
-#endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.cpp
new file mode 100644
index 000000000..08be1aee1
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Scheduler Implementation
+ *
+ * Implements scheduler state management, ready queues, and task lifecycle.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_scheduler.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include "common/unified_log.h"
+
+#if PTO2_PROFILING
+// Weak fallbacks for host/UT builds that don't link the scope_stats collector.
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
+extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
+#endif
+
+// =============================================================================
+// Scheduler Profiling Counters
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+#include "common/platform_config.h"
+
+uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
+uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {};
+
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
+    PTO2SchedProfilingData d;
+    d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0);
+    d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0);
+    d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0);
+    d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0);
+    d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0);
+    d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0);
+    d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0);
+    d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0);
+    d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0);
+    d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0);
+    d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0);
+    d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0);
+    d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0);
+    return d;
+}
+#endif
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2SchedulerState::print_stats() {
+    PTO2SchedulerState *sched = this;
+    LOG_INFO_V0("=== Scheduler Statistics ===");
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (sched->ring_sched_state.last_task_alive > 0) {
+            LOG_INFO_V0("Ring %d:", r);
+            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_state.last_task_alive);
+            auto &dp = sched->ring_sched_state.dep_pool;
+            if (dp.top > 0) {
+                LOG_INFO_V0(
+                    "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
+                    dp.high_water, dp.capacity
+                );
+            }
+        }
+    }
+#if PTO2_SCHED_PROFILING
+    LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
+    LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
+#endif
+    LOG_INFO_V0("============================");
+}
+
+void PTO2SchedulerState::print_queues() {
+    PTO2SchedulerState *sched = this;
+    LOG_INFO_V0("=== Ready Queues ===");
+
+    const char *shape_names[] = {"AIC", "AIV", "MIX"};
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
+    }
+    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
+
+    LOG_INFO_V0("====================");
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.h
new file mode 100644
index 000000000..cb40adc78
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/pto_scheduler.h
@@ -0,0 +1,1493 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO Runtime2 - Scheduler Interface
+ *
+ * The Scheduler is responsible for:
+ * 1. Maintaining per-resource-shape ready queues
+ * 2. Tracking task state (PENDING -> COMPLETED; no COMPLETED -> CONSUMED flip in this runtime)
+ * 3. Managing fanin/fanout refcounts for dependency resolution
+ * 4. (tensormap only) advancing last_task_alive for heap reclamation -- no execution-time reclaim here
+ * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete)
+ *
+ * The Scheduler runs on Device AI_CPU and processes:
+ * - Task state transitions based on fanin_refcount
+ * - Buffer lifecycle based on fanout_refcount
+ * - Dispatch from the ready queue seeded by the host orchestrator (no ring pointer advancement)
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "common/core_type.h"
+#include "utils/device_arena.h"
+#include "aicpu/platform_regs.h"  // get_reg_ptr / RegId for the speculative doorbell
+#include "pto_async_wait.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+#include "aicpu/device_time.h"  // get_sys_cnt_aicpu (weak; used by spec doorbell timing too)
+#if PTO2_SCHED_PROFILING
+#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1
+#define PTO2_SCHED_CYCLE_LAP(acc)   \
+    do {                            \
+        _st1 = get_sys_cnt_aicpu(); \
+        acc += (_st1 - _st0);       \
+        _st0 = _st1;                \
+    } while (0)
+#endif
+
+// =============================================================================
+// Ready Queue (Lock-free bounded MPMC — Vyukov design)
+// =============================================================================
+
+/**
+ * Per-slot entry: sequence counter for ABA safety + task payload
+ */
+struct PTO2ReadyQueueSlot {
+    std::atomic<int64_t> sequence;
+    PTO2TaskSlotState *slot_state;
+};
+
+/**
+ * Thread-local ready buffer for local-first dispatch optimization.
+ *
+ * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1).
+ * Initialized once before the scheduling loop; must be empty at
+ * the start of each iteration (verified by always_assert).
+ *
+ * Phase 1 fills per-CoreType buffers via on_task_complete().
+ * The dispatch stage drains them local-first via get_ready_tasks_batch,
+ * with any remaining tasks pushed to the global ready queue.
+ */
+// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
+static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
+
+struct PTO2LocalReadyBuffer {
+    PTO2TaskSlotState **slot_states = nullptr;
+    int count = 0;
+    int capacity = 0;
+
+    void reset(PTO2TaskSlotState **buf, int cap) {
+        slot_states = buf;
+        count = 0;
+        capacity = cap;
+    }
+
+    bool try_push(PTO2TaskSlotState *s) {
+        if (slot_states && count < capacity) {
+            slot_states[count++] = s;
+            return true;
+        }
+        return false;
+    }
+
+    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
+};
+
+/**
+ * Lock-free bounded MPMC queue (Dmitry Vyukov design)
+ *
+ * Key properties:
+ * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
+ * - Per-slot sequence counter prevents ABA problem
+ * - Empty queue pop returns immediately (single atomic load, no lock)
+ * - CAS contention is split: producers only touch enqueue_pos,
+ *   consumers only touch dequeue_pos
+ */
+struct alignas(64) PTO2ReadyQueue {
+    PTO2ReadyQueueSlot *slots;
+    uint64_t capacity;
+    uint64_t mask;        // capacity - 1
+    char _pad0[64 - 24];  // Pad to own cache line
+
+    std::atomic<uint64_t> enqueue_pos;
+    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    std::atomic<uint64_t> dequeue_pos;
+    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    uint64_t size() {
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        return (e >= d) ? (e - d) : 0;
+    }
+
+    bool push(PTO2TaskSlotState *slot_state) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    break;
+                }
+            } else if (diff < 0) {
+                return false;  // Queue full
+            }
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+
+    // Batch push: reserve `count` slots with one CAS once every target slot passes the Vyukov sequence check.
+    // Never reports "full": spins until all `count` slots are free, so callers must keep count <= capacity.
+    void push_batch(PTO2TaskSlotState **items, int count) {
+        if (count <= 0) return;  // a negative count would move enqueue_pos backwards in the CAS below
+
+        uint64_t pos;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            bool ready = true;
+            for (int i = 0; i < count; i++) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + i);
+                if (diff != 0) {  // slot not free yet, or pos is stale: re-read and retry
+                    ready = false;
+                    break;
+                }
+            }
+            if (!ready) {
+                continue;
+            }
+            if (enqueue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;  // slots [pos, pos + count) are now owned by this producer
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            slot->slot_state = items[i];
+            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);  // publish
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            atomic_ops += 2;  // enqueue_pos.load + sequence.load
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                atomic_count += atomic_ops;  // charge the probes even when full (matches profiling pop())
+                return false;                // Queue full
+            } else {
+                contended = true;  // diff > 0: slot not yet released, spin
+            }
+        }
+        atomic_count += atomic_ops + 1;  // +1: final sequence.store below
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+#endif
+
+    PTO2TaskSlotState *pop() {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    ))
+                    break;
+            } else if (diff < 0) {
+                return nullptr;  // Queue empty
+            }
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+
+#if PTO2_SCHED_PROFILING
+    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            atomic_ops += 2;  // dequeue_pos.load + sequence.load
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                atomic_count += atomic_ops;
+                return nullptr;  // Queue empty
+            } else {
+                contended = true;
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+#endif
+
+    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
+    // Returns actual number of items popped (may be less than max_count).
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t pos;
+        int count;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                count = -1;
+                break;
+            }
+            if (count == 0) return 0;
+            if (count < 0) continue;
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        int count;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            atomic_ops++;  // dequeue_pos.load
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                atomic_ops++;  // sequence.load
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                contended = true;
+                count = -1;
+                break;
+            }
+            if (count == 0) {
+                atomic_count += atomic_ops;
+                return 0;
+            }
+            if (count < 0) {
+                continue;
+            }
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                atomic_ops++;  // successful CAS
+                break;
+            }
+            contended = true;
+            atomic_ops++;  // failed CAS
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+            atomic_ops++;  // sequence.store
+        }
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+        return count;
+    }
+#endif
+};
+
+// Cold-path ready queue operations, defined out of line (NOTE(review): this runtime's
+// pto_scheduler.cpp does not define them -- confirm where they live). Declared as non-member
+// so PTO2ReadyQueue stays a POD-like struct. Storage is owned by the caller-supplied arena.
+//   ready_queue_reserve_layout: declare the slots[] region on the arena (must precede commit)
+//   ready_queue_init_data_from_layout: initialize sequence counters, positions, capacity, mask
+//   ready_queue_wire_arena_pointers: bind the slots pointer from arena.region_ptr(off)
+//   ready_queue_destroy: forget the slots pointer (arena owns the buffer)
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
+void ready_queue_destroy(PTO2ReadyQueue *queue);
+
+// =============================================================================
+// SPSC Queue (Single-Producer Single-Consumer, wait-free)
+// =============================================================================
+//
+// Bounded ring buffer optimized for the wiring queue use case:
+//   - Producer: orchestrator thread (push)
+//   - Consumer: scheduler thread 0 (pop_batch)
+//
+// Design based on Rigtorp's cached-index technique: each side caches
+// the other's index locally, avoiding cross-core cache line bouncing
+// on the hot path. Only when the local cache says "full" or "empty"
+// does the thread issue an acquire load on the remote index.
+//
+// Memory layout: 5 cache-line-aligned fields ensure zero false sharing.
+
+struct alignas(64) PTO2SpscQueue {
+    // --- Producer cache lines (orchestrator thread) ---
+    alignas(64) std::atomic<uint64_t> head_{0};
+    alignas(64) uint64_t tail_cached_{0};
+
+    // --- Consumer cache lines (scheduler thread 0) ---
+    alignas(64) std::atomic<uint64_t> tail_{0};
+    alignas(64) uint64_t head_cached_{0};
+
+    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
+    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
+    uint64_t mask_{0};
+
+    // Padding to exactly 5 cache lines
+    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
+
+    // Reserve the backing buffer region on the supplied arena. Returns the
+    // region offset, to be passed to init_data_from_layout() and
+    // wire_arena_pointers() after the arena is committed. Cache-line aligned:
+    // the buffer is shared between the orchestrator (push) and scheduler
+    // thread 0 (pop_batch), so its base must not false-share with neighbors.
+    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) {
+        return arena.reserve(capacity * sizeof(uintptr_t), PTO2_ALIGN_SIZE);
+    }
+
+    // Writes everything except the arena-internal `buffer_` pointer field
+    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
+    // image without storing a host address in buffer_; the AICPU wires
+    // buffer_ at boot via wire_arena_pointers().
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
+        for (uint64_t i = 0; i < capacity; i++)
+            buf[i] = nullptr;
+        mask_ = capacity - 1;
+        head_.store(0, std::memory_order_relaxed);
+        tail_.store(0, std::memory_order_relaxed);
+        tail_cached_ = 0;
+        head_cached_ = 0;
+        return true;
+    }
+
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
+    // Arena owns the buffer; here we only forget our pointer.
+    void destroy() { buffer_ = nullptr; }
+
+    // Push one item (producer only). Returns false if queue is full.
+    // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the
+    // effective usable capacity is capacity-1 (one slot is wasted as a
+    // sentinel to distinguish full from empty). uint64_t wrapping is safe
+    // since head and tail are monotonically increasing and subtraction
+    // wraps correctly.
+    bool push(PTO2TaskSlotState *item) {
+        uint64_t h = head_.load(std::memory_order_relaxed);
+        uint64_t next_h = h + 1;
+        if (next_h - tail_cached_ > mask_) {
+            tail_cached_ = tail_.load(std::memory_order_acquire);
+            if (next_h - tail_cached_ > mask_) {
+                return false;
+            }
+        }
+        buffer_[h & mask_] = item;
+        head_.store(next_h, std::memory_order_release);
+        return true;
+    }
+
+    // Pop up to max_count items (consumer only). Returns actual count.
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t t = tail_.load(std::memory_order_relaxed);
+        uint64_t avail = head_cached_ - t;
+        if (avail < static_cast<uint64_t>(max_count)) {
+            head_cached_ = head_.load(std::memory_order_acquire);
+            avail = head_cached_ - t;
+            if (avail == 0) return 0;
+        }
+        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
+        for (int i = 0; i < count; i++) {
+            out[i] = buffer_[(t + i) & mask_];
+        }
+        tail_.store(t + count, std::memory_order_release);
+        return count;
+    }
+
+    // Approximate size (snapshot, not exact; used for backoff decisions and by full()).
+    uint64_t size() const {
+        uint64_t t = tail_.load(std::memory_order_acquire);  // tail first: head_ only grows and
+        uint64_t h = head_.load(std::memory_order_acquire);  // tail_ <= head_, so h >= t (no wrap)
+        return h - t;
+    }
+
+    // Full ⟺ the producer's next push() would fail: size has reached the
+    // usable capacity (mask_ = capacity - 1, one slot reserved as sentinel).
+    // Used by the wiring-queue deadlock detector to prove the orchestrator is
+    // blocked in push().
+    bool full() const { return size() >= mask_; }
+};
+
+static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
+// =============================================================================
+
+/**
+ * Statistics returned by mixed-task completion processing
+ */
+struct CompletionStats {
+    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
+    int32_t tasks_enqueued;     // Number of consumers that became READY
+    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
+    bool mixed_task_completed;  // True only when this callback completed a mixed task
+};
+
+/**
+ * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). Holds
+ * the arena offsets of every sub-region the scheduler needs plus the
+ * capacities used at layout time (init_from_layout reuses them).
+ */
+struct PTO2SchedulerLayout {
+    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
+    size_t off_dummy_ready_queue_slots;
+    size_t off_early_dispatch_queue_slots;
+    size_t off_dep_pool_entries;
+    size_t off_wiring_spsc_buffer;
+    uint64_t ready_queue_capacity;
+    uint64_t spsc_capacity;
+    int32_t dep_pool_capacity;
+};
+
+/**
+ * Scheduler state structure
+ *
+ * Contains dynamic state updated during task execution.
+ * Separated from shared memory for cache efficiency.
+ * Hot-path methods are defined inline (implicitly inline as member functions).
+ */
+struct PTO2SchedulerState {
+    // Shared memory access
+    PTO2SharedMemoryHeader *sm_header;
+
+    // Per-ring state
+    struct alignas(64) RingSchedState {
+        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
+        PTO2SharedMemoryRingHeader *ring;
+        int32_t last_task_alive;
+        std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
+        alignas(64) PTO2DepListPool dep_pool;
+        // One-shot latch for the wiring-queue deadlock report (thread 0 only):
+        // the drain breaks on dep_pool exhaustion every call while wedged, so
+        // the tier-1 structural diagnostic is emitted once, not per call.
+        bool dep_deadlock_reported = false;
+#if PTO2_PROFILING
+        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
+        alignas(64) std::atomic<int32_t> dep_pool_snapshot_tail;
+        std::atomic<int32_t> dep_pool_snapshot_top;
+#endif
+
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base);
+        void destroy();
+
+#if PTO2_PROFILING
+        void publish_dep_pool_snapshot() {
+            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
+            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
+        }
+
+        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
+            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
+            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
+            if (tail > top) tail = top;
+        }
+#endif
+    } ring_sched_state;
+
+    // Ready queues remain global (scheduling is ring-agnostic)
+    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
+
+    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
+    // the dispatch loop and completed inline -- never goes to AICore.
+    PTO2ReadyQueue dummy_ready_queue;
+
+    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
+    //
+    // Three cache-line regions by writer:
+    //   1. batch_*  / backoff — thread 0 exclusive (local batch buffer)
+    //   2. queue    — SPSC: orchestrator push, thread 0 pop
+    //   3. orch_needs_drain — orchestrator write, thread 0 read
+    struct alignas(64) WiringState {
+        static constexpr uint64_t BATCH_SIZE = 30;
+        static constexpr int BACKOFF_LIMIT = 32;
+
+        // --- Thread 0 exclusive: local batch buffer + backoff ---
+        int batch_count = 0;
+        int batch_index = 0;
+        int backoff_counter = 0;
+        PTO2TaskSlotState *batch[BATCH_SIZE];
+
+        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
+        PTO2SpscQueue queue;
+
+        // --- Orchestrator write, thread 0 read ---
+        alignas(64) std::atomic<bool> orch_needs_drain{false};
+        // Set to 1 only while the orchestrator is actually spinning in
+        // queue.push() (queue full), cleared on a successful push. The wiring
+        // deadlock detector reads this as the producer-blocked observable: it
+        // proves the orchestrator is stuck BEFORE its scope_end, as opposed to
+        // having just filled the queue with its last in-scope push and being
+        // about to call scope_end (which would release the head -> no deadlock).
+        std::atomic<int32_t> producer_blocked{0};
+    } wiring;
+
+    static_assert(
+        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
+    );
+    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
+
+    alignas(64) AsyncWaitList async_wait_list;
+
+    // Statistics (cold path, isolated from hot-path fields)
+#if PTO2_SCHED_PROFILING
+    alignas(64) std::atomic<int64_t> tasks_completed;
+    std::atomic<int64_t> tasks_consumed;
+#endif
+    // =========================================================================
+    // Inline hot-path methods
+    // =========================================================================
+
+    /**
+     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
+     * Called by scheduler thread 0 each loop iteration. Refills the local
+     * batch from the SPSC queue, then calls wire_task() per entry in FIFO
+     * order, stopping early while dep_pool cannot cover a task's fanin.
+     * @param force_drain Pop even if fewer than BATCH_SIZE tasks are queued.
+     * @return Number of tasks wired this call.
+     */
+
+    int drain_wiring_queue(bool force_drain = false) {
+        int wired = 0;
+
+        // Refill local batch buffer when exhausted.
+        if (wiring.batch_index >= wiring.batch_count) {
+            // Backoff: defer pop when queue holds fewer than a full batch,
+            // unless force_drain, orch_needs_drain, or backoff limit reached.
+            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
+                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
+                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
+                    wiring.backoff_counter++;
+                    return 0;
+                }
+            }
+            wiring.backoff_counter = 0;
+            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
+            wiring.batch_index = 0;
+            if (wiring.batch_count == 0) return 0;
+        }
+
+        // Process tasks from local buffer in strict FIFO order.
+        while (wiring.batch_index < wiring.batch_count) {
+            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
+            auto &rss = ring_sched_state;
+            int32_t wfanin = ws->payload->fanin_actual_count;
+
+            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
+                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
+                if (rss.dep_pool.available() < wfanin) {
+#if PTO2_PROFILING
+                    if (is_scope_stats_enabled()) {
+                        rss.publish_dep_pool_snapshot();
+                    }
+#endif
+                    // dep_pool can't reclaim because the reclaim watermark is
+                    // wedged. This runs on the scheduler thread, so unlike
+                    // alloc()'s detector it cannot self-observe that the
+                    // orchestrator is blocked; wiring.producer_blocked is the
+                    // external certificate -- the orchestrator sets it ONLY while
+                    // it is actually spinning in queue.push() (cleared on a
+                    // successful push), so the "just filled the queue then called
+                    // scope_end" case (push succeeded -> flag stays 0) cannot trip
+                    // a false report. With the producer provably stuck in push
+                    // (program-order before its scope_end) AND the head COMPLETED,
+                    // all consumers released, scope still open (only scope_end
+                    // frees it), scope_end can never run -> provable head-of-line
+                    // deadlock. The producer-blocked gate also pins the head:
+                    // scope_end has not run, so the scope-gated head cannot be
+                    // CONSUMED/reset concurrently while we read it.
+                    if (!rss.dep_deadlock_reported && wiring.producer_blocked.load(std::memory_order_acquire) != 0) {
+                        int32_t last_alive = rss.last_task_alive;
+                        PTO2TaskSlotState &h = rss.ring->get_slot_state_by_task_id(last_alive);
+                        // Read the head under its fanout_lock: fanout_count is a
+                        // lock-protected field, and one snapshot keeps the check
+                        // and the report consistent.
+                        h.lock_fanout();
+                        int32_t state = h.task_state.load(std::memory_order_acquire);
+                        uint32_t fc = h.fanout_count;
+                        uint32_t rc = h.fanout_refcount.load(std::memory_order_acquire);
+                        h.unlock_fanout();
+                        bool head_scope_gated = (state == PTO2_TASK_COMPLETED) && (rc == (fc & ~PTO2_FANOUT_SCOPE_BIT));
+                        if (head_scope_gated) {
+                            rss.dep_deadlock_reported = true;
+                            report_wiring_deadlock(rss, wfanin, last_alive, state, fc, rc);
+                            // Latch the shared fatal so both sides exit fast off
+                            // one error code: the scheduler cold-path poll
+                            // (handle_orchestrator_exit) emergency_shutdowns, and
+                            // the orchestrator's push spin breaks out and unwinds.
+                            if (rss.dep_pool.error_code_ptr != nullptr) {
+                                int32_t expected = PTO2_ERROR_NONE;
+                                rss.dep_pool.error_code_ptr->compare_exchange_strong(
+                                    expected, PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_acq_rel
+                                );
+                            }
+                        }
+                    }
+                    break;  // not enough dep_pool space — keep remainder for next call
+                }
+            }
+
+            wiring.batch_index++;
+            wire_task(rss, ws, wfanin);
+            wired++;
+        }
+
+        return wired;
+    }
+
+    // Tier-1 structural diagnostic for a provable wiring-queue deadlock (head
+    // COMPLETED + all consumers released + scope still open, dep_pool exhausted,
+    // orchestrator provably blocked in push). Logging only: latching the error
+    // code is left to the caller. state/fc/rc are the head snapshot the caller
+    // took under fanout_lock, so the report agrees with the check it passed.
+    void report_wiring_deadlock(
+        RingSchedState &rss, int32_t wfanin, int32_t last_alive, int32_t state, uint32_t fc, uint32_t rc
+    ) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Wiring-Queue Deadlock - Dep Pool Exhausted!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Head task %d COMPLETED, all consumers released, scope still open ->", last_alive);
+        LOG_ERROR("only scope_end can free it, but the orchestrator is blocked on a full wiring");
+        LOG_ERROR("queue (in push, before its scope_end). Provable head-of-line deadlock.");
+        LOG_ERROR(
+            "  Head task %d: state=%d, consumers=%u/%u, scope_released=%d", last_alive, state,
+            rc & ~PTO2_FANOUT_SCOPE_BIT, fc & ~PTO2_FANOUT_SCOPE_BIT, (rc & PTO2_FANOUT_SCOPE_BIT) ? 1 : 0
+        );
+        LOG_ERROR("  Dep pool:   used=%d/%d, needed=%d entries", rss.dep_pool.used(), rss.dep_pool.capacity, wfanin);
+        LOG_ERROR("Solution:");
+        LOG_ERROR("  The open scope's fanout exceeds the dep pool. Either split the scope, or");
+        LOG_ERROR("  raise PTO2_RING_DEP_POOL (compile-time PTO2_DEP_LIST_POOL_SIZE).");
+        LOG_ERROR("========================================");
+    }
+
+    // Enqueue a ready slot on the global queue that matches its resource shape.
+    // Slots whose active_mask maps to the DUMMY shape are sent to
+    // dummy_ready_queue; every other shape indexes the per-shape ready_queues[].
+    // Intended for callers without a thread-local ready buffer (e.g. wiring);
+    // push_ready_routed_local is the dispatch-time fast path.
+    void push_ready_routed(PTO2TaskSlotState *slot_state) {
+        const PTO2ResourceShape target = slot_state->active_mask.to_shape();
+        if (target != PTO2ResourceShape::DUMMY) {
+            ready_queues[static_cast<int32_t>(target)].push(slot_state);
+            return;
+        }
+        dummy_ready_queue.push(slot_state);
+    }
+
+    /**
+     * Wire fanout edges for one task: set fanin_count = wfanin + 1, link `ws` into
+     * each not-yet-COMPLETED producer's fanout list (under its fanout_lock), and
+     * push `ws` to a ready queue once fanin_refcount reaches fanin_count.
+     */
+    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
+        PTO2TaskPayload *wp = ws->payload;
+        ws->fanin_count = wfanin + 1;
+
+        if (wfanin != 0) {
+            int32_t early_finished = 0;
+            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
+                producer->lock_fanout();
+                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
+                if (pstate >= PTO2_TASK_COMPLETED) {
+                    early_finished++;
+                } else {
+                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
+                }
+                producer->unlock_fanout();
+            });
+
+            // Seed dispatch_fanin with producers already complete at wiring
+            // time (e.g. buffer-creator tasks that finished before this
+            // consumer entered the graph). Such producers never dispatch at
+            // runtime, so they can never bump dispatch_fanin via the fanout
+            // walk; without this seed the candidate compare
+            // (dispatch_fanin == fanin_actual_count) would be unreachable
+            // whenever any producer is pre-completed. Mirrors the
+            // early_finished seed that fanin_refcount gets via init_rc below.
+            if (early_finished != 0) {
+                wp->dispatch_fanin.fetch_add(early_finished, std::memory_order_acq_rel);
+            }
+
+            int32_t init_rc = early_finished + 1;
+            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
+            if (new_rc >= ws->fanin_count) {
+                push_ready_routed(ws);
+            }
+        } else {
+            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
+            push_ready_routed(ws);
+        }
+
+        ws->dep_pool_mark = rss.dep_pool.top;
+#if PTO2_PROFILING
+        if (is_scope_stats_enabled()) {
+            rss.publish_dep_pool_snapshot();
+        }
+#endif
+    }
+
+    // host_build_graph host-orch: the COMPLETED->CONSUMED flip existed only to
+    // gate execution-time reclaim (now removed) and to serialize against the
+    // orchestrator's concurrent producer claim — which cannot happen here, since
+    // the orchestrator runs to completion on the host before the device
+    // scheduler starts. Nothing reads CONSUMED during device execution
+    // (completion uses completed_tasks_; wait_for_tensor_ready's consumer wait
+    // keys on fanout_refcount), so the flip is gone. fanout_refcount is still
+    // bumped by release_producer / release_producer_scope so the host-side
+    // wait_for_consumers sees consumers retire.
+
+    void release_producer(PTO2TaskSlotState &slot_state) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+    }
+
+    // Scope-end release: sets bit31 (PTO2_FANOUT_SCOPE_BIT) instead of bumping a
+    // consumer ref. Called exactly once per task from on_scope_end. Keeping it a
+    // distinct add lets the deadlock detector tell "waiting only on scope_end"
+    // (head COMPLETED, refcount == fanout_count & ~SCOPE_BIT) apart from
+    // "waiting on a consumer".
+    void release_producer_scope(PTO2TaskSlotState &slot_state) {
+        slot_state.fanout_refcount.fetch_add(PTO2_FANOUT_SCOPE_BIT, std::memory_order_acq_rel);
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        atomic_count += 1;  // fanout_refcount.fetch_add
+    }
+
+    void release_producer_scope(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        slot_state.fanout_refcount.fetch_add(PTO2_FANOUT_SCOPE_BIT, std::memory_order_acq_rel);
+        atomic_count += 1;  // fanout_refcount.fetch_add
+    }
+#endif
+
+    // Speculative early-dispatch release (try_speculative_release). If the
+    // now-ready task was pre-staged (gated on a core), ring its DATA_MAIN_BASE
+    // doorbell RIGHT HERE in the completion path — the moment its last
+    // producer's FIN satisfies fanin — instead of routing it through the ready
+    // queue and waiting for the dispatch pass to pop it. Returns true if the
+    // task is fully handled (caller must NOT push to the ready queue). Returns
+    // false when the caller must route C normally: either it was never
+    // pre-staged, OR it is a SPMD consumer only PARTIALLY pre-staged — the gated
+    // blocks are released by the doorbells rung here, and the remaining
+    // (next_block_idx .. logical_block_num) blocks dispatch normally off the
+    // ready queue. Lock-free claim shared with Hook 1 (the stager): CAS
+    // NONE->DISPATCHED wins => not pre-staged; lose => staged, so flip
+
+
+    // Per-core speculative doorbell table. Hook 1 records each gated core's
+    // (reg_addr, dispatch token) here at stage time; the completion-path release
+    // reads it back for the cores set in the consumer's staged_core_mask. One
+    // global table indexed by core_id (not per-task): gated cores in flight are
+    // bounded by the chip's core count (no two-level pre-dispatch), so this is the
+    // natural capacity and removes the old per-task 3-doorbell cap.
+    struct SpecDoorbell {
+        uint64_t addr{0};
+        uint32_t token{0};
+    };
+    SpecDoorbell spec_doorbell_table[PTO2_SPEC_CORE_MASK_WORDS * 64]{};
+
+    // Cross-thread early-dispatch work queue (a PTO2ReadyQueue MPMC instance,
+    // arena-backed — reserved/wired in pto_runtime2_init alongside the ready queues).
+    // A consumer's SPMD blocks span cores owned by several AICPU threads, but only a
+    // thread RUNNING the consumer's producer discovers it (via the producer's
+    // fanout). When that producer is thread-local (e.g. a 16-block AIV op filling one
+    // thread's cores), the other threads never see the consumer and its blocks on
+    // their cores can't pre-stage. The first claimer pushes the partially-staged
+    // consumer here; every idle thread's early_dispatch pass pops one, stages a range onto
+    // ITS OWN cores (range-claim via next_block_idx), and re-pushes if blocks remain
+    // — exactly mirroring how a partially-dispatched SPMD task is re-pushed to the
+    // ready queue (scheduler_dispatch: pop -> claim -> re-push). A stale/released
+    // entry fails the STAGING check on pop and is dropped; a push that overflows is
+    // logged and the consumer's blocks fall back to normal dispatch.
+    PTO2ReadyQueue early_dispatch_queue;
+
+    static inline void ring_one_doorbell(uint64_t reg_addr, uint32_t token) {
+        const uint64_t wide = token;  // zero-extend so the token can be mirrored into both halves
+        auto *doorbell = reinterpret_cast<volatile uint64_t *>(get_reg_ptr(reg_addr, RegId::DATA_MAIN_BASE));
+        *doorbell = wide | (wide << 32);  // one 64-bit store, high = low = token: releases the gated AICore
+    }
+
+    // auto-chain depth cap: a candidate inherits the flag only while depth < this.
+    static constexpr uint8_t PTO2_SPEC_CHAIN_MAX = 4;
+
+    // Event-driven candidate detection (the dual of fanin_refcount/ready). Call when a
+    // FLAGGED producer `p` DISPATCHES (starts running): walk its fanout and bump each
+    // consumer's dispatch_fanin. A consumer whose dispatch_fanin reaches
+    // fanin_actual_count (= every producer is either flagged-and-dispatched, or was
+    // already complete when the consumer was wired) is an early-dispatch candidate:
+    // CAS NONE->STAGING (exactly-once) and push to early_dispatch_queue for the idle drain to
+    // pre-stage. Once-guarded per producer so an SPMD producer's block-by-block
+    // dispatch propagates once. Replaces the old per-iteration pass-1 PULL scan.
+    void propagate_dispatch_fanin(PTO2TaskSlotState &p) {
+        if (!(p.payload->allow_early_resolve || p.payload->spec_chain_active.load(std::memory_order_acquire)))
+            return;  // only flagged (codegen or inherited) producers propagate
+        if (p.payload->dispatch_propagated.exchange(1, std::memory_order_acq_rel) != 0)
+            return;  // already propagated once
+        uint8_t child_depth = static_cast<uint8_t>(p.payload->spec_chain_depth + 1);
+        p.lock_fanout();
+        PTO2DepListEntry *edge = p.fanout_head;  // snapshot head, walk lock-free (fanout stable by dispatch)
+        p.unlock_fanout();
+        for (; edge != nullptr; edge = edge->next) {
+            PTO2TaskSlotState *c = edge->slot_state;
+            // Compare to fanin_actual_count (the real producer-edge count), NOT
+            // fanin_count: fanin_count = fanin_actual_count + 1 (a self/wiring +1 that
+            // fanin_refcount gets but dispatch_fanin does not). dispatch_fanin starts at
+            // the wiring-time early_finished seed (producers already complete) and is
+            // bumped here by flagged producers; reaching fanin_actual_count means every
+            // producer is flagged-dispatched or was pre-completed.
+            int32_t nf = c->payload->dispatch_fanin.fetch_add(1, std::memory_order_acq_rel) + 1;
+            if (nf != c->payload->fanin_actual_count) continue;
+            if (c->active_mask.requires_sync_start()) continue;  // sync_start can't be block-by-block pre-staged
+            PTO2ResourceShape shape = c->active_mask.to_shape();
+            if (shape != PTO2ResourceShape::AIC && shape != PTO2ResourceShape::AIV && shape != PTO2ResourceShape::MIX)
+                continue;
+            uint8_t expect = PTO2_SPEC_NONE;  // exactly-once: only the CAS winner enqueues
+            if (!c->payload->spec_state.compare_exchange_strong(
+                    expect, PTO2_SPEC_STAGING, std::memory_order_seq_cst, std::memory_order_seq_cst
+                ))
+                continue;
+            if (child_depth < PTO2_SPEC_CHAIN_MAX) {  // auto-chain: C propagates to ITS consumers
+                c->payload->spec_chain_depth = child_depth;
+                c->payload->spec_chain_active.store(1, std::memory_order_release);
+            }
+            early_dispatch_queue.push(c);
+        }
+    }
+
+    // Fixed-capacity scratch list filled during one on_task_complete fanout walk
+    // with the consumers released through the speculative-doorbell path. Their
+    // dispatch_fanin propagation is replayed AFTER the walk, never between doorbells.
+    struct SpecReleaseSink {
+        static constexpr int CAP = 32;
+        PTO2TaskSlotState *items[CAP];
+        int n = 0;
+        inline bool push(PTO2TaskSlotState *s) {
+            const bool has_room = n < CAP;  // full sink => caller must handle `s` itself
+            if (has_room) items[n++] = s;
+            return has_room;
+        }
+    };
+
+    inline bool try_speculative_release(PTO2TaskSlotState &slot_state, SpecReleaseSink *sink = nullptr) {
+        // Never staged => CAS NONE->DISPATCHED wins => dispatch normally.
+        uint8_t expect = PTO2_SPEC_NONE;
+        if (slot_state.payload->spec_state.compare_exchange_strong(
+                expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
+            )) {
+            return false;
+        }
+        // Staged (STAGING). Flip STAGING->DISPATCHED, THEN read the mask. seq_cst
+        // gives a total order with the concurrent stagers, each of which OR-s its
+        // core into the mask and THEN loads spec_state: a stager whose bit lands
+        // before this CAS is read here and rung; a stager whose bit lands after
+        // sees DISPATCHED and rings that core itself (self-ring in
+        // stage_consumer_blocks). Either way every gated core's doorbell fires once
+        // (a double-ring is harmless — the AICore already matched). This replaces
+        // the old transient-STAGING spin: STAGING is now the stable gated state.
+        expect = PTO2_SPEC_STAGING;
+        slot_state.payload->spec_state.compare_exchange_strong(
+            expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
+        );
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
+            uint64_t bits = slot_state.payload->staged_core_mask[w].load(std::memory_order_seq_cst);
+            while (bits != 0) {
+                int core_id = w * 64 + __builtin_ctzll(bits);
+                bits &= bits - 1;
+                ring_one_doorbell(spec_doorbell_table[core_id].addr, spec_doorbell_table[core_id].token);
+            }
+        }
+        // This pre-staged consumer was just released by its doorbell — it starts
+        // running NOW, so propagate dispatch_fanin to ITS consumers (auto-chain,
+        // knob A). Defer it via the sink so it runs after the whole fanout walk:
+        // doing it inline here would delay the doorbells of later consumers in the
+        // same producer's fanout. Fallback to inline if no sink / sink full.
+        if (sink == nullptr || !sink->push(&slot_state)) {
+            propagate_dispatch_fanin(slot_state);
+        }
+        // No explicit removal from the cross-thread queue: a still-queued entry for
+        // this consumer is now DISPATCHED and is dropped when a peer pops it.
+        // Fully pre-staged => skip the ready queue. Partially staged SPMD consumer =>
+        // fall through so the caller pushes C; dispatch resumes from next_block_idx.
+        return slot_state.next_block_idx.load(std::memory_order_seq_cst) >= slot_state.logical_block_num;
+    }
+
+    bool release_fanin_and_check_ready(
+        PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
+    ) {
+        // Atomically increment fanin_refcount and check if all producers are done
+        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
+        // init release, making fanin_count visible — plain load suffices.
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Speculative early-dispatch: pre-staged tasks are released by doorbell
+            // here, skipping the ready-queue round-trip entirely.
+            if (try_speculative_release(slot_state, sink)) return true;
+            // Local-first: try the thread-local buffer before the global queue.
+            // Both are indexed by the slot's shape (active_mask.to_shape()).
+            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
+            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state);
+            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast<int32_t>(shape)].push(&slot_state);
+            }
+            return true;
+        }
+        return false;
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    bool release_fanin_and_check_ready(
+        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
+        PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
+    ) {
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+        atomic_count += 1;  // fanin_refcount.fetch_add
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Speculative early-dispatch: pre-staged tasks are released by doorbell
+            // here, skipping the ready-queue round-trip entirely.
+            if (try_speculative_release(slot_state, sink)) return true;
+            // Local-first: try per-CoreType thread-local buffer before global queue.
+            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
+            // and go straight to dummy_ready_queue; use the profiling-aware push so
+            // atomic_count / push_wait stay consistent with the non-dummy path.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
+            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast<int32_t>(shape)].push(&slot_state, atomic_count, push_wait);
+            }
+            return true;
+        }
+        return false;
+    }
+#endif
+
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+    ) {
+        // Take from the thread-local buffer first (newest entry first); the
+        // global per-shape queue only tops up whatever room is left in `out`.
+        int filled = 0;
+        for (; filled < max_count && local_buf.count > 0; ++filled) {
+            out[filled] = local_buf.slot_states[--local_buf.count];
+        }
+        if (filled >= max_count) return filled;
+        PTO2ReadyQueue &shared = ready_queues[static_cast<int32_t>(shape)];
+        return filled + shared.pop_batch(out + filled, max_count - filled);
+    }
+
+#if PTO2_SCHED_PROFILING
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
+        uint64_t &atomic_count, uint64_t &wait_cycle
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count +=
+                ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
+        }
+        return count;
+    }
+#endif
+
+    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
+        // One scope release per slot, prefetching the next slot (for write) one step ahead.
+#if PTO2_ORCH_PROFILING
+        extern uint64_t g_orch_scope_end_atomic_count;
+#endif
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t idx = 0; idx < count; ++idx) {
+            const int32_t next = idx + 1;
+            if (next < count) __builtin_prefetch(task_slot_states[next], 1, 0);
+#if PTO2_ORCH_PROFILING
+            release_producer_scope(*task_slot_states[idx], g_orch_scope_end_atomic_count);
+#else
+            release_producer_scope(*task_slot_states[idx]);
+#endif
+        }
+    }
+
+    /**
+     * Subtask completion (atomic counter model).
+     * Invoked each time one subtask (AIC, AIV0, or AIV1) finishes on any block.
+     * Bumps completed_subtasks by one and compares the new value against
+     * total_required_subtasks.
+     *
+     * @return true only for the call whose increment reaches the required total.
+     */
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
+        const int16_t before = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return slot_state.total_required_subtasks == (before + 1);
+    }
+
+    /**
+     * Two-stage completion: second stage.
+     * Called exactly once when all subtasks of a task are done (i.e.,
+     * on_subtask_complete returned true). Marks the slot COMPLETED, then walks
+     * the consumer (fanout) list, bumps each consumer's fanin_refcount, pushes
+     * newly-ready ones, and rings doorbells for speculative hits.
+     *
+     * Non-PROFILING returns the consumer-walk count (= edges traversed). The
+     * Resolve swimlane bar reads it to label the bar with how many successors
+     * actually got resolved. PROFILING returns the richer CompletionStats
+     * whose `fanout_edges` carries the same number.
+     */
+#if PTO2_SCHED_PROFILING
+    CompletionStats
+#else
+    uint32_t
+#endif
+    on_task_complete(
+        PTO2TaskSlotState &slot_state,
+#if PTO2_SCHED_PROFILING
+        int thread_idx,
+#endif
+
+        PTO2LocalReadyBuffer *local_bufs = nullptr
+    ) {
+#if PTO2_SCHED_PROFILING
+        CompletionStats stats = {0, 0, 0, true};
+#else
+        uint32_t consumer_walk_count = 0;
+#endif
+#if PTO2_SCHED_PROFILING
+        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
+        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
+        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
+        uint64_t lock_atomics = 0, lock_wait = 0;
+        PTO2_SCHED_CYCLE_START();
+#endif
+
+#if PTO2_SCHED_PROFILING
+        slot_state.lock_fanout(lock_atomics, lock_wait);
+#else
+        slot_state.lock_fanout();
+#endif
+        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
+        slot_state.unlock_fanout();
+
+#if PTO2_SCHED_PROFILING
+        lock_atomics += 2;  // state.store + unlock.store
+        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
+        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
+#endif
+
+        // Fanout: notify consumers. A pre-staged consumer that becomes ready has
+        // its doorbell rung INLINE, the moment its node is reached in the walk,
+        // not batched to after the whole walk — so a flagged consumer near the
+        // front of the list starts immediately and overlaps the remaining
+        // release_fanin work for the other consumers, instead of waiting for the
+        // full O(fanout-degree) walk (~5us for a 50-consumer producer).
+        //
+        // Safe on silicon: the producer's slot is already COMPLETED here — every
+        // SPMD block has FIN'd AND dcci-flushed its output to HBM before
+        // on_task_complete runs — so a released consumer never reads stale
+        // producer output. (Batching used to align the released wave, but pushed
+        // every doorbell to the end of the walk, defeating the whole point of
+        // speculative early-dispatch: minimal producer-end -> consumer-start.)
+#if PTO2_SCHED_PROFILING
+        uint64_t fanout_atomics = 0, push_wait = 0;
+#endif
+        // Doorbells for released pre-staged consumers fire INLINE in the walk
+        // below; their dispatch_fanin propagation is collected here and replayed
+        // after the walk, so no consumer's doorbell waits on a sibling's propagate.
+        SpecReleaseSink rel_sink;
+        while (current != nullptr) {
+            PTO2TaskSlotState &consumer_slot = *current->slot_state;
+#if PTO2_SCHED_PROFILING
+            stats.fanout_edges++;
+            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs, &rel_sink)) {
+                stats.tasks_enqueued++;
+            }
+#else
+            consumer_walk_count++;
+            release_fanin_and_check_ready(consumer_slot, local_bufs, &rel_sink);
+#endif
+            current = current->next;
+        }
+        for (int i = 0; i < rel_sink.n; i++) {
+            propagate_dispatch_fanin(*rel_sink.items[i]);
+        }
+
+#if PTO2_SCHED_PROFILING
+        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
+        g_sched_push_wait_cycle[thread_idx] += push_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
+        return stats;
+#else
+        return consumer_walk_count;
+#endif
+    }
+
+    /**
+     * Cold path: release this task's producers (fanin traversal). Host-orch: no self CONSUMED flip.
+     * Returns payload->fanin_actual_count (the fanin edge count) for profiling.
+     */
+
+#if PTO2_SCHED_PROFILING
+    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
+        PTO2_SCHED_CYCLE_START();
+        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
+        extern uint64_t g_sched_complete_count[];
+        uint64_t fanin_atomics = 0;  // accumulated by release_producer, folded into the per-thread counter below
+#else
+    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
+#endif  // PTO2_SCHED_PROFILING (signature selection)
+        PTO2TaskPayload *payload = slot_state.payload;
+        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
+#if PTO2_SCHED_PROFILING
+            release_producer(*producer_slot_state, fanin_atomics);
+#else
+            release_producer(*producer_slot_state);
+#endif
+        });
+#if PTO2_SCHED_PROFILING
+        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
+#endif
+
+        // host-orch: no self CONSUMED flip — see release_producer. The task's
+        // own fanout_refcount is already complete via the consumer/scope
+        // releases above; nothing reads CONSUMED during device execution.
+#if PTO2_SCHED_PROFILING
+        g_sched_complete_count[thread_idx]++;  // one increment per released task on this thread
+#endif
+        return payload->fanin_actual_count;
+    }
+
+    // === Cold-path API (defined in pto_scheduler.cpp) ===
+
+    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
+    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
+    // Capacities are baked into the returned layout; init_data_from_layout uses
+    // the same values.
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
+
+    // Forget per-region pointers; arena owns the backing memory.
+    void destroy();
+    void print_stats();
+    void print_queues();
+};
+
+// Scheduler cold-path API is declared as PTO2SchedulerState member functions: see reserve_layout(),
+// init_data_from_layout(), wire_arena_pointers(), destroy(), print_stats() and print_queues() in the struct above.
+
+// try_inline_complete_locked: short-circuit NotDeferred completions seen during
+// drain so they don't grow entries[]. Defined here (not in pto_async_wait.h)
+// because PTO2SchedulerState's on_task_complete signature is only known
+// after its full definition above.
+//
+// When the deferred_release_slot_states[] buffer is full, drain it via
+// on_task_release before appending — mirrors the same overflow-drain idiom
+// that scheduler_completion.cpp's inline NotDeferred path uses, so high task
+// rates don't surface as ASYNC_WAIT_OVERFLOW errors.
+inline bool
+AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
+    // on_task_complete's result (CompletionStats / consumer-walk count) is dropped on purpose:
+    // the async-wait drain path has no Resolve swimlane bar to attach it to.
+#if PTO2_SCHED_PROFILING
+    (void)sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
+#else
+    (void)sink.sched->on_task_complete(slot_state, sink.local_bufs);
+#endif
+    // Alias for the caller-owned count of slots queued for deferred release.
+    auto &pending = *sink.deferred_release_count;
+    // Buffer full: release every queued slot (newest first) before queuing this one.
+    const bool must_drain = pending >= sink.deferred_release_capacity;
+    while (must_drain && pending > 0) {
+        PTO2TaskSlotState &queued = *sink.deferred_release_slot_states[--pending];
+#if PTO2_SCHED_PROFILING
+        (void)sink.sched->on_task_release(queued, sink.thread_idx);
+#else
+        sink.sched->on_task_release(queued);
+#endif
+    }
+    sink.deferred_release_slot_states[pending++] = &slot_state;
+    sink.inline_completed++;
+    return true;
+}
+
+template <bool Profiling>
+inline AsyncPollResult AsyncWaitList::poll_and_complete(
+    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+    ,
+    int thread_idx
+#endif
+) {
+    AsyncPollResult result;
+    if (!try_lock()) return result;  // lock not acquired: skip this poll and return the default result
+
+    AsyncWaitList::DrainCompletionSink sink{};  // bundles the caller's buffers for the mailbox drain
+    sink.sched = sched;
+    sink.local_bufs = local_bufs;
+    sink.deferred_release_slot_states = deferred_release_slot_states;
+    sink.deferred_release_count = &deferred_release_count;  // shared with the caller, not copied
+    sink.deferred_release_capacity = deferred_release_capacity;
+#if PTO2_SCHED_PROFILING
+    sink.thread_idx = thread_idx;
+#endif
+
+    int32_t drain_err = PTO2_ERROR_NONE;
+    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
+    if (drain_err != PTO2_ERROR_NONE) {  // drain failure aborts the poll before entries[] is scanned
+        result.error_code = drain_err;
+        unlock();
+        return result;
+    }
+    result.completed += sink.inline_completed;
+
+    for (int32_t i = count - 1; i >= 0; --i) {  // back-to-front, so the swap-remove below never skips an entry
+        AsyncWaitEntry &entry = entries[i];
+        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);  // sentinel: none invalidated yet
+        for (int32_t c = 0; c < entry.condition_count; c++) {
+            CompletionCondition &cond = entry.conditions[c];
+            if (cond.satisfied) continue;  // already retired on an earlier poll
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
+                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
+                if (counter_line != last_invalidated_counter_line) {  // skipped only if the previous counter shared it
+                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
+                    last_invalidated_counter_line = counter_line;
+                }
+            }
+            CompletionPollResult poll = cond.test();
+            if (poll.state == CompletionPollState::FAILED) {  // first failure ends the whole poll
+                result.error_code = poll.error_code;
+                result.failed_slot_state = entry.slot_state;
+                unlock();
+                return result;
+            }
+            if (poll.state == CompletionPollState::READY) {
+                cond.satisfied = true;
+                cond.retire();
+                entry.waiting_completion_count--;
+            }
+        }
+
+        if (entry.normal_done && entry.waiting_completion_count <= 0) {  // both halves done: complete the task
+            // Return value (CompletionStats / consumer-walk count) discarded:
+            // deferred-completion drain has no Resolve swimlane bar attached.
+#if PTO2_SCHED_PROFILING
+            (void)sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs);
+#else
+            (void)sched->on_task_complete(*entry.slot_state, local_bufs);
+#endif
+            // Drain deferred_release in place when the buffer fills — same
+            // overflow-drain idiom used by complete_slot_task's inline path
+            // and by try_inline_complete_locked. Without this, large bursts
+            // of completable wait_list entries in a single poll surfaced as
+            // ASYNC_WAIT_OVERFLOW under the MPSC model.
+            if (deferred_release_count >= deferred_release_capacity) {
+                while (deferred_release_count > 0) {  // newest queued slot is released first
+#if PTO2_SCHED_PROFILING
+                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                }
+            }
+            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
+            result.completed++;
+
+            int32_t last = count - 1;
+            if (i != last) entries[i] = entries[last];  // swap-remove: entries[] order is not preserved
+            count = last;
+        }
+    }
+
+    unlock();
+    return result;
+}
+
+// =============================================================================
+// Scheduler Profiling Data
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+struct PTO2SchedProfilingData {
+    // Sub-phase cycle breakdown within on_task_complete
+    uint64_t lock_cycle;           // lock_fanout + state store + unlock
+    uint64_t fanout_cycle;         // fanout traversal
+    uint64_t fanin_cycle;          // fanin traversal
+    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
+
+    // Wait times
+    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
+    uint64_t push_wait_cycle;  // CAS contention in push()
+    uint64_t pop_wait_cycle;   // CAS contention in pop()
+
+    // Atomic counts per sub-phase
+    uint64_t lock_atomic_count;    // presumably mirrors g_sched_lock_atomic_count — TODO confirm in scheduler_get_profiling
+    uint64_t fanout_atomic_count;  // presumably mirrors g_sched_fanout_atomic_count — TODO confirm
+    uint64_t fanin_atomic_count;   // presumably mirrors g_sched_fanin_atomic_count — TODO confirm
+    uint64_t self_atomic_count;    // NOTE(review): no producer visible in this header — confirm it is still filled
+    uint64_t pop_atomic_count;     // NOTE(review): source counter not visible here — confirm
+
+    int64_t complete_count;  // presumably mirrors g_sched_complete_count (a uint64_t array) — TODO confirm
+};
+
+/**
+ * Get and reset scheduler profiling data for a specific thread.
+ * Returns accumulated profiling data and resets counters.
+ */
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
+#endif
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_cold_path.cpp
new file mode 100644
index 000000000..03e57ebd5
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_cold_path.cpp
@@ -0,0 +1,1105 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <cinttypes>
+#include <cstdio>
+
+#include "common/unified_log.h"
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "aicpu/device_time.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/platform_regs.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/memory_barrier.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// =============================================================================
+// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache)
+// =============================================================================
+
+static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) {
+    // Nothing to latch without a header, or when the code is PTO2_ERROR_NONE.
+    if (header == nullptr || error_code == PTO2_ERROR_NONE) return;
+    // Only the first reporter's code/thread pair is kept: the CAS succeeds solely from PTO2_ERROR_NONE.
+    int32_t no_error = PTO2_ERROR_NONE;
+    const bool first_reporter =
+        header->sched_error_code.compare_exchange_strong(no_error, error_code, std::memory_order_acq_rel);
+    if (first_reporter) header->sched_error_thread.store(thread_idx, std::memory_order_release);
+    // Every reporter is also accumulated in the bitmap; indices outside 0..31 have no bit and are skipped.
+    const bool fits_bitmap = (thread_idx >= 0) && (thread_idx < 32);
+    if (!fits_bitmap) return;
+    header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
+}
+
+LoopAction SchedulerContext::handle_orchestrator_exit(
+    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count
+) {
+    if (completed_.load(std::memory_order_acquire)) {  // completed_ already set: leave the loop
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);  // checked before sched_error_code
+    if (orch_err != PTO2_ERROR_NONE) {
+        LOG_ERROR(
+            "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. "
+            "completed_tasks=%d, total_tasks=%d",
+            thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_
+        );
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {  // only the thread that flips completed_ shuts down
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+    if (sched_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {  // same single-shutdown guard as above
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+
+    bool orch_done = orchestrator_done_;
+    if (!orch_done) return LoopAction::NONE;  // total_tasks_ is not consulted until orchestrator_done_ is set
+
+    task_count = total_tasks_;  // out-param: written only on this path
+    if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) {  // a total <= 0 never ends the loop
+        completed_.store(true, std::memory_order_release);
+        LOG_INFO_V0(
+            "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed),
+            task_count
+        );
+        return LoopAction::BREAK_LOOP;
+    }
+    return LoopAction::NONE;
+}
+
+LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
+    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;  // no transition requested
+    if (!reassigned_.load(std::memory_order_acquire)) {
+        wait_reassign_.fetch_add(1, std::memory_order_release);  // bumped once per call, before spinning
+        while (!reassigned_.load(std::memory_order_acquire)) {  // spin until reassigned_ is observed
+            if (completed_.load(std::memory_order_acquire)) {  // completed_ set while waiting: abandon the wait
+                return LoopAction::BREAK_LOOP;
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+    cores_released = true;  // out-param: set only after reassigned_ has been observed
+    return LoopAction::NONE;
+}
+
+LoopAction
+SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
+    if (completed_.load(std::memory_order_acquire)) {  // completed_ already set: leave the loop
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);  // checked before sched_error_code
+    if (orch_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {  // only the thread that flips completed_ shuts down
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+    if (sched_err != PTO2_ERROR_NONE) {
+        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {  // same single-shutdown guard as above
+            emergency_shutdown(runtime);
+        }
+        return LoopAction::BREAK_LOOP;
+    }
+    return LoopAction::NONE;  // no error latched: the caller keeps looping
+}
+
+// =============================================================================
+// Stall diagnostic log format.
+//
+// Every line is self-contained — when scheduler threads emit concurrently and
+// device_log interleaves their output, each line still carries enough context
+// to identify which thread / iteration / object it belongs to.
+//
+// Prefix on every line:
+//   [STALL thread=N idle_iterations=K] CATEGORY ...
+//
+// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL
+// together, so lines with the same idle_iterations belong to one diagnostic
+// round; grep "idle_iterations=N" groups one round's output.
+//
+// Categories (and which thread emits them):
+//   SUMMARY  — completed / total counts and scan totals               (thread 0 only)
+//   TASK     — one per non-completed task scanned from shared rings   (thread 0 only)
+//              - state=RUNNING: includes running_on=[...] cross-ref
+//              - state=READY:   fanin satisfied but no idle core yet
+//              - state=WAIT:    includes missing_deps=N
+//   CLUSTER  — one per cluster owned by this thread                   (every thread)
+//              - busy slot shows kernel + task_id + cond_reg_state;
+//                ANOMALY plus cond/running/pending tokens is appended when the
+//                COND register is not ack while software still marks the slot busy.
+//
+// Reader workflow:
+//   1. grep SUMMARY                          -> overall completion status
+//   2. grep "idle_iterations=N TASK"         -> stuck RUNNING task and which
+//                                               core/thread it is on
+//   3. grep "idle_iterations=N CLUSTER.*task=<id>" -> cross-check via the
+//                                                     cluster line (or just
+//                                                     read running_on in step 2)
+// =============================================================================
+
+namespace {
+
+// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines.
+// Layout (idle):    coreN(idle)
+// Layout (busy):    coreN(busy kernel=K task=T cond_reg_state=ack)
+// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY cond_tok=C running_tok=R pending_tok=P)
+// core_state may be null (kernel/task/tokens then print as -1); reg_addr_for_cond is handed to read_reg.
+// Healthy busy: COND register reports ack (AICore still executing). fin means
+// AICore wrote completion but AICPU hasn't recycled the running slot yet —
+// either a completion-poll bug or the diagnostic raced the recycle.
+void format_core_status(
+    char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond
+) {
+    if (idle) {
+        snprintf(buf, buf_size, "core%d(idle)", core_id);
+        return;
+    }
+    int32_t kernel = -1;
+    int64_t task_id_raw = -1;
+    if (core_state && core_state->running_slot_state) {
+        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
+        kernel = core_state->running_slot_state->task->kernel_id[subslot];
+        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
+    }
+    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
+    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
+    if (hw_state == TASK_ACK_STATE) {
+        snprintf(
+            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw,
+            "ack"
+        );
+        return;
+    }
+    // Any non-ack state prints as "fin"; a null core_state yields -1 tokens instead of a null dereference.
+    snprintf(
+        buf, buf_size,
+        "core%d(busy kernel=%d task=%" PRId64
+        " cond_reg_state=%s ANOMALY cond_tok=%d running_tok=%d pending_tok=%d)",
+        core_id, kernel, task_id_raw, "fin", EXTRACT_TASK_ID(cond_reg),
+        core_state ? core_state->running_reg_task_id : -1, core_state ? core_state->pending_reg_task_id : -1
+    );
+}
+
+}  // namespace
+
+int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
+    // Scan each scheduler thread's core list; the first thread listing core_id wins.
+    for (int32_t owner = 0; owner < aicpu_thread_num_; ++owner) {
+        auto &owned = core_trackers_[owner];
+        const int32_t *owned_ids = owned.core_ids();
+        for (int32_t k = 0, total = owned.core_num(); k < total; ++k) {
+            if (owned_ids[k] == core_id) return owner;
+        }
+    }
+    return -1;  // core_id is not in any thread's list
+}
+
+bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
+    // True iff any core listed for thread_idx currently has a running_slot_state attached.
+    auto &owned = core_trackers_[thread_idx];
+    const int32_t *owned_ids = owned.core_ids();
+    bool found = false;
+    for (int32_t k = 0, total = owned.core_num(); !found && k < total; ++k) {
+        found = core_exec_states_[owned_ids[k]].running_slot_state != nullptr;
+    }
+    return found;
+}
+
+bool SchedulerContext::no_thread_owns_running_task() const {
+    // Stop at the first thread that reports a running task; the answer is the negation of that hit.
+    bool any_running = false;
+    for (int32_t t = 0; !any_running && t < aicpu_thread_num_; ++t) any_running = self_owns_running_task(t);
+    return !any_running;
+}
+
+void SchedulerContext::log_stall_diagnostics(
+    int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];  // used only by the CLUSTER section at the bottom
+
+    // T0 owns the shared-ring scan; printing it from other threads would
+    // produce identical TASK lines once per scheduler thread.
+    if (thread_idx == 0) {
+        int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {  // r only labels the log lines below
+            PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_state.ring;  // same ring on every pass
+            int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
+            submitted_in_ring += ring_task_count;
+            for (int32_t si = 0; si < ring_task_count; si++) {
+                PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
+                PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
+                int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
+                int32_t fi = slot_state.fanin_count;
+                int32_t kid_aic = slot_state.task->kernel_id[0];
+                int32_t kid_aiv0 = slot_state.task->kernel_id[1];
+                int32_t kid_aiv1 = slot_state.task->kernel_id[2];
+                int64_t task_id = static_cast<int64_t>(slot_state.task->task_id.raw);
+                if (st >= PTO2_TASK_COMPLETED) continue;  // only unfinished tasks are reported
+                // task_state has no intermediate ready/running value — it
+                // stays PENDING until the worker stores COMPLETED. Classify
+                // by the ground truth instead: a slot is RUNNING iff some
+                // core has it as running_slot_state. A task occupies at most
+                // 3 cores (one cluster), all under the same owner thread by
+                // construction of assign_cores_to_threads.
+                char running_on[192] = {0};
+                int32_t owner = -1;
+                int32_t pos = 0;
+                bool is_running = false;
+                for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) {
+                    if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
+                    is_running = true;
+                    if (owner < 0) owner = find_core_owner_thread(cid);  // owner is taken from the first matching core
+                    const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
+                    int32_t written = snprintf(
+                        running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname
+                    );
+                    if (written > 0) pos += written;  // the loop guard (pos + 32) stops appends near the buffer end
+                }
+
+                if (is_running) {
+                    cnt_running++;
+                    if (cnt_running > STALL_DUMP_READY_MAX) continue;  // still counted; print cap shared with READY
+                    LOG_INFO_V9(
+                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                        " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] "
+                        "running_on=[owner_thread=%d cores=[%s]]",
+                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on
+                    );
+                    continue;
+                }
+                if (rc >= fi) {  // every fanin released but no core holds the slot
+                    cnt_ready++;
+                    if (cnt_ready > STALL_DUMP_READY_MAX) continue;  // still counted, no longer printed
+                    LOG_INFO_V9(
+                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                        " state=READY   fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]",
+                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1
+                    );
+                    continue;
+                }
+                cnt_waiting++;
+                if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;  // still counted, no longer printed
+                LOG_INFO_V9(
+                    "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
+                    " state=WAIT    fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d",
+                    thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc
+                );
+            }
+        }
+        int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring;  // fall back to the ring's count
+        int32_t c = completed_tasks_.load(std::memory_order_relaxed);
+        LOG_INFO_V9(
+            "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d "
+            "scan_ready=%d scan_waiting=%d scan_running=%d",
+            thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running
+        );
+    }
+
+    // CLUSTER lines: one per cluster this thread owns.
+    // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
+    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
+    int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+    for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
+        int32_t offset = cli * 3;  // presumably three tracker entries per cluster (aic, aiv0, aiv1) — TODO confirm
+        int32_t aic_id = tracker.get_aic_core_id(offset);
+        int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
+        int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
+        bool aic_idle = tracker.is_aic_core_idle(offset);
+        bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
+        bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
+        int32_t cluster_id = cli * ast + thread_idx;
+        char aic_buf[192], aiv0_buf[192], aiv1_buf[192];  // one formatted status string per core of the cluster
+        format_core_status(
+            aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr
+        );
+        format_core_status(
+            aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id],
+            core_exec_states_[aiv0_id].reg_addr
+        );
+        format_core_status(
+            aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id],
+            core_exec_states_[aiv1_id].reg_addr
+        );
+        LOG_INFO_V9(
+            "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx,
+            idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf
+        );
+    }
+}
+
+void SchedulerContext::log_shutdown_stall_snapshot(
+    int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+) {
+    LOG_WARN(
+        "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] "
+        "dumping all scheduler threads before emergency shutdown",
+        trigger_thread_idx, trigger_idle_iterations
+    );
+    int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;  // same fallback as log_stall_diagnostics
+    if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) {  // keep the loop below within [0, MAX_AICPU_THREADS]
+        LOG_ERROR(
+            "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx,
+            thread_count, MAX_AICPU_THREADS
+        );
+        thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
+    }
+    for (int32_t t = 0; t < thread_count; t++) {  // the triggering thread dumps on behalf of every thread
+        log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count);
+    }
+}
+
+int32_t SchedulerContext::handle_timeout_exit(
+    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+    int32_t last_progress_count
+#if PTO2_PROFILING
+    ,
+    uint64_t sched_start_ts
+#endif
+) {
+    LOG_ERROR(
+        "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations,
+        idle_iterations
+    );
+    latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);  // runs for every timing-out thread
+    if (!completed_.exchange(true, std::memory_order_acq_rel)) {  // first thread to flip completed_ does the teardown
+        log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count);
+#if PTO2_PROFILING
+        // Capture the in-flight kernels' partial output before signalling the
+        // cores to exit, so the dump reflects the live stuck state.
+        if (is_dump_args_enabled()) {
+            dump_running_task_outputs<PTO2_SUBTASK_SLOT_COUNT>(
+                thread_idx, cores_total_num_,
+                [this](int32_t cid) {
+                    return core_exec_states_[cid].running_slot_state;
+                },
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+        emergency_shutdown(runtime);  // called after the snapshot and the optional dump
+    }
+#if PTO2_PROFILING
+    uint64_t sched_timeout_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx,
+        static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts),
+        cycles_to_us(sched_timeout_ts - sched_start_ts)
+    );
+#endif
+    return -PTO2_ERROR_SCHEDULER_TIMEOUT;  // the negated error code is the return value
+}
+
+#if PTO2_PROFILING  // log_l2_swimlane_summary exists only in profiling builds
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];  // counters accumulated for this thread
+    uint64_t sched_end_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
+        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
+        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
+    );
+
+    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
+                           l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
+    if (sched_total == 0) sched_total = 1;  // used as a divisor below; avoid dividing by zero
+
+#if PTO2_SCHED_PROFILING
+    {
+        PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
+        uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
+        uint64_t complete_poll =
+            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
+                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
+                0;  // residual of the complete phase, clamped so the unsigned subtraction cannot wrap
+        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
+                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
+                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
+                                      l2_swimlane.sched_dispatch_setup_cycle) :
+                                     0;  // residual of the dispatch phase, clamped the same way
+
+        LOG_INFO_V9(
+            "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
+            cycles_to_us(sched_total), cur_thread_completed
+        );
+
+        // fanout / fanin per-thread aggregates live in
+        // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
+        // × core_to_thread).
+        LOG_INFO_V9(
+            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
+            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
+        );
+        // Sub-phase percentages below are relative to the complete phase (divisor clamped to 1).
+        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
+        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
+                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
+                                           0;
+        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
+                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
+                                       0.0;
+        LOG_INFO_V9(
+            "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
+            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
+            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
+            complete_hit_rate
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_lock     : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
+            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
+            static_cast<uint64_t>(sp.lock_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_fanout   : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
+            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
+            static_cast<uint64_t>(sp.fanout_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_fanin    : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
+            static_cast<uint64_t>(sp.fanin_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     otc_self     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
+            static_cast<uint64_t>(sp.self_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
+            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
+            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
+        );
+
+        LOG_INFO_V9(
+            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
+            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
+        );
+        // Sub-phase percentages below are relative to the dispatch phase (divisor clamped to 1).
+        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
+        LOG_INFO_V9(
+            "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
+            dispatch_poll * 100.0 / d_parent
+        );
+        LOG_INFO_V9(
+            "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
+            static_cast<uint64_t>(sp.pop_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
+            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
+            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
+        );
+
+#if PTO2_SCHED_PROFILING  // NOTE(review): always true here, we are already inside the same #if
+        LOG_INFO_V9(
+            "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
+            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
+            l2_swimlane.phase_wiring_count
+        );
+#else  // NOTE(review): never compiled because of the enclosing #if PTO2_SCHED_PROFILING
+        LOG_INFO_V9(
+            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
+            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
+        );
+#endif
+
+        LOG_INFO_V9(
+            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
+            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
+        );
+
+        if (cur_thread_completed > 0) {  // skip the per-task average when nothing completed
+            LOG_INFO_V9(
+                "Thread %d:   avg/complete   : %.3fus", thread_idx,
+                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
+            );
+        }
+    }
+#endif  // PTO2_SCHED_PROFILING
+    LOG_INFO_V9(
+        "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
+        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
+    );
+}
+#endif  // PTO2_PROFILING
+
+// =============================================================================
+// Shutdown: deinit AICore regs for this thread's cores (after PMU finalize, if enabled).
+// Threads owning no cores (core_num() == 0, e.g. orchestrator threads) return 0 at once;
+// otherwise returns -1 if any deinit failed. platform_deinit_aicore_regs is idempotent.
+// =============================================================================
+int32_t SchedulerContext::shutdown(int32_t thread_idx) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    const int32_t *owned_cores = tracker.core_ids();
+    const int32_t owned_count = tracker.core_num();
+    if (owned_count == 0) return 0;
+
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) pmu_aicpu_finalize(owned_cores, owned_count);
+#endif
+
+    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, owned_count);
+    int32_t status = 0;
+    for (int32_t idx = 0; idx < owned_count; ++idx) {
+        const int32_t core_id = owned_cores[idx];
+        const uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
+        if (reg_addr == 0) {
+            // Nothing to deinit for this core; it is reported but does not change the result.
+            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
+            continue;
+        }
+        // A failed deinit means the AICore is unresponsive: record it and keep going.
+        if (platform_deinit_aicore_regs(reg_addr) != 0) {
+            LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
+            status = -1;
+        }
+    }
+    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
+    return status;
+}
+
+// =============================================================================
+// Handshake with all AICore workers; discover core type and reg address. Returns 0 on success, -1 on failure.
+// =============================================================================
+int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
+    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+    cores_total_num_ = runtime->worker_count;
+
+    // Validate cores_total_num_ before using as array index (a negative count is rejected too)
+    if (cores_total_num_ <= 0 || cores_total_num_ > RUNTIME_MAX_WORKER) {
+        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER);
+        return -1;
+    }
+
+    aic_count_ = 0;
+    aiv_count_ = 0;
+
+    LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
+
+    // Step 1: Write per-core payload addresses and send handshake signal.
+    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
+    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
+        OUT_OF_ORDER_STORE_BARRIER();
+        all_handshakes[i].aicpu_ready = 1;
+    }
+    OUT_OF_ORDER_STORE_BARRIER();
+
+    // Get platform physical cores count for validation
+    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
+
+    // Step 2: Wait for all cores to respond, collect core type and register addresses
+    bool handshake_failed = false;
+    for (int32_t i = 0; i < cores_total_num_; i++) {
+        Handshake *hank = &all_handshakes[i];
+        // Spins without a timeout until this core publishes aicore_regs_ready.
+        while (hank->aicore_regs_ready == 0) {
+            SPIN_WAIT_HINT();
+        }
+
+        uint32_t physical_core_id = hank->physical_core_id;
+
+        if (physical_core_id >= max_physical_cores_count) {
+            LOG_ERROR(
+                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
+                max_physical_cores_count
+            );
+            handshake_failed = true;
+            continue;  // no reg_addr is recorded for this core; remaining cores are still processed
+        }
+
+        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+        uint64_t reg_addr = regs[physical_core_id];
+
+        // Initialize AICore registers after discovery (first round)
+        platform_init_aicore_regs(reg_addr);
+        OUT_OF_ORDER_STORE_BARRIER();
+        hank->aicpu_regs_ready = 1;
+
+        OUT_OF_ORDER_STORE_BARRIER();
+        // Spins without a timeout until this core reports aicore_done.
+        while (hank->aicore_done == 0) {
+            SPIN_WAIT_HINT();
+        }
+
+        CoreType type = hank->core_type;
+
+        core_exec_states_[i].reg_addr = reg_addr;
+        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+
+#if PTO2_PROFILING
+        // Record physical_core_id for PMU init later (CoreExecState has no room
+        // for this field under PTO2_PROFILING).
+        physical_core_ids_[i] = physical_core_id;
+#endif
+#if !PTO2_PROFILING
+        core_exec_states_[i].worker_id = i;
+        core_exec_states_[i].physical_core_id = physical_core_id;
+        core_exec_states_[i].core_type = type;
+#endif
+
+        // reg_addr is uint64_t: print it with PRIx64 rather than a platform-dependent %lx.
+        if (type == CoreType::AIC) {
+            aic_worker_ids_[aic_count_++] = i;
+            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%" PRIx64 "", i, physical_core_id, reg_addr);
+        } else {
+            aiv_worker_ids_[aiv_count_++] = i;
+            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%" PRIx64 "", i, physical_core_id, reg_addr);
+        }
+    }
+
+    if (handshake_failed) {
+        emergency_shutdown(runtime);
+        return -1;
+    }
+
+    LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
+    return 0;
+}
+
+// =============================================================================
+// Assign discovered cores to scheduler threads (cluster-aligned round-robin).
+// =============================================================================
+bool SchedulerContext::assign_cores_to_threads() {
+    // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
+    // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
+    active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+    int32_t cluster_count = aic_count_;
+    // The thread count is used as divisor/modulus and indexes MAX_AICPU_THREADS-sized arrays below.
+    if (active_sched_threads_ <= 0 || active_sched_threads_ > MAX_AICPU_THREADS) {
+        LOG_ERROR("Invalid scheduler thread count %d (expected 1-%d)", active_sched_threads_, static_cast<int>(MAX_AICPU_THREADS));
+        return false;
+    }
+
+    // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
+    int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
+    int32_t thread_cores_num = max_clusters_per_thread * 3;
+    if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) {
+        const int max_cores = static_cast<int>(CoreTracker::MAX_CORE_PER_THREAD);
+        LOG_ERROR("Can't assign %d cores to one scheduler thread (max %d)", thread_cores_num, max_cores);
+        return false;
+    }
+
+    LOG_INFO_V0(
+        "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count,
+        active_sched_threads_, aic_count_, aiv_count_
+    );
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Count clusters per thread first (round-robin may distribute unevenly)
+    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        clusters_per_thread[ci % active_sched_threads_]++;
+    }
+    for (int32_t i = 0; i < active_sched_threads_; i++) {
+        core_trackers_[i].init(clusters_per_thread[i]);
+    }
+    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+    // NOTE(review): reads aiv_worker_ids_[2*ci .. 2*ci+1]; aiv_count_ >= 2 * aic_count_ is assumed, not checked here.
+    for (int32_t ci = 0; ci < cluster_count; ci++) {
+        int32_t t = ci % active_sched_threads_;
+        int32_t aic_wid = aic_worker_ids_[ci];
+        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+        core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
+        LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid);
+    }
+
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        LOG_INFO_V0(
+            "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(),
+            core_trackers_[t].get_cluster_count()
+        );
+    }
+
+    LOG_INFO_V0(
+        "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num
+    );
+    return true;
+}
+
+// =============================================================================
+// Reassign all cores across all threads (sched + orchestrator) after orchestration,
+// carrying over which cores were running; afterwards every AICPU thread schedules.
+// =============================================================================
+void SchedulerContext::reassign_cores_for_all_threads() {
+    LOG_INFO_V0(
+        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
+    );
+
+    // Snapshot the running worker ids before the trackers are re-initialized below.
+    bool was_running[RUNTIME_MAX_WORKER] = {};
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        CoreTracker &tracker = core_trackers_[t];
+        auto running_set = tracker.get_all_running_cores();
+        for (int32_t off = running_set.pop_first(); off >= 0; off = running_set.pop_first()) {
+            was_running[tracker.get_core_id_by_offset(off)] = true;
+        }
+    }
+
+    // Round-robin share of clusters for each thread.
+    const int32_t total_clusters = aic_count_;
+    int32_t share[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < total_clusters; ci++) {
+        share[ci % aicpu_thread_num_]++;
+    }
+
+    // Re-init every tracker with its share.
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        core_trackers_[t].init(share[t]);
+    }
+
+    // Hand out clusters in the same round-robin order and re-mark running cores.
+    int32_t next_slot[MAX_AICPU_THREADS] = {};
+    for (int32_t ci = 0; ci < total_clusters; ci++) {
+        const int32_t t = ci % aicpu_thread_num_;
+        CoreTracker &tracker = core_trackers_[t];
+
+        // Cluster ci is AIC #ci together with AIVs #2ci and #2ci+1.
+        const int32_t aic_wid = aic_worker_ids_[ci];
+        const int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+        const int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+        const int32_t members[3] = {aic_wid, aiv0_wid, aiv1_wid};
+
+        const int32_t slot = next_slot[t]++;
+        tracker.set_cluster(slot, aic_wid, aiv0_wid, aiv1_wid);
+
+        // init() marks all idle; toggle cores that were running and restore pending_occupied.
+        // Tracker offsets are slot * 3 + k with k = 0 (AIC), 1 (AIV0), 2 (AIV1).
+        for (int32_t k = 0; k < 3; k++) {
+            if (!was_running[members[k]]) continue;
+            const int32_t offset = slot * 3 + k;
+            tracker.change_core_state(offset);
+            tracker.set_pending_occupied(offset);
+        }
+    }
+
+    // Log final distribution
+    LOG_INFO_V0("Core reassignment complete:");
+    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
+        CoreTracker &tracker = core_trackers_[t];
+        const int32_t aic_busy = tracker.get_running_count<CoreType::AIC>();
+        const int32_t aiv_busy = tracker.get_running_count<CoreType::AIV>();
+        LOG_INFO_V0(
+            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, tracker.core_num(),
+            tracker.get_cluster_count(), aic_busy, aiv_busy
+        );
+    }
+    active_sched_threads_ = aicpu_thread_num_;
+}
+
+// =============================================================================
+// Emergency shutdown: set aicpu_regs_ready on every core in [0, cores_total_num_)
+// and deinit the register block of each core with a recorded reg_addr. Idempotent.
+// =============================================================================
+void SchedulerContext::emergency_shutdown(Runtime *runtime) {
+    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
+    Handshake *handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+    int32_t unresponsive = 0;
+    for (int32_t core = 0; core < cores_total_num_; ++core) {
+        OUT_OF_ORDER_STORE_BARRIER();
+        handshakes[core].aicpu_regs_ready = 1;
+        // reg_addr == 0 means no register address was recorded for this core: nothing to deinit.
+        const uint64_t reg_addr = core_exec_states_[core].reg_addr;
+        if (reg_addr == 0) continue;
+        if (platform_deinit_aicore_regs(reg_addr) != 0) {
+            ++unresponsive;
+        }
+    }
+    if (unresponsive > 0) {
+        LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", unresponsive);
+    }
+    LOG_WARN("Emergency shutdown complete");
+}
+
+// =============================================================================
+// Lifecycle: init / deinit. init() returns 0 on success, non-zero if handshake or core assignment fails.
+// =============================================================================
+int32_t SchedulerContext::init(
+    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
+) {
+    always_assert(runtime != nullptr);
+
+    // Zero all per-core execution state before handshake
+    memset(core_exec_states_, 0, sizeof(core_exec_states_));
+
+    // Wire thread/transition configuration that handshake/assign need to read.
+    aicpu_thread_num_ = aicpu_thread_num;
+    sched_thread_num_ = sched_thread_num;
+    orch_to_sched_ = orch_to_sched;
+    regs_ = regs_base;
+
+#if PTO2_PROFILING
+    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
+    // header — must be called BEFORE caching the level, otherwise the cached
+    // value would still be 0 (only the binary enable bit has been seeded by
+    // kernel.cpp at this point). Reset the cached level on disabled runs so a
+    // prior enabled launch's level can't leak into the phase-record gates in
+    // scheduler_dispatch.
+    if (is_l2_swimlane_enabled()) {
+        l2_swimlane_aicpu_init(runtime->worker_count);
+        l2_swimlane_level_ = get_l2_swimlane_level();
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            // Sched-phase pool count must match the dump_args_init thread count
+            // below. This block runs before assign_cores_to_threads, so the
+            // active_sched_threads_ member isn't set yet — recompute the same
+            // normalization locally: sched_thread_num_ <= 0 means "use all AICPU
+            // threads as scheduler threads" (see assign_cores_to_threads'
+            // active_sched_threads_). Without it, init_phase would prime zero
+            // sched pools and all sched_phase emits would silently drop.
+            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
+            // Orchestration is always single-threaded, so orch-phase is one pool
+            // (ordinal 0) in both modes — see record_orch_phase.
+            const int orch_phase_threads = 1;
+            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads);
+        }
+    } else {
+        l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
+    }
+#endif
+
+    // Discover cores and assign to scheduler threads.
+    int32_t rc = handshake_all_cores(runtime);
+    if (rc != 0) {
+        LOG_ERROR("handshake_all_cores failed");
+        return rc;
+    }
+    if (!assign_cores_to_threads()) {
+        return -1;
+    }
+
+    // Profiling-subsystem buffer/state init: single-threaded cold path, so the
+    // "do it once" guarantee is structural (no CAS needed). Runs after
+    // handshake_all_cores / assign_cores_to_threads because pmu_aicpu_init needs
+    // physical_core_ids_ / cores_total_num_. Mirrors the l2_swimlane_aicpu_init
+    // convention above; the per-thread *_set_orch_thread_idx setters stay on the
+    // orchestrator thread (see aicpu_executor.cpp).
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : active_sched_threads_);
+    }
+    if (is_pmu_enabled()) {
+        pmu_aicpu_init(physical_core_ids_, cores_total_num_);
+        LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
+    }
+#endif
+    // dep_gen is host-driven (SubmitTrace) and gated independently of
+    // PTO2_PROFILING. init() only pops the initial buffer from instance 0's
+    // free_queue; the orchestrator thread still records its idx via
+    // dep_gen_aicpu_set_orch_thread_idx() before the first record_submit.
+    if (is_dep_gen_enabled()) {
+        dep_gen_aicpu_init();
+    }
+
+    // Initialize task counters. Task count comes from PTO2 shared memory.
+    if (runtime->get_gm_sm_ptr()) {
+        auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
+        // Read at one-time boot init, before the SM is reset for the run, so a
+        // ring not yet written holds uninitialized memory (0xbe... under ASAN's
+        // malloc-fill). Sum in int64 and only count rings whose value is a
+        // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold
+        // more than the scope cap. This rejects any garbage pattern (negative
+        // or positive), so uninitialized rings contribute 0 (the correct boot
+        // count) while valid counts still add up, with no signed overflow.
+        int64_t pto2_count = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            int32_t ring_tasks = header->ring.fc.current_task_index.load(std::memory_order_acquire);
+            if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
+        }
+        total_tasks_ = static_cast<int32_t>(pto2_count);
+    } else {
+        total_tasks_ = 0;
+    }
+    completed_tasks_.store(0, std::memory_order_release);
+
+    // Device orchestration: the orchestrator thread flips this when the graph is built.
+    orchestrator_done_ = false;
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Initialize per-core GlobalContext (sub_block_id) by cluster position: AIV0 -> 0, AIV1 -> 1 (set once).
+    // Walk active_sched_threads_ (the trackers assign_cores_to_threads filled); sched_thread_num_ may be <= 0.
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        CoreTracker &tracker = core_trackers_[t];
+        for (int32_t c = 0; c < tracker.get_cluster_count(); c++) {
+            int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
+            auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
+            auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
+            payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
+            payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
+            payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
+            payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
+        }
+    }
+
+    func_id_to_addr_ = runtime->func_id_to_addr_;
+
+    return 0;
+}
+
+void SchedulerContext::deinit() {  // clears the state that init() / bind_runtime() / a run populated
+    // Reset all per-core execution state (value-init, then mark both task-id slots invalid)
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i] = {};
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Reset sync-start drain coordination — a previous run that aborted mid-drain
+    // would otherwise leave dirty pending/elected/ack state for the next reuse.
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+
+    // Reset task counters and orchestrator state
+    completed_tasks_.store(0, std::memory_order_release);
+    total_tasks_ = 0;
+    orchestrator_done_ = false;
+
+    // Reset core transition state (the flags used by on_orchestration_done's handshake)
+    transition_requested_.store(false, std::memory_order_release);
+    wait_reassign_.store(0, std::memory_order_release);
+    reassigned_.store(false, std::memory_order_release);
+    completed_.store(false, std::memory_order_release);
+
+    // Reset core discovery and assignment state
+    aic_count_ = 0;
+    aiv_count_ = 0;
+    cores_total_num_ = 0;
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+    active_sched_threads_ = 0;
+    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
+        core_trackers_[t] = CoreTracker{};
+    }
+
+    regs_ = 0;
+    sched_ = nullptr;  // bind_runtime() must be called again before sched_ / rt_ are used
+    rt_ = nullptr;
+    func_id_to_addr_ = nullptr;
+}
+
+void SchedulerContext::bind_runtime(PTO2Runtime *rt) {
+    sched_ = &rt->scheduler;  // points at the scheduler member of the bound runtime
+    rt_ = rt;                 // kept alongside it; deinit() resets both to nullptr
+}
+
+// =============================================================================
+// Post-orchestration bookkeeping. Runs on the orchestrator thread once the
+// build phase finishes; folds inline-completed tasks, flips orchestrator_done_,
+// and drives the orchestrator → scheduler core transition (or fatal shutdown).
+// =============================================================================
+void SchedulerContext::on_orchestration_done(
+    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
+) {
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
+        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
+        // The orchestrator has no scheduler-phase pool of its own — those belong
+        // to the scheduler threads and are flushed in scheduler_dispatch.
+        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
+    }
+#endif
+
+    total_tasks_ = total_tasks;  // replaces the boot-time count computed in init()
+
+    // Fold tasks completed inline during orchestration
+    int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
+    if (inline_completed > 0) {
+        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+#if PTO2_SCHED_PROFILING
+        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
+#endif
+    }
+    orchestrator_done_ = true;
+
+    // Check for fatal error from orchestration; if so, shut down immediately.
+    int32_t orch_err = 0;
+    if (sched_->sm_header) {  // sched_ is dereferenced unchecked; it is set by bind_runtime()
+        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+    }
+    if (orch_err != PTO2_ERROR_NONE) {
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {  // only the first setter shuts down
+            emergency_shutdown(runtime);
+        }
+    }
+
+    // Skip core transition on fatal error — cores already shut down above.
+    if (completed_.load(std::memory_order_acquire)) {
+        // Signal transition to unblock scheduler threads waiting at core transition
+        transition_requested_.store(true, std::memory_order_release);
+        reassigned_.store(true, std::memory_order_release);
+    } else if (orch_to_sched_) {
+        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
+        transition_requested_.store(true, std::memory_order_release);
+
+        // Wait until wait_reassign_ equals sched_thread_num_ (scheduler acks); give up once completed_ is set.
+        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
+            if (completed_.load(std::memory_order_acquire)) {
+                break;
+            }
+            SPIN_WAIT_HINT();
+        }
+        if (!completed_.load(std::memory_order_acquire)) {  // re-check: the wait may have ended via completed_
+            reassign_cores_for_all_threads();
+            reassigned_.store(true, std::memory_order_release);
+        }
+    }
+
+#if PTO2_PROFILING
+    // Write core-to-thread mapping AFTER reassignment so the profiling data
+    // reflects the final distribution (all active_sched_threads_, including
+    // former orchestrator threads when orch_to_sched_ is enabled).
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
+        for (int32_t t = 0; t < active_sched_threads_; t++) {
+            l2_swimlane_aicpu_write_core_assignments_for_thread(
+                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
+            );
+        }
+    }
+#endif
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_completion.cpp
new file mode 100644
index 000000000..774589865
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_completion.cpp
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <algorithm>
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+// =============================================================================
+// Dual-slot state machine helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;  // Max batched releases before complete_slot_task flushes.
+}
+
+// Side-effect-free classifier: turns one COND register sample into a SlotTransition.
+SlotTransition SchedulerContext::decide_slot_transition(
+    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated
+) {
+    SlotTransition result;
+    const bool has_pending = (pending_id != AICPU_TASK_INVALID);
+    const bool saw_fin = (reg_state == TASK_FIN_STATE);
+
+    // Cases 1 and 2: the register names the pending task. Execution on a core
+    // is serial, so any pending event implies the running task is finished;
+    // both slots are released either way.
+    if (has_pending && reg_task_id == pending_id) {
+        result.matched = true;
+        result.running_done = true;
+        result.running_freed = true;
+        result.pending_freed = true;
+        if (saw_fin) {
+            result.pending_done = true;  // Case 1: pending FIN (Case 2, pending ACK, leaves it unset)
+        }
+        return result;
+    }
+    if (reg_task_id != running_id) {
+        return result;  // Register names neither slot: nothing to apply this poll.
+    }
+    if (!saw_fin) {
+        // Case 4: running ACK -- only pending_freed (slot now hardware-latched).
+        result.matched = true;
+        result.pending_freed = true;
+        return result;
+    }
+    // Case 3.2: running FIN with no pending task -> the core goes idle.
+    // Case 3.3: running FIN with a speculative GATED pending task. A gated task
+    // spins on its doorbell and never acks until its producer completes, and that
+    // producer's completion depends on collecting THIS running FIN, so waiting for
+    // the pending ack would deadlock. Retire running now; pending is NOT freed
+    // (it promotes rather than retires) so the bitmap keeps the core off-limits.
+    // Case 3.1 (non-gated pending) stays unmatched: a transient state that a
+    // later pending ACK/FIN resolves via Case 1/2, completing running implicitly.
+    if (!has_pending || pending_gated) {
+        result.matched = true;
+        result.running_done = true;
+        result.running_freed = true;
+    }
+    return result;
+}
+
+// Retire one finished subtask slot: forward deferred conditions, count the subtask, resolve/release, profile.
+void SchedulerContext::complete_slot_task(
+    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
+    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+    ,
+    uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#else
+    (void)hank;
+#endif
+    // MPSC fast-path is opt-in per task: only tasks with at least one subtask
+    // that registered a deferred condition route through the mailbox; all other
+    // tasks complete inline on this thread (pre-MPSC behavior, which keeps the
+    // common case parallel across scheduler threads instead of serializing on
+    // the single consumer). slot_state.any_subtask_deferred is the discriminator:
+    // set (release) before on_subtask_complete and read (acquire) after it.
+    // NOTE(review): mailbox is nullptr when rt_ is nullptr, yet both push loops
+    // below dereference it unconditionally -- confirm deferral implies rt_ != nullptr.
+    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
+    bool defer_completion_to_consumer = false;
+
+    if (slot_state.payload != nullptr) {
+        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
+        int32_t slab_err = deferred_slab->error_code;
+        if (slab_err != PTO2_ERROR_NONE) {
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;  // Fatal: leave before any subtask accounting.
+        }
+
+        uint32_t cond_count = deferred_slab->count;
+        if (cond_count > MAX_COMPLETIONS_PER_TASK) {  // Over the per-task limit: latch a registration failure.
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        if (cond_count > 0) {
+            // Publish "this task is deferred" before on_subtask_complete so the
+            // acq_rel fetch_add inside on_subtask_complete makes the flag
+            // visible to whichever subtask sees task_complete=true (which may
+            // be this thread or a later one).
+            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
+
+            const PTO2TaskId token = slot_state.task->task_id;
+            for (uint32_t i = 0; i < cond_count; ++i) {
+                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
+                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                    SPIN_WAIT_HINT();
+                }
+            }
+        }
+    }
+
+    bool task_complete = sched_->on_subtask_complete(slot_state);
+
+#if PTO2_PROFILING
+    // Sub-block retire that did not finish the slot: record it so the poll
+    // iteration becomes visible on the scheduler lane (the SPMD harvest tail).
+    if (!task_complete && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane.phase_subretire_count++;
+    }
+#endif
+
+    if (task_complete && slot_state.payload != nullptr &&
+        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
+        // Some subtask of this task registered conditions; finish the
+        // registration by handing the slot_state off to the consumer.
+        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state))) {
+            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+            SPIN_WAIT_HINT();
+        }
+        defer_completion_to_consumer = true;
+    }
+
+    if (task_complete && !defer_completion_to_consumer) {
+#if PTO2_PROFILING
+        if (is_dump_args_enabled()) {
+            dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+#if PTO2_PROFILING
+        // Time Resolve (walk the consumer list, decrement each consumer's
+        // fanin, push the newly-ready ones, ring doorbells for speculative
+        // hits) so it renders as a child bar nested inside this iteration's
+        // Complete bar. The 1 µs floor below filters out the ~88% of tasks
+        // with 1-2 consumers (~500 ns Resolve) so only the long broadcast /
+        // reduction walks stand out on the lane.
+        uint64_t resolve_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+        // [[maybe_unused]] silences -Werror=unused-but-set-variable on the
+        // profiling-flags-smoke build path where PTO2_PROFILING is OFF and
+        // the Resolve emit below is excluded.
+        [[maybe_unused]] uint32_t consumers_resolved = 0;
+#if PTO2_SCHED_PROFILING
+        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
+        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
+        // by the otc_* log lines). It returns CompletionStats whose
+        // `fanout_edges` is the consumer-walk count.
+        consumers_resolved = sched_->on_task_complete(slot_state, thread_idx, local_bufs).fanout_edges;
+#else
+        consumers_resolved = sched_->on_task_complete(slot_state, local_bufs);
+#endif
+#if PTO2_PROFILING
+        if (resolve_t0 != 0) {
+            uint64_t resolve_t1 = get_sys_cnt_aicpu();
+            // Filter: drop Resolve bars under 1 µs so the lane shows only
+            // resolves that did meaningful work (high consumer counts or
+            // doorbells). 50 cycles @ 50 MHz = 1 µs (PLATFORM_PROF_SYS_CNT_FREQ
+            // is the device sys-cnt frequency).
+            constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
+            if (resolve_t1 - resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Resolve, resolve_t0, resolve_t1, l2_swimlane.sched_loop_count,
+                    consumers_resolved
+                );
+            }
+        }
+        l2_swimlane.phase_complete_count++;
+#endif
+        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        } else {  // Batch full: release everything queued, then queue this slot.
+            LOG_INFO_V9("Thread %d: release", thread_idx);
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                // SCHED_PROFILING variant takes thread_idx for the per-thread
+                // atomic counter side-effects. The return value is unused.
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        }
+        completed_this_turn++;  // Only whole-task, non-deferred completions are counted here.
+    }
+
+#if PTO2_PROFILING
+    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
+    // {start, end, task_token_raw}, host resolves func_id/core_type from
+    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
+    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
+    // timestamps via complete_task. Bypassing here saves the per-completion
+    // hot-path cost (counter inc + ring lookup + record store + wmb + buffer
+    // rotation bookkeeping) for runs that only want AICore timing.
+    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+#if PTO2_SCHED_PROFILING
+        uint64_t t_perf_start = get_sys_cnt_aicpu();
+#endif
+
+        if (l2_swimlane_aicpu_complete_task(
+                core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), dispatch_ts, finish_ts
+            ) != 0) {
+            LOG_ERROR(
+                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
+                static_cast<uint64_t>(slot_state.task->task_id.raw)
+            );
+        }
+#if PTO2_SCHED_PROFILING
+        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
+#endif
+    }
+
+    if (is_pmu_enabled()) {
+        pmu_aicpu_record_task(
+            core_id, thread_idx, slot_state.task->task_id.raw,
+            slot_state.task->kernel_id[static_cast<int32_t>(subslot)], hank[core_id].core_type
+        );
+    }
+#endif
+}
+
+// Shift the queued (pending) task into the running slot, then mark pending empty.
+void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
+    core.running_subslot = core.pending_subslot;
+    core.running_reg_task_id = core.pending_reg_task_id;
+    core.pending_reg_task_id = AICPU_TASK_INVALID;
+    core.running_slot_state = core.pending_slot_state;
+    core.pending_slot_state = nullptr;
+#if PTO2_PROFILING
+    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
+#endif
+}
+
+// Mark the core's running slot empty; the core is idle afterwards.
+void SchedulerContext::clear_running_slot(CoreExecState &core) {
+    core.running_reg_task_id = AICPU_TASK_INVALID;
+    core.running_slot_state = nullptr;
+}
+
+void SchedulerContext::check_running_cores_for_completion(
+    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+    PTO2LocalReadyBuffer *local_bufs
+) {  // One completion-poll pass over this thread's running cores.
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto running_core_states = tracker.get_all_running_cores();
+    while (running_core_states.has_value()) {
+        int32_t bit_pos = running_core_states.pop_first();
+        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+        CoreExecState &core = core_exec_states_[core_id];
+
+        // Skip gated speculative cores. A STAGED task is parked on this core
+        // waiting for its doorbell — it physically cannot ACK/FIN yet, so
+        // reading its COND (MMIO, and the core is hot-spinning on its own SPR)
+        // every poll is pure waste that drags out the completion phase. The
+        // doorbell (try_speculative_release) flips spec_state to DISPATCHED, at
+        // which point the core becomes pollable again and its FIN is caught.
+        // Cheap cacheable load; no MMIO. Pending slot is empty while gated.
+        {
+            PTO2TaskSlotState *rs = core.running_slot_state;
+            if (rs != nullptr && rs->payload != nullptr &&
+                rs->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) {
+                continue;
+            }
+        }
+
+        // --- Judgment phase: read register, derive transition ---
+        // Use the precomputed cond_ptr (resolved once in handshake) to skip
+        // the reg_offset switch and reg_addr addition on every poll.
+        uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
+        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
+        // rmb() pins any AICore-published cacheable reads downstream of the
+        // FIN observation. Replaces the post-`__sync_synchronize` that the
+        // old read_reg() helper carried implicitly.
+        rmb();
+        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled) {
+            l2_swimlane.complete_probe_count++;
+        }
+#endif
+
+        // A pending task is "gated" when it is a speculative pre-stage still
+        // waiting on its doorbell (STAGED): it will not ack on the producer's FIN,
+        // so the Case 3.1 wait-for-pending-ack shortcut would deadlock. Detect it
+        // so decide_slot_transition completes the running FIN and promotes it.
+        bool pending_gated =
+            (core.pending_slot_state != nullptr && core.pending_slot_state->payload != nullptr &&
+             core.pending_slot_state->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING);
+        SlotTransition t = decide_slot_transition(
+            reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id, pending_gated
+        );
+        if (!t.matched) continue;  // Register names neither slot, or Case 3.1 transient: retry next poll.
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
+            l2_swimlane.complete_hit_count++;
+        }
+#endif
+
+#if PTO2_PROFILING
+        // Capture finish_ts at the FIN observation point — right after rmb()
+        // above pinned the cacheable AICore reads downstream of the register
+        // load, and BEFORE any fanin / deferred-release work. Anything later
+        // (slot transition apply, complete_slot_task fanin processing) would
+        // charge AICPU completion-processing cost to the (end → finish)
+        // span, masking the actual FIN-delivery latency.
+        uint64_t finish_ts = 0;
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
+            finish_ts = get_sys_cnt_aicpu();
+        }
+#endif
+
+        // --- Apply phase: execute actions based on transition ---
+
+        // 1. Complete finished tasks (capture pointers before modifying core state)
+        if (t.pending_done) {
+            complete_slot_task(
+                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.pending_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;  // Counts every retired slot, not only whole-task completions.
+        }
+        if (t.running_done) {
+            complete_slot_task(
+                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.running_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+
+        // 2. Update slot data
+        if (t.running_freed) {
+            if (core.pending_slot_state != nullptr && !t.pending_done) {
+                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+            } else {
+                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                if (t.pending_done) {
+                    // Case 1: pending FIN observed directly -- clear stale pending fields.
+                    // Without this, pending_reg_task_id retains a stale value that blocks
+                    // clear_pending_occupied and permanently degrades pipelining.
+                    core.pending_slot_state = nullptr;
+                    core.pending_reg_task_id = AICPU_TASK_INVALID;
+                }
+            }
+        }
+
+        // 3. Update tracker bitmap
+        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
+        if (is_idle) {
+            tracker.change_core_state(bit_pos);       // Mark idle
+            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
+            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
+            // when no pending task is currently held. Otherwise pending slot is occupied
+            // by a pre-loaded task and must stay protected.
+            tracker.clear_pending_occupied(bit_pos);
+        }
+
+        // 4. Progress signal (only when running task completes)
+        if (t.running_done) {
+            made_progress = true;
+        }
+    }
+}
+
+// =============================================================================
+// sync_start drain protocol
+// =============================================================================
+
+// Take ownership of slot_state and signal all threads to enter drain mode.
+// Returns true if this thread won the CAS and owns the drain slot.
+// Returns false if another thread already holds drain; caller must re-push slot_state.
+//
+// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store the task and
+// reset the ack mask and election flag, then release-store block_num.  Other threads
+// acquire-load sync_start_pending; seeing block_num > 0 ensures those stores are visible.
+bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
+    int32_t expected = 0;
+    if (!drain_state_.sync_start_pending.compare_exchange_strong(
+            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
+        )) {
+        return false;  // Another thread already holds the drain slot.
+    }
+    // We own the drain slot.  Store the task, then clear the ack mask and election flag, before publishing.
+    drain_state_.pending_task.store(slot_state, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    // Release store: all stores above are now visible to any thread that
+    // acquire-loads sync_start_pending and sees block_num > 0.
+    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
+    return true;
+}
+
+// Sum, over every active scheduler thread, the resources currently usable for `shape`.
+int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
+    int32_t available = 0;
+    for (int32_t tid = 0; tid < active_sched_threads_; ++tid) {
+        CoreTracker &tracker = core_trackers_[tid];
+        // Non-MIX shapes count the tracker's idle core offsets; MIX uses the
+        // tracker's cluster counter, filtered by core_mask.
+        if (shape != PTO2ResourceShape::MIX) available += tracker.get_idle_core_offset_states(shape).count();
+        else available += tracker.count_mix_running_clusters(core_mask);
+    }
+    return available;
+}
+
+// Drain worker: dispatch all blocks in one pass across all threads' trackers.
+// Called only when global resources >= block_num, so one pass always suffices.
+// All other threads are spinning -- the drain worker has exclusive tracker access.
+void SchedulerContext::drain_worker_dispatch(int32_t block_num) {
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (!slot_state) {  // NOTE(review): clears sync_start_pending, unlike handle_drain_mode's null path -- confirm.
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    uint8_t core_mask = slot_state->active_mask.core_mask();
+
+    for (int32_t t = 0;
+         t < active_sched_threads_ && slot_state->next_block_idx.load(std::memory_order_relaxed) < block_num; t++) {
+        auto valid = (shape == PTO2ResourceShape::MIX) ?
+                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
+                         core_trackers_[t].get_idle_core_offset_states(shape);
+        int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
+        int32_t remaining = slot_state->logical_block_num - start;  // presumably logical_block_num == block_num; confirm
+        int32_t claim = std::min(valid.count(), remaining);
+        slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+        int handle_count = 0;
+        for (int32_t b = 0; b < claim; b++) {
+            auto core_offset = valid.pop_first();
+            handle_count += prepare_block_for_dispatch(
+                t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]
+            );
+        }
+        wmb();  // Write barrier between preparing this tracker's blocks and publishing them.
+        uint64_t dispatch_ts = 0;
+#if PTO2_PROFILING
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+            dispatch_ts = get_sys_cnt_aicpu();
+        }
+#endif
+        for (int i = 0; i < handle_count; i++) {
+            publish_subtask_to_core(handles[i], dispatch_ts);
+        }
+    }
+
+    // All blocks dispatched -- clear drain state.
+    // Release fence ensures tracker mutations are visible to threads that
+    // acquire-load sync_start_pending == 0 and resume normal operation.
+    std::atomic_thread_fence(std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+}
+
+// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+//
+// Protocol (single-stage ack barrier):
+//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
+//      until all ack bits are set.
+//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
+//   2. Election: one thread wins the CAS and becomes the drain worker.
+//      If resources are insufficient, reset ack/election fields and return --
+//      all threads resume completion polling to free running cores, then retry.
+//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
+//      Non-elected threads spin-wait until sync_start_pending == 0.
+//      During dispatch the elected thread has exclusive tracker access.
+void SchedulerContext::handle_drain_mode(int32_t thread_idx) {
+    // Every spin here honors is_completed(): once completed_ latches (all tasks
+    // done, or a fatal error elsewhere) peers leave the dispatch loop and stop
+    // participating, so a thread parked in a drain spin would wait forever for acks
+    // or a gate-open. The AICPU watchdog never fires (these spins sit outside the
+    // dispatch loop's wall-clock budget), so the hang would escalate straight to the
+    // 3 s STARS op-exec timeout (507018) and poison the device. Bailing is always
+    // safe: a pending sync_start task is either already dispatched (stale re-popped
+    // slot) or moot under teardown, and deinit() resets drain_state_ before the next run.
+    // Spin (with SPIN_WAIT_HINT, like the other waits) until the -1 sentinel is replaced.
+    int32_t block_num;
+    while (true) {
+        if (is_completed()) return;
+        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+        if (block_num >= 0) break;
+        SPIN_WAIT_HINT();
+    }
+    if (block_num == 0) return;
+
+    uint32_t all_acked = (1u << active_sched_threads_) - 1;
+
+    // Ack barrier -- signal this thread has stopped dispatch.
+    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+    // Spin until all threads have acked.
+    // If our bit is cleared while waiting, elected reset due to insufficient resources.
+    while (true) {
+        if (is_completed()) return;
+        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+        if ((ack & all_acked) == all_acked) break;
+        if ((ack & (1u << thread_idx)) == 0) return;
+        SPIN_WAIT_HINT();
+    }
+
+    // Election -- exactly one thread wins the CAS.
+    int32_t expected = 0;
+    drain_state_.drain_worker_elected.compare_exchange_strong(
+        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
+    );
+
+    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
+        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            if (is_completed()) return;
+            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+        return;
+    }
+
+    // Elected: check if global resources are sufficient.
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (slot_state == nullptr) {
+        // pending_task is observed null only when a concurrent drain completion
+        // already cleared it (drain_worker_dispatch nulls it before reopening the
+        // gate). That drain is done and this is a stale-elected thread, so just
+        // release the election lock and return. Do NOT clear drain_ack_mask or
+        // sync_start_pending: a *new* drain run may already be active and
+        // accumulating acks, and zeroing them would corrupt it into a hang.
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    int32_t available = count_global_available(shape, slot_state->active_mask.core_mask());
+
+    if (available < block_num) {
+        // Insufficient resources -- reset drain fields so threads can resume
+        // completion polling to free running cores, then retry.
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+
+    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+    drain_worker_dispatch(block_num);
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_context.h
new file mode 100644
index 000000000..3af6dc19d
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_context.h
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_CONTEXT_H
+#define SCHEDULER_CONTEXT_H
+
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/unified_log.h"
+#include "scheduler_types.h"
+
+#include "scheduler/pto_scheduler.h"
+
+#include "aicore_completion_mailbox.h"
+#include "pto2_dispatch_payload.h"
+
+// These macros are defined in runtime.h, but we cannot include it here
+// (it pulls in Handshake which we only forward-declare).  Mirror the
+// authoritative values so the class layout compiles standalone.
+#ifndef RUNTIME_MAX_WORKER
+#define RUNTIME_MAX_WORKER 72  // fallback only: skipped when the macro is already defined
+#endif  // RUNTIME_MAX_WORKER
+#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_FUNC_ID 1024  // fallback only: skipped when the macro is already defined
+#endif  // RUNTIME_MAX_FUNC_ID
+
+// Forward declarations — avoid pulling in full headers for pointer/reference params.
+class Runtime;
+struct Handshake;
+struct PTO2Runtime;
+
+/**
+ * SchedulerContext: owns all scheduler-side state and methods.
+ *
+ * Held as a member of AicpuExecutor (sched_ctx_).  The single public entry
+ * point is resolve_and_dispatch(), called once per scheduler thread.
+ *
+ * All dispatch/completion/drain/cold-path logic is implemented as private
+ * member methods, split across three .cpp files by responsibility:
+ *   - scheduler_completion.cpp  (completion polling, drain protocol)
+ *   - scheduler_cold_path.cpp   (exit checks, stall diagnostics, profiling)
+ *   - scheduler_dispatch.cpp    (task dispatch loop and helpers)
+ */
+class SchedulerContext {
+public:
+    // =========================================================================
+    // Lifecycle
+    // =========================================================================
+
+    // Initialize scheduler state from the given runtime and thread layout.
+    // - Discovers cores via handshake_all_cores()
+    // - Assigns cores to scheduler threads
+    // - Resets task counters, payloads, per-core GlobalContext
+    // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
+    // - Captures AICore-register base (consumed by handshake_all_cores())
+    // Returns 0 on success, negative on failure (handshake / assignment error).
+    int32_t
+    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
+
+    // Reset all SchedulerContext-owned state to its post-construction defaults.
+    // Called by AicpuExecutor::deinit() during per-run teardown.
+    void deinit();
+
+    // =========================================================================
+    // Per-thread execution entry points (called by AicpuExecutor::run)
+    // =========================================================================
+
+    // Main scheduler thread entry: poll completion + dispatch ready tasks.
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
+
+    // Shutdown AICore registers for this thread's assigned cores.
+    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
+    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
+    int32_t shutdown(int32_t thread_idx);
+
+    // Run all post-orchestration scheduler bookkeeping:
+    //  - publishes core assignments to the perf collector (PTO2_PROFILING)
+    //  - latches submitted task count from PTO2 shared memory
+    //  - folds inline_completed_tasks into completed_tasks_
+    //  - flips orchestrator_done_ and triggers core transition
+    //    (skipped on fatal error — emergency_shutdown runs instead)
+    // Callers must invoke rt_orchestration_done(rt) before this — that
+    // step belongs to the orchestrator lifecycle, not the scheduler.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
+
+    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
+    // mode where rt is created by the orchestrator thread after init().
+    void bind_runtime(PTO2Runtime *rt);  // NOTE(review): confirm the device-orchestration wording is current for host_build_graph
+
+    // =========================================================================
+    // State queries / external synchronization points
+    // =========================================================================
+
+    int32_t aic_count() const { return aic_count_; }
+    int32_t aiv_count() const { return aiv_count_; }
+    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
+    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
+
+private:
+    // =========================================================================
+    // State
+    // =========================================================================
+
+    // --- Scheduler binding & per-core runtime state ---
+    alignas(64) PTO2SchedulerState *sched_{nullptr};
+    PTO2Runtime *rt_{nullptr};
+
+    // Per-core execution state, indexed by core_id (= worker_id)
+    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
+
+    // Cluster-ordered core trackers, one per scheduler thread
+    CoreTracker core_trackers_[MAX_AICPU_THREADS];
+
+    // Per-core dispatch payload storage: dual-buffer for pipelining.
+    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
+    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // Per-core deferred-completion software registration storage.  This has
+    // the same runtime lifetime as payload_per_core_, but is kept out of the
+    // dispatch payload so normal task dispatch layout and cache footprint stay
+    // unchanged.
+    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // sync_start drain coordination
+    SyncStartDrainState drain_state_;
+
+#if PTO2_PROFILING
+    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
+    // Cached once at init() from get_l2_swimlane_level(), AFTER
+    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // --- Task-execution tracking ---
+    std::atomic<int32_t> completed_tasks_{0};
+    int32_t total_tasks_{0};
+    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
+    // volatile prevents the compiler from hoisting the load out of spin loops.
+    volatile bool orchestrator_done_{false};  // NOTE(review): volatile adds no inter-thread ordering; confirm a barrier pairs with it
+    std::atomic<bool> completed_{false};
+    uint64_t *func_id_to_addr_{nullptr};
+
+    // --- Core-transition coordination ---
+    std::atomic<bool> transition_requested_{false};
+    std::atomic<int32_t> wait_reassign_{0};
+    std::atomic<bool> reassigned_{false};
+
+    // --- Thread/core configuration ---
+    int32_t active_sched_threads_{0};
+    int32_t sched_thread_num_{0};
+    bool orch_to_sched_{false};
+    int32_t aicpu_thread_num_{0};
+    int32_t cores_total_num_{0};
+
+    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
+    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aic_count_{0};
+    int32_t aiv_count_{0};
+
+    // Platform AICore-register base array (set by AicpuExecutor before init()).
+    uint64_t regs_{0};  // NOTE(review): init() is documented as capturing this from regs_base; reconcile with the line above
+
+#if PTO2_PROFILING
+    // PMU profiling: physical core IDs for PMU MMIO base resolution.
+    // Separate storage because CoreExecState's 64-byte budget has no room for
+    // physical_core_id when PTO2_PROFILING=1.
+    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
+#endif
+
+    // =========================================================================
+    // Core management (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
+    int32_t handshake_all_cores(Runtime *runtime);
+
+    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
+    bool assign_cores_to_threads();
+
+    // Re-distribute all cores across all threads after orchestration completes.
+    void reassign_cores_for_all_threads();
+
+    // Emergency shutdown: broadcast exit signal to every handshake'd core and
+    // deinit their AICore register blocks. Idempotent.
+    void emergency_shutdown(Runtime *runtime);
+
+    // =========================================================================
+    // Dispatch (scheduler_dispatch.cpp)
+    // =========================================================================
+
+    static const char *shape_name(PTO2ResourceShape shape);
+
+    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
+    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
+    // convention already established in the stall log family.
+    static inline const char *subslot_name(PTO2SubtaskSlot s) {
+        switch (s) {
+        case PTO2SubtaskSlot::AIC:
+            return "aic";
+        case PTO2SubtaskSlot::AIV0:
+            return "aiv0";
+        case PTO2SubtaskSlot::AIV1:
+            return "aiv1";
+        }
+        return "?";  // s matched none of the cases above
+    }
+
+    int pop_ready_tasks_batch(
+        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
+        int max_count
+    );
+
+    void build_payload(
+        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        const AsyncCtx &async_ctx, int32_t block_idx
+    );
+
+    // Batched-dispatch primitives. prepare_* builds the payload and per-core
+    // state; publish_* issues the MMIO register write. Callers must wmb()
+    // between the prepare batch and the publish batch, then sample
+    // get_sys_cnt_aicpu() once and pass it to publish_* for every handle.
+    //
+    // dispatch_timestamp_slot points to the CoreExecState slot
+    // (pending_dispatch_timestamp / running_dispatch_timestamp) selected at
+    // prepare time, or nullptr when L2 swimlane is below AICPU_TIMING and no
+    // dispatch timestamp is being recorded.
+    struct PublishHandle {
+        uint64_t reg_addr;
+        uint32_t reg_task_id;
+        int32_t core_offset;
+        uint64_t *dispatch_timestamp_slot;
+    };
+
+    PublishHandle prepare_subtask_to_core(
+        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        bool to_pending, int32_t block_idx
+    );
+
+    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts) {
+        if (h.dispatch_timestamp_slot != nullptr) {  // nullptr: no dispatch timestamp is being recorded
+            *h.dispatch_timestamp_slot = dispatch_ts;
+        }
+        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));  // issued after the timestamp store
+    }
+
+    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
+    // caller-supplied handles buffer. Returns the number of handles written.
+    int prepare_block_for_dispatch(
+        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
+        bool to_pending, int32_t block_idx, PublishHandle *out_handles
+    );
+
+    void dispatch_shape(
+        int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
+        CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
+    );
+
+    // Speculative early-dispatch (Hook 1). After normal dispatch leaves idle
+    // cores spare, pre-stage the consumers of any RUNNING flagged producer onto
+    // those cores with not_ready=1 (gated). Touches no dependency state — the
+    // task is released by the doorbell at its normal ready-pop (Hook 2).
+    int32_t try_speculative_early_dispatch(int32_t thread_idx);
+
+    // Stage the already-claimed range [start, start+count) of consumer `c` onto
+    // thread_idx's idle (RUNNING slot) then pending (gated-pending, promote-on-FIN)
+    // cores from the provided free-core sets. The caller advances next_block_idx and
+    // re-pushes `c` BEFORE calling, so this expensive prepare+publish runs
+    // concurrently with peers (mirrors the normal SPMD dispatch path). Returns the
+    // number of blocks staged.
+    int32_t stage_consumer_blocks(
+        int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
+        CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
+    );
+
+    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
+    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
+    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
+    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
+    // skipped for the whole pass but MIX-PENDING still runs.
+    //
+    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
+    // current pass only. The next loop iteration re-evaluates after Phase 1
+    // completion polling and the global MIX queue draining (here or on any
+    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
+    // not unbounded — once mix completes on at least one cluster, the next
+    // pass either drains the residual or admits AIC/AIV.
+    void dispatch_ready_tasks(
+        int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
+        bool pmu_active, bool &made_progress, bool &try_pushed
+    );
+
+    // Returns true if any *other* scheduler thread currently has an idle core
+    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
+    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
+    // rationale and the safety argument against the drain worker.
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
+
+    // True if mix tasks remain anywhere this thread could see them: the caller's
+    // MIX local LIFO stack or the global MIX ready queue. Approximate —
+    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
+    // positions with std::memory_order_relaxed and may interleave with concurrent
+    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
+    // loads — that one isn't on this path. A stale read here causes at most one
+    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
+        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
+    }
+
+    // =========================================================================
+    // Completion & drain (scheduler_completion.cpp)
+    // =========================================================================
+
+    static SlotTransition decide_slot_transition(
+        int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated = false
+    );
+
+    void complete_slot_task(
+        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
+        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+        ,
+        uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+    );
+
+    static void promote_pending_to_running(CoreExecState &core);
+    static void clear_running_slot(CoreExecState &core);
+
+    void check_running_cores_for_completion(
+        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+    );
+
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
+    int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask);
+    void drain_worker_dispatch(int32_t block_num);
+    void handle_drain_mode(int32_t thread_idx);
+
+    // =========================================================================
+    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    __attribute__((noinline, cold)) LoopAction
+    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
+
+    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
+
+    __attribute__((noinline, cold)) LoopAction
+    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
+
+    __attribute__((noinline, cold)) void
+    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
+
+    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
+        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+    );
+
+    // Reverse lookup: given a global core_id, find which scheduler thread's
+    // tracker owns it. Returns -1 if not found. Linear scan — only used on
+    // the cold diagnostic path.
+    int32_t find_core_owner_thread(int32_t core_id) const;
+
+    // Does this thread own any core with a RUNNING task (running_slot_state set)?
+    // Gates the scheduler timeout fatal latch: a thread without an owned
+    // RUNNING task has no first-hand evidence of a stuck dispatch and must
+    // not declare global fatal on its own idle observation. The thread that
+    // does own the stuck task will reach the budget on its own polls and
+    // latch with valid evidence (or recover when the COND register flips).
+    bool self_owns_running_task(int32_t thread_idx) const;
+
+    // Does *any* scheduler thread own a RUNNING task? Used as the second
+    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
+    // owns RUNNING work AND tasks remain incomplete, the system is in a
+    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
+    // ownerless idle threads are the only observers — let one of them latch.
+    bool no_thread_owns_running_task() const;
+
+    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
+        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+        int32_t last_progress_count
+#if PTO2_PROFILING
+        ,
+        uint64_t sched_start_ts
+#endif
+    );
+
+#if PTO2_PROFILING
+    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
+#endif
+
+    // =========================================================================
+    // Small inline helpers
+    // =========================================================================
+
+    uint64_t get_function_bin_addr(int func_id) const {  // looks up func_id in func_id_to_addr_; logs and returns 0 on failure
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
+            return 0;  // failure value: null map or func_id outside [0, RUNTIME_MAX_FUNC_ID)
+        }
+        return func_id_to_addr_[func_id];
+    }
+};
+
+#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_dispatch.cpp
new file mode 100644
index 000000000..0fcc0ea5d
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_dispatch.cpp
@@ -0,0 +1,1473 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+
+#include "common.h"  // debug_assert
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "callable.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)  // branch hint: tells the compiler the condition is expected to be false
+#endif  // unlikely
+
+// =============================================================================
+// Dispatch helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;  // presumably the capacity of the deferred-release slot list — TODO confirm at the use site
+}  // namespace
+
+// The speculative core bitmask (PTO2_SPEC_CORE_MASK_WORDS * 64 bits) must cover
+// every global core_id, and the per-core doorbell table is sized to match.
+static_assert(
+    RUNTIME_MAX_WORKER <= PTO2_SPEC_CORE_MASK_WORDS * 64, "staged_core_mask too small for RUNTIME_MAX_WORKER cores"
+);
+
+const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {  // maps a resource shape to a static upper-case label
+    switch (shape) {
+    case PTO2ResourceShape::AIC:
+        return "AIC";
+    case PTO2ResourceShape::AIV:
+        return "AIV";
+    case PTO2ResourceShape::MIX:
+        return "MIX";
+    case PTO2ResourceShape::DUMMY:
+        return "DUMMY";
+    }
+    return "UNKNOWN";  // shape matched none of the cases above
+}
+
+bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
+    // Cross-thread read of peer trackers without explicit synchronization. The
+    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
+    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
+    // value is consumed only as a scheduling *hint* — a stale read at worst
+    // causes one missed/extra pending dispatch, corrected on the next iteration.
+    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
+    // barrier (all peers spin out of the dispatch path before any tracker
+    // mutation), so this routine is never racing the drain worker.
+    for (int32_t t = 0; t < active_sched_threads_; t++) {  // only trackers [0, active_sched_threads_) are inspected
+        if (t == self_thread_idx) continue;  // the caller's own tracker is excluded
+        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
+            return true;  // the first peer with a matching idle core ends the scan
+        }
+    }
+    return false;
+}
+
+int SchedulerContext::pop_ready_tasks_batch(
+    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+) {  // wrapper over sched_->get_ready_tasks_batch(); profiling builds also record pop statistics
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#if PTO2_SCHED_PROFILING
+    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
+    uint64_t t_pop_start = get_sys_cnt_aicpu();
+    int count = sched_->get_ready_tasks_batch(
+        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
+    );
+    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);  // counter delta spent inside the pop
+#else  // !PTO2_SCHED_PROFILING
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif  // PTO2_SCHED_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        if (count > 0) {
+            l2_swimlane.pop_hit += count;  // hits are counted per task popped
+        } else {
+            l2_swimlane.pop_miss++;  // misses are counted per empty pop
+        }
+    }
+#else  // !PTO2_PROFILING
+    (void)thread_idx;  // thread_idx is only used by the profiling paths above
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif  // PTO2_PROFILING
+    return count;
+}
+
+void SchedulerContext::build_payload(
+    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+    const AsyncCtx &async_ctx, int32_t block_idx
+) {  // fills dispatch_payload for one subtask: kernel address, args, local context and the not_ready gate
+    int32_t slot_idx = static_cast<int32_t>(subslot);
+    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);  // NOTE(review): get_function_bin_addr() returns 0 on failure
+    dispatch_payload.function_bin_addr = callable->resolved_addr();  // NOTE(review): dereferenced without a null check
+    auto &payload = *slot_state.payload;
+    int n = 0;  // NOTE(review): n is not bounds-checked against the capacity of args[] here
+    for (int32_t i = 0; i < payload.tensor_count; i++) {
+        dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);  // tensors are passed by address
+    }
+    for (int32_t i = 0; i < payload.scalar_count; i++) {
+        dispatch_payload.args[n++] = payload.scalars[i];  // scalars follow the tensors and are passed by value
+    }
+    dispatch_payload.local_context.block_idx = block_idx;
+    dispatch_payload.local_context.block_num = slot_state.logical_block_num;
+    dispatch_payload.local_context.async_ctx = async_ctx;
+    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
+    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
+    // Speculative early-dispatch: a task being staged (Hook 1 set spec_state to
+    // STAGING before this call) is gated — the AICore must wait for the
+    // DATA_MAIN_BASE high-32 doorbell. All other dispatches run on pickup.
+    dispatch_payload.not_ready =
+        (slot_state.payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) ? 1 : 0;
+}
+
+SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
+    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending,
+    int32_t block_idx
+) {  // picks the next reg_task_id for the core, builds its payload, records the slot, returns the publish handle
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto core_id = tracker.get_core_id_by_offset(core_offset);  // core_id indexes the per-core arrays below
+    CoreExecState &core_exec_state = core_exec_states_[core_id];
+
+    core_exec_state.dispatch_seq++;
+    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;  // the id handed to the core is the masked sequence number
+    static_assert(
+        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
+    );
+    if (reg_task_id >= AICORE_EXIT_SIGNAL) {  // ids from AICORE_EXIT_SIGNAL upward are never issued
+        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);  // advance the sequence past the skipped range
+        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    }
+
+    uint32_t buf_idx = reg_task_id & 1u;  // the low bit selects one of the two per-core buffers
+    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+    deferred_slab->count = 0;  // reset the slab that shares buf_idx with this payload before reuse
+    deferred_slab->error_code = PTO2_ERROR_NONE;
+    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+    if (to_pending) {
+        core_exec_state.pending_subslot = subslot;
+        core_exec_state.pending_slot_state = &slot_state;
+        core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
+    } else {
+        core_exec_state.running_subslot = subslot;
+        core_exec_state.running_slot_state = &slot_state;
+        core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
+        tracker.change_core_state(core_offset);
+    }
+    tracker.set_pending_occupied(core_offset);  // executed on both the pending and the running path
+
+    LOG_DEBUG(
+        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
+        " core_offset=%d core_id=%d reg_task_id=%u",
+        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
+        static_cast<int64_t>(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
+        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
+        core_offset, core_id, reg_task_id
+    );
+
+    // AICore buffer rotation lives on the dispatch path: count this dispatch
+    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
+    // boundary. The completion-before-dispatch invariant makes this race-free
+    // (all prior tasks on this core have FIN'd, so AICore has dcci'd their
+    // records out of the old buffer). Gated on the same enable bit as flush
+    // so level=1 (AICORE_TIMING-only) participates without needing complete_task.
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
+        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
+    }
+#endif
+
+    uint64_t *dispatch_timestamp_slot = nullptr;  // stays nullptr unless the profiling branch below selects a slot
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+        dispatch_timestamp_slot =
+            to_pending ? &core_exec_state.pending_dispatch_timestamp : &core_exec_state.running_dispatch_timestamp;
+    }
+#endif
+
+    return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};  // no register write happens here
+}
+
+int SchedulerContext::prepare_block_for_dispatch(
+    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending,
+    int32_t block_idx, PublishHandle *out_handles
+) {  // prepares every subtask of one block and writes one handle per subtask into out_handles
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {  // optional BEFORE_DISPATCH argument dump, compiled in for profiling builds only
+        dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
+            [](ActiveMask active_mask, int raw_subtask_id) {
+                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+            },
+            [this](int32_t func_id) {
+                return get_function_bin_addr(func_id);
+            }
+        );
+    }
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    if (shape == PTO2ResourceShape::MIX) {  // MIX: one handle per subtask bit set in the core mask
+        uint8_t cmask = slot_state.active_mask.core_mask();
+        int n = 0;
+        if (cmask & PTO2_SUBTASK_MASK_AIC) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, to_pending,
+                block_idx
+            );
+        }
+        if (cmask & PTO2_SUBTASK_MASK_AIV0) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, to_pending,
+                block_idx
+            );
+        }
+        if (cmask & PTO2_SUBTASK_MASK_AIV1) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, to_pending,
+                block_idx
+            );
+        }
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(cmask);  // counts every bit set in cmask
+#endif
+        return n;
+    } else if (shape == PTO2ResourceShape::AIC) {  // AIC: core_offset is used as-is for the single AIC subtask
+        out_handles[0] =
+            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
+#endif
+        return 1;
+    } else {  // every remaining shape is prepared as a single AIV0 subtask on core_offset
+        out_handles[0] =
+            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
+#endif
+        return 1;
+    }
+}
+
+void SchedulerContext::dispatch_shape(  // Pop ready `shape` tasks and dispatch their blocks onto `tracker`'s cores.
+    int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
+    CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
+) {  // entered_drain / made_progress / try_pushed are out-flags; this function only ever sets them to true.
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    if (entered_drain) return;  // drain flag was already set before this call; dispatch nothing
+
+    bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
+    bool is_mix = (shape == PTO2ResourceShape::MIX);
+    auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
+    if (!cores.has_value()) return;  // no candidate cores/clusters for this shape and phase
+
+    while (cores.has_value() && !entered_drain) {
+        int want = cores.count();  // requested batch size = number of candidate cores/clusters
+        PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
+        int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
+        if (got == 0) break;  // nothing was popped for this shape
+
+        // sync_start exclusion gate.
+        //
+        // When the popped batch contains a sync_start task we MUST publish each
+        // prior task with its own wmb so AICore receives them with time
+        // separation. The drain coordinator's `count_global_available()` check
+        // reads the per-thread CoreTracker, and although `prepare_block_for_dispatch`
+        // marks cores occupied synchronously, the head-start between successive
+        // tasks is what lets the surrounding completion loop catch up on FINs in
+        // the retry window when the sync_start task hits insufficient resources.
+        // Bursting all prior tasks at the end of the pop (cross-task batching)
+        // collapses that head-start and causes spmd_sync_start_stress to time
+        // out via 507018 on ~40% of runs — see
+        // docs/investigations/2026-06-cross-task-batched-publish.md.
+        //
+        // When the batch carries no sync_start task, no drain entry can happen
+        // in this pop, so we hoist `handles[]`, `wmb()`, and the publish loop
+        // out of the per-task body. One wmb amortizes across all tasks and one
+        // dispatch_ts is shared, which restores ~60 ns first-to-last AICore
+        // start span for single-block decode kernels (out_proj, q_proj, ...).
+        // Detection is a single mask check per task — cheap relative to even
+        // one register write.
+        bool any_sync_start = false;
+        for (int bi = 0; bi < got; bi++) {
+            if (batch[bi]->active_mask.requires_sync_start()) {
+                any_sync_start = true;
+                break;
+            }
+        }
+
+        // handles[] is sized for the MIX worst case: total claims across the
+        // pop bounded by `cores.count() ≤ MAX_CLUSTERS`, and each block
+        // contributes ≤ 3 subtasks for MIX.
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+        int handle_count = 0;  // prepared-but-unpublished entries currently held in handles[]
+        bool dispatched_any = false;
+        // Slots dispatched this pop whose dispatch_fanin must be propagated to
+        // consumers. Deferred until AFTER publish (below) so a flagged producer's
+        // fanout walk never sits between claiming cores and publishing its own
+        // blocks — doing it inline delays this thread's blocks while peer threads
+        // co-dispatching the same SPMD task publish immediately, misaligning the
+        // task's block starts. Bounded by cores.count() ≤ MAX_CLUSTERS dispatches.
+        PTO2TaskSlotState *prop_list[CoreTracker::MAX_CLUSTERS];  // NOTE(review): batch[] is 3x larger — confirm bound
+        int prop_n = 0;
+#if PTO2_SCHED_PROFILING
+        uint64_t t_setup_start = get_sys_cnt_aicpu();
+#endif
+
+        // Flush prepared-but-unpublished handles. Required before
+        // `enter_drain_mode` so the drain coordinator sees cores as occupied,
+        // and at the per-task boundary when `any_sync_start` is true.
+        auto flush_publish = [&]() {
+            if (handle_count == 0) return;  // nothing prepared since the last flush
+            wmb();  // one barrier ahead of the whole publish loop below
+            uint64_t dispatch_ts = 0;  // stays 0 unless profiling samples a timestamp below
+#if PTO2_PROFILING
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+                dispatch_ts = get_sys_cnt_aicpu();
+            }
+#endif
+            for (int i = 0; i < handle_count; i++) {
+                publish_subtask_to_core(handles[i], dispatch_ts);
+            }
+            handle_count = 0;  // handles[] is reused from index 0 for the next group
+            made_progress = true;
+        };
+
+        for (int bi = 0; bi < got; bi++) {
+            PTO2TaskSlotState *slot_state = batch[bi];
+            CoreTracker::BitStates selected_mix_clusters(0ULL);  // MIX only: clusters classified as `wanted` below
+
+            if (is_mix) {
+                auto candidates = cores;  // scan a copy; `cores` itself is only reduced when a block is placed
+                uint8_t cmask = slot_state->active_mask.core_mask();
+                auto wanted = is_pending ? CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING;
+                while (candidates.has_value()) {
+                    int32_t cluster_offset = candidates.pop_first();
+                    if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) {
+                        selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset);
+                    }
+                }
+                if (!selected_mix_clusters.has_value()) {  // no cluster matches this task in this phase: requeue it
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    continue;
+                }
+            }
+
+            // (Speculative pre-staged tasks never reach this ready-pop: they are
+            // released by their doorbell in release_fanin_and_check_ready the
+            // instant their last producer completes — see try_speculative_release.)
+
+            if (slot_state->active_mask.requires_sync_start()) {
+                if (is_pending) {  // sync_start tasks are requeued, never dispatched, in the PENDING phase
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    continue;
+                }
+                int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
+                if (available < slot_state->logical_block_num) {  // cannot place every block at once
+                    flush_publish();  // publish what is already prepared before trying to enter drain
+                    if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    }
+                    for (int rem = bi + 1; rem < got; rem++) {  // requeue the untouched rest of the batch
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
+                    }
+                    entered_drain = true;  // set even when enter_drain_mode returned false
+                    break;
+                }
+            }
+
+            if (!cores.has_value()) {  // candidate set exhausted mid-batch: requeue this task and all later ones
+                flush_publish();
+                sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                break;
+            }
+
+            dispatched_any = true;
+            try_pushed = true;
+            // Record for deferred dispatch_fanin propagation after this pop's
+            // blocks are published (see after the loop). propagate's own guard
+            // filters non-flagged slots, so recording unconditionally is cheap.
+            if (prop_n < static_cast<int>(sizeof(prop_list) / sizeof(prop_list[0]))) {
+                prop_list[prop_n++] = slot_state;
+            }  // when prop_list is full the slot is not recorded, so it gets no propagation below
+            // Claim a contiguous range of blocks, hand the slot back to the
+            // ready queue immediately, then perform the expensive dispatches.
+            // This lets other schedulers concurrently claim and dispatch the
+            // remaining blocks of the same SPMD task instead of spinning while
+            // this thread fills all its own cores. Only local `start + b` is
+            // read after the push — `next_block_idx` may already be advanced
+            // by another scheduler that popped the slot.
+            int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
+            int32_t remaining = slot_state->logical_block_num - start;
+            int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
+            int32_t claim = std::min(available, remaining);
+            slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
+
+            if (start + claim < slot_state->logical_block_num) {  // blocks remain unclaimed: requeue the slot
+                sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+            }
+
+            for (int32_t b = 0; b < claim; b++) {
+                auto core_offset = is_mix ? selected_mix_clusters.pop_first() : cores.pop_first();
+                if (is_mix) {
+                    cores.clear_bit(core_offset);  // MIX popped from the selected subset; drop it from `cores` too
+                }
+                handle_count += prepare_block_for_dispatch(
+                    thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]
+                );
+            }
+
+            // Sync_start exclusion: flush per task so prior tasks have head-
+            // start time before any sync_start drain check. Normal batches
+            // fall through and accumulate for one cross-task flush at the
+            // end of the pop.
+            if (any_sync_start) {
+                flush_publish();
+            }
+        }
+
+        flush_publish();  // publish whatever is still pending (no-op when handle_count == 0)
+        // Blocks are published; now propagate dispatch_fanin for any flagged
+        // producers dispatched above (knob A: producer is running). Off the
+        // pre-publish path so it cannot delay or misalign their blocks.
+        for (int i = 0; i < prop_n; i++) {
+            sched_->propagate_dispatch_fanin(*prop_list[i]);
+        }
+#if PTO2_SCHED_PROFILING
+        l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
+#endif
+
+        if (!dispatched_any) break;  // every popped task was requeued; stop instead of re-popping them
+
+        if (!cores.has_value()) {  // local candidate set used up: re-query the tracker for the next pop
+            cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
+        }
+    }
+}
+
+void SchedulerContext::dispatch_ready_tasks(  // One dispatch pass: spill overflow, IDLE stage, then PENDING stage.
+    int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
+    bool pmu_active, bool &made_progress, bool &try_pushed
+) {
+    using Phase = CoreTracker::DispatchPhase;
+    constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
+
+    // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle
+    // through this 2-elem array, with order toggled by thread parity for
+    // shape-level load balancing across threads.
+    static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
+        {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
+        {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
+    };
+    const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];  // even threads: AIC first; odd: AIV first
+
+    // Spill overflow from local_bufs to the shared ready queue BEFORE we start
+    // dispatching. release_fanin's fast path packs all newly-ready consumers
+    // into the producing thread's local_bufs (zero atomic, peer-invisible). For
+    // batch releases (e.g. attn_fence → 50 out_proj consumers) that
+    // overshoots this thread's slot budget so peers are starving while we
+    // hoard. The cross-thread invisibility window between "complete pushes 50
+    // to local" and "IDLE-AIC's mid-phase flush exposes overflow to shared"
+    // is what shows up in the swimlane as the multi-microsecond inter-thread
+    // stagger on out_proj's first wave.
+    //
+    // Gate conditions:
+    //   (a) local count exceeds this thread's per-shape block budget — we
+    //       can't dispatch them all even with both RUNNING+PENDING slots;
+    //   (b) at least one peer has idle cores in this shape — they want work.
+    // Both must hold to avoid wasting a CAS push when we could profitably
+    // self-dispatch the overflow. Condition (b) reads peer CoreTracker
+    // (plain 8-byte load on a rarely-contended cache line, ~5 ns) — we
+    // deliberately avoid ready_queues[s].size() here, which is two atomic
+    // loads on lines pushers + poppers actively bounce.
+    //
+    // Capacity derives from how cores are partitioned across sched threads:
+    //   per-shape budget = (PLATFORM_MAX_BLOCKDIM / active_sched_threads_)
+    //                       × cores_per_blockdim_for_that_shape
+    //   MIX is 1 cluster per block dim, so its budget equals the block-dim
+    //   share without multiplying.
+    //
+    // Push the trailing `excess` slot pointers — O(1) count decrement, no
+    // memmove. push_batch is one CAS for the whole excess; peers see the
+    // batch immediately and can race for them.
+    const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;  // NOTE(review): assumes divisor != 0
+    const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {  // indexed by shape value; assumes AIC, AIV, MIX order
+        /*AIC=*/bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
+        /*AIV=*/bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
+        /*MIX=*/bd_per_thread,
+    };
+    for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+        auto &lb = local_bufs[s];
+        int32_t excess = lb.count - thread_capacity[s];
+        if (excess <= 0) continue;  // gate (a): local count is within this thread's budget
+        if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;  // gate (b)
+        sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
+        lb.count -= excess;
+    }
+
+    auto flush_local_bufs = [&]() {  // move every local buffer's contents to the shared ready queue of its shape
+        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+            auto &lb = local_bufs[s];
+            if (lb.count > 0) {
+                sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
+                lb.count = 0;
+            }
+        }
+    };
+    // Every return path below must flush; wrap in RAII so we cannot forget.
+    // The mid-function flush between IDLE and PENDING is still called
+    // explicitly — guard only covers exit.
+    struct FlushGuard {
+        decltype(flush_local_bufs) &flush_fn;  // reference to the lambda above, which outlives the guard
+        ~FlushGuard() { flush_fn(); }
+    } flush_guard{flush_local_bufs};
+
+    bool entered_drain = false;  // set by dispatch_shape; every stage below returns as soon as it is true
+
+    // ===== IDLE stage =====
+    dispatch_shape(
+        thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress,
+        try_pushed
+    );
+    if (entered_drain) return;
+
+    // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass.
+    // MIX-PENDING below still runs — that is the core of "mix strict priority":
+    // pending slots are spent on mix before AIC/AIV get any chance.
+    bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
+
+    if (!skip_aic_aiv) {
+        for (int i = 0; i < 2; i++) {
+            PTO2ResourceShape s = aic_aiv[i];
+            dispatch_shape(
+                thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
+                try_pushed
+            );
+            if (entered_drain) return;
+        }
+    }
+
+    // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
+    // peer-thread reads see the IDLE-stage release_fanin output.
+    flush_local_bufs();
+
+    if (pmu_active) return;  // with pmu_active set, the PENDING stage below is skipped entirely
+
+    // ===== PENDING stage =====
+    // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that
+    // peer's next IDLE-MIX iteration will pull the mix task from the global
+    // queue (already flushed above) at lower latency than us pre-loading a
+    // pending slot here. Forward progress for MIX is preserved: at least one
+    // thread will run MIX-IDLE next pass and consume the residual.
+    //
+    // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain
+    // via pending slots on this thread when no peer is idle.
+    if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) {
+        dispatch_shape(
+            thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain,
+            made_progress, try_pushed
+        );
+        if (entered_drain) return;
+    }
+
+    // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
+    // it set; otherwise, escalate iff PENDING-MIX left residual.
+    if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) {
+        skip_aic_aiv = true;
+    }
+
+    // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin
+    // during in-flight completions; flush_guard ensures these don't carry
+    // across to the next iteration's IDLE stage.
+    if (skip_aic_aiv) return;
+
+    // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
+    // will pull from the global queue on its next IDLE pass.
+    for (int i = 0; i < 2; i++) {
+        PTO2ResourceShape s = aic_aiv[i];
+        if (has_idle_in_other_threads(thread_idx, s)) continue;  // a peer has idle cores of this shape
+        dispatch_shape(
+            thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
+            try_pushed
+        );
+        if (entered_drain) return;
+    }
+}
+
+// Stage the ALREADY-CLAIMED range [start, start+count) of consumer `c` onto
+// thread_idx's idle then pending cores. The caller (the queue drain) has advanced
+// next_block_idx by `count` under pop-exclusivity AND re-pushed `c` for peers
+// BEFORE calling this — so this, the expensive prepare+publish, runs CONCURRENTLY
+// with peers staging other ranges of the same consumer. This mirrors the normal
+// SPMD dispatch path (claim range -> store next_block_idx -> re-push -> dispatch).
+// `idle`/`pend` are this thread's free-core sets, sized so idle.count+pend.count >=
+// count (the caller clamped the claim to them), so all `count` blocks get a core.
+//
+// Rule 1: idle cores -> gated task in the RUNNING slot. Rule 2: PENDING slot of
+// cores running a real task -> promoted in when that task FINs (gated-pending Case
+// 3.3 in decide_slot_transition completes the running FIN + promotes instead of
+// waiting for an ack the gated task never sends). Each staged core stays
+// pending_occupied while gated, so no second gated block stacks on it.
+//
+// Self-ring: release flips STAGING->DISPATCHED then rings the mask. A block staged
+// after that flip isn't in the mask release read, so this thread rings it here. The
+// seq_cst order between "OR mask then load spec_state" (here) and "store DISPATCHED
+// then read mask" (release) guarantees every gated core's doorbell fires.
+int32_t SchedulerContext::stage_consumer_blocks(  // Returns the number of blocks staged by this call.
+    int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
+    CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    // Stamp the real pre-stage time (NOT 0) so the swimlane shows these blocks
+    // dispatched during the producer's run, not at trace start.
+    uint64_t early_dispatch_ts = get_sys_cnt_aicpu();
+    uint64_t my_cores[PTO2_SPEC_CORE_MASK_WORDS] = {0};  // cores this thread gated (for self-ring)
+    int32_t staged = 0;
+    int32_t block = start;  // next block index to stage; shared by both stage_from calls below
+    auto stage_from = [&](CoreTracker::BitStates &avail, bool to_pending) {
+        // Mirror the normal flush_publish (scheduler_dispatch.cpp wmb()+publish loop):
+        // prepare all claimed blocks' payloads, one wmb(), then publish. The wmb
+        // guarantees the not_ready gate + args are globally visible before any
+        // DATA_MAIN_BASE token — without it a gated core can pick up the token and
+        // dcci a stale payload (the doorbell/release path mirrors normal dispatch).
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];  // NOTE(review): `n` is not bounds-checked against this
+        int n = 0;
+        while (count > 0 && avail.has_value()) {  // one block per core popped from `avail`; consumes `count`
+            int32_t core_offset = avail.pop_first();
+            n += prepare_block_for_dispatch(thread_idx, core_offset, *c, shape, to_pending, block, &handles[n]);
+            block++;
+            count--;
+            staged++;
+        }
+        if (n == 0) return;  // no handles were prepared from this core set
+        wmb();
+        for (int i = 0; i < n; i++) {
+            publish_subtask_to_core(handles[i], early_dispatch_ts);
+            int32_t cid = tracker.get_core_id_by_offset(handles[i].core_offset);
+            sched_->spec_doorbell_table[cid].addr = handles[i].reg_addr;  // doorbell target, read by the ring below
+            sched_->spec_doorbell_table[cid].token = handles[i].reg_task_id;
+            my_cores[cid >> 6] |= (1ULL << (cid & 63));  // 64 core ids per mask word
+        }
+    };
+    if (idle.has_value()) stage_from(idle, /*to_pending=*/false);
+    if (pend.has_value()) stage_from(pend, /*to_pending=*/true);
+    // Publish all this thread's gated cores into the shared mask in one OR per word
+    // (vs one per subtask) so release sees them; seq_cst keeps the self-ring order.
+    for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
+        if (my_cores[w] != 0) c->payload->staged_core_mask[w].fetch_or(my_cores[w], std::memory_order_seq_cst);
+
+    // If release already flipped DISPATCHED, it may have read the mask before our
+    // bits landed — ring our own cores so none is left gated forever.
+    if (staged > 0 && c->payload->spec_state.load(std::memory_order_seq_cst) == PTO2_SPEC_DISPATCHED) {
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
+            uint64_t bits = my_cores[w];
+            while (bits != 0) {
+                int cid = w * 64 + __builtin_ctzll(bits);  // core id of the lowest set bit in this word
+                bits &= bits - 1;  // clear that lowest set bit
+                PTO2SchedulerState::ring_one_doorbell(
+                    sched_->spec_doorbell_table[cid].addr, sched_->spec_doorbell_table[cid].token
+                );
+            }
+        }
+    }
+    return staged;
+}
+
+// Early-dispatch drain (idle pass). Candidates are pushed to early_dispatch_queue
+// EVENT-DRIVEN by propagate_dispatch_fanin (a flagged producer's dispatch bumps its
+// consumers' dispatch_fanin; reaching fanin_count enqueues the consumer) — there is
+// no per-iteration PULL scan here anymore. This pass only DRAINS the queue.
+// Returns the number of blocks staged this pass (for the EarlyDispatch swimlane bar).
+int32_t SchedulerContext::try_speculative_early_dispatch(int32_t thread_idx) {
+    constexpr int PTO2_EARLY_DISPATCH_DRAIN_MAX = 8;  // bounded pops per pass
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    int32_t total_staged = 0;
+
+    // Drain the queue — mirrors the normal SPMD dispatch path. Pop a consumer,
+    // CLAIM a range sized to THIS thread's free cores by advancing next_block_idx with
+    // a CAS (atomic — next_block_idx is shared with normal dispatch, which also claims
+    // it if release routes the consumer to the ready queue, so a plain store could
+    // double-dispatch), RE-PUSH it for peers, THEN do the expensive prepare+publish.
+    // Re-pushing before staging lets peers claim the next range and stage CONCURRENTLY
+    // — a wide consumer (online_softmax, 48 blocks) is filled by all idle threads in
+    // parallel instead of a serial winner-then-peer daisy chain. Bounded pops/pass.
+    for (int n = 0; n < PTO2_EARLY_DISPATCH_DRAIN_MAX; n++) {
+        PTO2TaskSlotState *c = sched_->early_dispatch_queue.pop();
+        if (c == nullptr) break;  // pop returned nothing; end this pass
+        if (c->payload->spec_state.load(std::memory_order_acquire) != PTO2_SPEC_STAGING) continue;  // released
+        PTO2ResourceShape shape = c->active_mask.to_shape();
+        auto idle = tracker.get_idle_core_offset_states(shape);
+        auto pend = tracker.get_pending_core_offset_states(shape);
+        int32_t freecores = (idle.has_value() ? idle.count() : 0) + (pend.has_value() ? pend.count() : 0);
+        if (freecores == 0) {  // no free cores of this shape — give it back for peers and stop
+            sched_->early_dispatch_queue.push(c);  // NOTE(review): result unchecked, unlike the re-push below
+            break;
+        }
+        // CAS-claim a contiguous range [start, start+claim) sized to this thread's
+        // free cores; CAS keeps it atomic against peers AND normal dispatch.
+        int32_t start = 0, claim = 0;  // claim stays 0 when the consumer is already fully claimed
+        while (true) {
+            int16_t cur = c->next_block_idx.load(std::memory_order_relaxed);  // re-read on every attempt
+            if (cur >= c->logical_block_num) break;  // fully claimed
+            int32_t cnt = c->logical_block_num - cur;
+            if (cnt > freecores) cnt = freecores;  // clamp the claim to this thread's free cores
+            if (c->next_block_idx.compare_exchange_weak(
+                    cur, static_cast<int16_t>(cur + cnt), std::memory_order_seq_cst, std::memory_order_relaxed
+                )) {
+                start = cur;
+                claim = cnt;
+                break;
+            }
+        }
+        if (claim == 0) continue;  // nothing left to claim -> drop (no re-push)
+        // Re-push for concurrent peers BEFORE the expensive staging.
+        if (start + claim < c->logical_block_num) {
+            if (!sched_->early_dispatch_queue.push(c))  // push reported failure: only logged, staging still runs
+                LOG_INFO_V9(
+                    "[SPEC] queue full on re-push, consumer=%" PRId64, static_cast<int64_t>(c->task->task_id.raw)
+                );
+        }
+        total_staged += stage_consumer_blocks(thread_idx, c, shape, start, claim, idle, pend);
+    }
+    return total_staged;
+}
+
+// =============================================================================
+// Main scheduler dispatch loop
+// =============================================================================
+
+int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
+    always_assert(sched_ != nullptr);
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
+
+    PTO2SharedMemoryHeader *header = sched_->sm_header;
+    if (!header) {
+        LOG_ERROR("PTO2 dispatch: header is null");
+        return -1;
+    }
+    LOG_INFO_V0(
+        "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast<void *>(header),
+        static_cast<uint64_t>(header->ring.task_descriptors_offset),
+        static_cast<uint64_t>(header->ring.task_window_size)
+    );
+
+    Handshake *hank = static_cast<Handshake *>(runtime->workers);
+    LOG_INFO_V0(
+        "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast<void *>(hank),
+        static_cast<uint64_t>(header->ring.task_window_size)
+    );
+
+    LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num());
+    int32_t cur_thread_completed = 0;
+    // Non-zero once a scheduler-hang timeout latches; returned in place of the
+    // completed count so the caller still sees the negative error rc while the
+    // shared end-of-loop flush below runs.
+    int32_t timeout_rc = 0;
+    int32_t idle_iterations = 0;
+    int32_t last_progress_count = 0;
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    l2_swimlane.reset();
+    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
+#endif
+
+    constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
+    PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
+    PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
+    for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
+    }
+    PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
+    int32_t deferred_release_count = 0;
+
+    bool cores_released = false;
+
+    // PMU runs require single-issue dispatch — overlapping in-flight tasks
+    // pollute per-task PMU counters, so skip the PENDING pre-load phase.
+    // Cached at function scope: is_pmu_enabled() is extern "C" and the
+    // compiler cannot hoist it across the dispatch loop on its own.
+    const bool pmu_active = is_pmu_enabled();
+
+#if PTO2_PROFILING
+    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
+#endif
+
+#if PTO2_PROFILING
+    // Queue-depth snapshot carried across the iteration boundary: each phase
+    // emit consumes (phase_start_*) and refreshes them with its own end snapshot
+    // so the next phase's "at_start" equals the previous phase's "at_end".
+    //
+    // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX.
+    //
+    // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer)
+    // is a single int read on a register-cached stack — free. Shared depth
+    // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines
+    // that all peer sched threads also write to (enqueue_pos and dequeue_pos
+    // bounce on every flush_local_bufs + every pop). With both phases emitting
+    // per iter that's 12 cross-core loads × thousands of iters per run, a
+    // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared
+    // snapshot, refreshed at most once per iteration. The complete-emit and
+    // dispatch-emit in the same iter both reuse the same shared sample; the
+    // big transitions (local→shared flush) still show up across iter boundaries.
+    static_assert(
+        L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES,
+        "queue snapshot width must match runtime resource shape count"
+    );
+    int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
+    bool iter_shared_sampled = false;
+    auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
+        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+            local_out[s] = static_cast<int16_t>(local_bufs[s].count);
+        }
+    };
+    auto get_or_sample_shared = [&]() -> const int16_t * {
+        if (!iter_shared_sampled) {
+            // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE
+            // is in the low thousands today but could grow with platform
+            // scaling — without clamp, sizes above 32767 wrap to negatives
+            // and silently corrupt the snapshot.
+            constexpr size_t kMax = static_cast<size_t>(std::numeric_limits<int16_t>::max());
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                const size_t qsize = sched_->ready_queues[s].size();
+                iter_shared_snapshot[s] = static_cast<int16_t>(std::min(qsize, kMax));
+            }
+            iter_shared_sampled = true;
+        }
+        return iter_shared_snapshot;
+    };
+    auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES],
+                                 int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
+        capture_local_snapshot(local_out);
+        const int16_t *shared_cached = get_or_sample_shared();
+        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++)
+            shared_out[s] = shared_cached[s];
+    };
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        capture_phase_end(phase_start_local, phase_start_shared);
+    }
+#endif
+
+    // Wall-clock timestamp of the last completed task on this thread.
+    // Updated on made_progress; consulted to decide whether the wall-clock
+    // budget for declaring a scheduler hang has elapsed. Initialized to
+    // "now" so the first budget cycle starts when this thread does, not at
+    // an undefined value.
+    uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
+    while (true) {
+        if (completed_.load(std::memory_order_acquire)) {
+            break;
+        }
+        bool made_progress = false;
+#if PTO2_PROFILING
+        CYCLE_COUNT_START();
+        l2_swimlane.sched_loop_count++;
+        uint64_t _t0_phase = _t0;
+        // Release is the only "no Complete/Dispatch bar" attribution we keep —
+        // emitted with its own span in the idle branch below. Iterations that
+        // only scan/poll show as blank gaps; the per-loop Poll/Scan bars (PR
+        // #1079 debug overlay) were removed since "scheduler is polling when
+        // there's nothing to do" carries no actionable signal.
+        // Per-iter lazy shared-queue snapshot: first phase emit in this iter
+        // pays the atomic-load cost, subsequent emits in the same iter reuse
+        // the cached value. Reset here so we re-sample exactly once per iter
+        // (or skip entirely on iters with no phase emit).
+        iter_shared_sampled = false;
+#endif
+        int32_t task_count = 0;
+        if (!tracker.has_any_running_cores()) {
+            LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count);
+            if (action == LoopAction::BREAK_LOOP) break;
+        }
+
+        if (!cores_released && orch_to_sched_) {
+            LoopAction action = handle_core_transition(cores_released);
+            if (action == LoopAction::BREAK_LOOP) break;
+        }
+
+#if PTO2_PROFILING
+        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+#endif
+
+        // Phase 1: Check running cores for completion
+        int32_t completed_this_turn = 0;
+
+        bool try_completed = tracker.has_any_running_cores();
+        if (try_completed) {
+            check_running_cores_for_completion(
+                thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress,
+                deferred_release_slot_states, deferred_release_count, local_bufs
+            );
+        }
+        if (completed_this_turn > 0) {
+#if PTO2_SCHED_PROFILING
+            sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed);
+#endif
+            int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
+            int32_t new_total = prev + completed_this_turn;
+            last_progress_count = new_total;
+            if (thread_idx == 0 && task_count > 0) {
+                if (new_total <= PROGRESS_VERBOSE_THRESHOLD ||
+                    new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) {
+                    LOG_INFO_V9(
+                        "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count,
+                        100.0 * new_total / task_count
+                    );
+                }
+            }
+        }
+
+        if (rt_ != nullptr && rt_->aicore_mailbox != nullptr &&
+            (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) {
+            AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(
+                rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count,
+                PTO2_DEFERRED_RELEASE_CAP
+#if PTO2_SCHED_PROFILING
+                ,
+                thread_idx
+#endif
+            );
+            if (poll_result.error_code != PTO2_ERROR_NONE) {
+                int32_t expected = PTO2_ERROR_NONE;
+                header->sched_error_code.compare_exchange_strong(
+                    expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire
+                );
+                completed_.store(true, std::memory_order_release);
+                break;
+            }
+            if (poll_result.completed > 0) {
+#if PTO2_SCHED_PROFILING
+                sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed);
+#endif
+                int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
+                int32_t new_total = prev + poll_result.completed;
+                last_progress_count = new_total;
+                made_progress = true;
+            }
+        }
+
+#if PTO2_PROFILING
+        if (!try_completed) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+        } else {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
+            // Emit on any completion work this iteration — a finished slot OR
+            // sub-block retires that did not finish a slot. The latter makes the
+            // SPMD harvest tail visible (count field = blocks processed this
+            // iteration; on a pure-retire iteration phase_complete_count is 0).
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES &&
+                (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
+                // Local depth is cheap (this thread's own buffer counter).
+                // Shared depth is NOT sampled here: complete's release_fanin
+                // pushes to local_bufs in the fast path (try_push succeeds
+                // until cap=64). Shared only changes on dispatch's flush
+                // path. Carrying phase_start_shared forward as end_shared
+                // is the right answer 99% of the time AND skips three
+                // contended atomic loads per emit.
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_local_snapshot(phase_end_local);
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count, /*pop_hit=*/0,
+                    /*pop_miss=*/0, phase_start_shared, phase_start_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                    // phase_start_shared unchanged — carried forward
+                }
+                _t0_phase = _t1;
+                l2_swimlane.phase_complete_count = 0;
+                l2_swimlane.phase_subretire_count = 0;
+            }
+        }
+#endif
+
+        bool try_pushed = false;
+
+        // Phase 2 drain check
+        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            handle_drain_mode(thread_idx);
+            continue;
+        }
+
+        // Phase 3: Drain wiring queue (thread 0 only)
+        int wired = 0;
+        if (thread_idx == 0) {
+            wired = sched_->drain_wiring_queue(orchestrator_done_);
+            if (wired > 0) {
+                made_progress = true;
+#if PTO2_SCHED_PROFILING
+                l2_swimlane.phase_wiring_count += wired;
+#endif
+            }
+        }
+#if PTO2_PROFILING
+        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
+        // Wire outer phase: emit one bar covering this iter's drain_wiring_queue
+        // pass when it wired any tasks. tasks_processed = wired count. Resolve
+        // does NOT nest under Wire — wiring only enqueues, the consumer release
+        // happens later in Complete/Dummy.
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && wired > 0) {
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_local_snapshot(phase_end_local);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Wire, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                static_cast<uint32_t>(wired), /*pop_hit=*/0, /*pop_miss=*/0, phase_start_shared, phase_start_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+            }
+            _t0_phase = _t1;
+        }
+#endif
+
+        // Phase 3b: Drain dummy ready queue (thread 0 only).
+        //
+        // Dependency-only tasks bypass AICore dispatch: they go through the
+        // scheduler so fanin/fanout edges stay consistent, but completion is
+        // signalled inline here. Pinned to thread 0 to avoid cross-thread
+        // races and to keep cache hot near the wiring drain above.
+        if (thread_idx == 0) {
+            constexpr int DUMMY_DRAIN_BATCH = 16;
+            PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
+            int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+#if PTO2_PROFILING
+            // Dummy outer phase: covers handling of all dummies popped this
+            // iter. Per-dummy DummyTask markers are emitted to a SEPARATE lane
+            // (Worker View AICPU_N) by the converter, so they do not nest
+            // under this bar. Resolve emits below DO land on the sched lane
+            // and nest under this Dummy outer by time containment.
+            uint64_t dummy_outer_t0 =
+                (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+            for (int di = 0; di < dummy_got; di++) {
+                PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
+
+                // ----- DummyTask phase: dummy "task" identity marker. --------
+                // The dummy has no AICore presence — start ≈ end (1 cycle
+                // wide, just "we identified it"). Converter renders this on
+                // Worker View's DUMMY_T{thread} lane so the DAG node is
+                // visually present. tasks_processed = task_token low 32 bits
+                // (= local_id within ring) so deps.json flow arrows can land.
+                // The Resolve work that follows is emitted separately below.
+#if PTO2_PROFILING
+                if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+                    uint64_t dummy_marker_t = get_sys_cnt_aicpu();
+                    uint32_t dummy_id_low32 = static_cast<uint32_t>(dummy_slot.task->task_id.raw & 0xFFFFFFFFu);
+                    l2_swimlane_aicpu_record_sched_phase(
+                        thread_idx, L2SwimlaneSchedPhaseKind::DummyTask, dummy_marker_t, dummy_marker_t,
+                        sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_id_low32
+                    );
+                }
+#endif
+
+                // ----- Resolve work: walk this dummy's consumer list. ------
+                // Same 1 µs filter as the main-path Resolve emit suppresses
+                // dummies whose consumer release runs sub-microsecond.
+#if PTO2_PROFILING
+                uint64_t dummy_resolve_t0 =
+                    (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+                // [[maybe_unused]] silences -Werror=unused-but-set-variable on
+                // the profiling-flags-smoke build path where PTO2_PROFILING is
+                // OFF and the Resolve emit below is excluded.
+                [[maybe_unused]] uint32_t dummy_consumers = 0;
+#if PTO2_SCHED_PROFILING
+                dummy_consumers = sched_->on_task_complete(dummy_slot, thread_idx, local_bufs).fanout_edges;
+#else
+                dummy_consumers = sched_->on_task_complete(dummy_slot, local_bufs);
+#endif
+#if PTO2_PROFILING
+                if (dummy_resolve_t0 != 0) {
+                    uint64_t dummy_resolve_t1 = get_sys_cnt_aicpu();
+                    constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
+                    if (dummy_resolve_t1 - dummy_resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
+                        l2_swimlane_aicpu_record_sched_phase(
+                            thread_idx, L2SwimlaneSchedPhaseKind::Resolve, dummy_resolve_t0, dummy_resolve_t1,
+                            sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_consumers
+                        );
+                    }
+                }
+#endif
+                // Dummy tasks have no subtasks to retire and no fanout pre-conditions
+                // beyond their own producers; release the self-reference through the
+                // deferred on_task_release batch (flushed just below once it is full).
+                deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
+                if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
+                    while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                        (void)sched_->on_task_release(
+                            *deferred_release_slot_states[--deferred_release_count], thread_idx
+                        );
+#else
+                        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                    }
+                }
+                int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
+                last_progress_count = prev + 1;
+                cur_thread_completed++;
+            }
+            if (dummy_got > 0) {
+                made_progress = true;
+            }
+#if PTO2_PROFILING
+            // Emit Dummy outer over the whole dummy_drain pass. Span starts at
+            // dummy_outer_t0 (sampled right after the pop_batch, only when it
+            // returned work) and ends at "now". tasks_processed = dummy_got.
+            // Advancing _t0_phase here makes the following Dispatch bar start at
+            // this end; EarlyDispatch and the second Complete sample their own t0.
+            if (dummy_outer_t0 != 0) {
+                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+                capture_local_snapshot(phase_end_local);
+                uint64_t dummy_outer_t1 = get_sys_cnt_aicpu();
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Dummy, dummy_outer_t0, dummy_outer_t1,
+                    l2_swimlane.sched_loop_count, static_cast<uint32_t>(dummy_got), /*pop_hit=*/0,
+                    /*pop_miss=*/0, phase_start_shared, phase_start_shared
+                );
+                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                    phase_start_local[s] = phase_end_local[s];
+                }
+                _t0_phase = dummy_outer_t1;
+                // We do NOT re-sync _t0/_t1 — the dummy span will be absorbed
+                // into the next CYCLE_COUNT_LAP accumulator. The phase-model
+                // anchor (_t0_phase) is the authoritative source for bar spans
+                // on the swimlane; the cycle accumulators are coarse aggregates.
+            }
+#endif
+        }
+
+        // Phase 4: MIX-strict-priority dispatch with phase-split and
+        // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+#if PTO2_PROFILING
+        uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+        dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+#if PTO2_PROFILING
+        // Emit Dispatch IMMEDIATELY after dispatch_ready_tasks so its span
+        // covers the actual publish work — not the trailing second-poll /
+        // early-dispatch time. (Pre-redesign the Dispatch emit lived at iter
+        // end with span extending past the second poll, which made finish_time
+        // events from the second poll fall under the Dispatch bar rather than
+        // a Complete bar of their own — confusing for trace consumers.)
+        if (dispatch_t0 != 0 && try_pushed && l2_swimlane.phase_dispatch_count > 0) {
+            uint64_t dispatch_t1 = get_sys_cnt_aicpu();
+            uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+            uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+            debug_assert(pop_hit_delta < (1ULL << 32));
+            debug_assert(pop_miss_delta < (1ULL << 32));
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_phase_end(phase_end_local, phase_end_shared);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, dispatch_t1, l2_swimlane.sched_loop_count,
+                l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
+                static_cast<uint32_t>(pop_miss_delta), phase_start_shared, phase_end_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+                phase_start_shared[s] = phase_end_shared[s];
+            }
+            _t0_phase = dispatch_t1;
+            l2_swimlane.phase_dispatch_count = 0;
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
+        }
+#endif
+
+        // Phase 4b: early-dispatch onto spare cores, but ONLY when this thread is
+        // otherwise idle — nothing was dispatched this iteration AND no ready work is
+        // queued for any shape. Early-dispatch competes with normal dispatch for
+        // pending slots, so gating on "no ready work" keeps it from delaying a real
+        // ready task; skipping the producer-fanout scan when busy also removes its
+        // per-iteration cost (the discovery walk only runs on genuinely idle passes).
+        bool any_ready_work = try_pushed;
+        for (int s = 0; !any_ready_work && s < PTO2_NUM_RESOURCE_SHAPES; s++) {
+            if (sched_->ready_queues[s].size() > 0 || local_bufs[s].count > 0) any_ready_work = true;
+        }
+#if PTO2_PROFILING
+        bool early_dispatch_record = l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES;
+        uint64_t early_dispatch_t0 = early_dispatch_record ? get_sys_cnt_aicpu() : 0;
+#endif
+        // Skip speculative early-dispatch under PMU: dispatch_ready_tasks already
+        // withholds PENDING dispatch when pmu_active to preserve single-issue PMU
+        // windows, and staging gated work into idle/pending slots would perturb the
+        // same windows.
+        [[maybe_unused]] int32_t staged_count =
+            (pmu_active || any_ready_work) ? 0 : try_speculative_early_dispatch(thread_idx);
+#if PTO2_PROFILING
+        // Emit an EarlyDispatch bar so a staging-dominated iteration is attributed
+        // to early-dispatch rather than disappearing into a blank gap.
+        if (early_dispatch_record && staged_count > 0) {
+            uint64_t early_dispatch_t1 = get_sys_cnt_aicpu();
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::EarlyDispatch, early_dispatch_t0, early_dispatch_t1,
+                sched_l2_swimlane_[thread_idx].sched_loop_count, static_cast<uint32_t>(staged_count)
+            );
+            // prepare_block_for_dispatch bumped phase_dispatch_count while staging;
+            // those blocks belong to this EarlyDispatch bar, so clear the counter
+            // before it leaks into the next Dispatch bar.
+            sched_l2_swimlane_[thread_idx].phase_dispatch_count = 0;
+            // Advance _t0_phase so the following second-poll's Complete bar
+            // starts at the EarlyDispatch end, not before it (otherwise their
+            // spans overlap and the outer-phase mutual-exclusion breaks).
+            _t0_phase = early_dispatch_t1;
+        }
+#endif
+
+        // Second completion poll. dispatch_ready_tasks + try_speculative_early_dispatch
+        // above can take several us in a busy window; a producer block that FINs
+        // during them would otherwise wait for the NEXT iteration's top-of-loop
+        // Phase-1 poll (the ~7us detection latency that delays a flagged
+        // producer's doorbell). Re-polling here observes those FINs immediately,
+        // so the doorbell fires this iteration. Idempotent (the poll is a poll);
+        // we drain deferred releases eagerly to keep the buffer from growing.
+#if PTO2_PROFILING
+        uint64_t complete2_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
+#endif
+        if (tracker.has_any_running_cores()) {
+            int32_t completed_2nd = 0;
+            check_running_cores_for_completion(
+                thread_idx, hank, completed_2nd, cur_thread_completed, made_progress, deferred_release_slot_states,
+                deferred_release_count, local_bufs
+            );
+            if (completed_2nd > 0) {
+#if PTO2_SCHED_PROFILING
+                sched_->tasks_completed.fetch_add(completed_2nd, std::memory_order_relaxed);
+#endif
+                completed_tasks_.fetch_add(completed_2nd, std::memory_order_relaxed);
+                last_progress_count = completed_tasks_.load(std::memory_order_relaxed);
+            }
+            // Eager drain so the second poll can't push deferred_release toward
+            // its cap between idle iterations.
+            while (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP - 96) {
+#if PTO2_SCHED_PROFILING
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+        }
+#if PTO2_PROFILING
+        // Complete2 outer phase: covers second-poll FIN observation. Without
+        // this emit, FIN counts from the second poll would carry over into the
+        // next iter's first-Complete bar and be displayed with a span that
+        // doesn't actually include those FINs' timestamps (visible mismatch
+        // between Complete bar span and per-task finish_time in Worker /
+        // Scheduler View).
+        if (complete2_t0 != 0 && (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
+            uint64_t complete2_t1 = get_sys_cnt_aicpu();
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_local_snapshot(phase_end_local);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Complete, complete2_t0, complete2_t1,
+                l2_swimlane.sched_loop_count, l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count,
+                /*pop_hit=*/0,
+                /*pop_miss=*/0, phase_start_shared, phase_start_shared
+            );
+            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
+                phase_start_local[s] = phase_end_local[s];
+            }
+            _t0_phase = complete2_t1;
+            l2_swimlane.phase_complete_count = 0;
+            l2_swimlane.phase_subretire_count = 0;
+        }
+
+        // Cycle-counter LAP for the iter tail. Dispatch's emit moved earlier
+        // (see Phase 4 above) so this branch only routes the time accumulator.
+        if (!try_pushed) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+        } else {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
+        }
+#endif
+
+#if !PTO2_PROFILING
+        (void)try_completed;
+        (void)try_pushed;
+#endif
+
+        if (made_progress) {
+            idle_iterations = 0;
+            last_progress_ts = get_sys_cnt_aicpu();
+        } else {
+#if PTO2_PROFILING
+            uint64_t rel_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && deferred_release_count > 0) ?
+                                  get_sys_cnt_aicpu() :
+                                  0;
+#endif
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+#if PTO2_PROFILING
+            // Release is a distinct operation from the poll scan — emit it with
+            // its own span. The per-loop Poll/Scan bars were removed, so there
+            // is no per-iteration poll label left for it to nest under or
+            // compete with.
+            if (rel_t0 != 0) {
+                l2_swimlane_aicpu_record_sched_phase(
+                    thread_idx, L2SwimlaneSchedPhaseKind::Release, rel_t0, get_sys_cnt_aicpu(),
+                    l2_swimlane.sched_loop_count, /*tasks_processed=*/0
+                );
+            }
+#endif
+            idle_iterations++;
+
+            if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) {
+                LoopAction action = check_idle_fatal_error(thread_idx, header, runtime);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            if (idle_iterations % STALL_LOG_INTERVAL == 0) {
+                log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
+            }
+            // Wall-clock budget gate, with two fatal-latch branches:
+            //
+            // 1. Self owns a RUNNING task — first-hand evidence the
+            //    dispatch is stuck. Latch.
+            // 2. No thread anywhere owns a RUNNING task AND tasks remain
+            //    unfinished — the system is in a pre-dispatch / WAIT-only
+            //    deadlock (e.g. dependency cycle). Ownerless idle threads
+            //    are the only observers; let this one latch on the global
+            //    evidence (`completed_tasks_ < total_tasks_` and
+            //    `no_thread_owns_running_task()`).
+            //
+            // Otherwise: a sibling thread owns a RUNNING task but hasn't
+            // hit its own budget yet (typical distributed startup-skew
+            // case) — refresh last_progress_ts and keep spinning. The
+            // STALL diagnostic above still fires periodically so
+            // observability is preserved.
+            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
+                bool self_owns = self_owns_running_task(thread_idx);
+                bool global_stuck = !self_owns && total_tasks_ > 0 &&
+                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
+                                    no_thread_owns_running_task();
+                if (self_owns || global_stuck) {
+                    // Latch the error + emergency_shutdown, then break to the
+                    // shared end-of-loop cleanup so the diagnostic buffers get
+                    // flushed to the host. An early return here would strand the
+                    // stuck task's already-dumped inputs and every completed
+                    // task's in/out records in the unflushed per-thread dump
+                    // buffer — exactly the state we need to triage the hang.
+                    timeout_rc = handle_timeout_exit(
+                        thread_idx, header, runtime, idle_iterations, last_progress_count
+#if PTO2_PROFILING
+                        ,
+                        l2_swimlane.sched_start_ts
+#endif
+                    );
+                    break;
+                }
+                last_progress_ts = get_sys_cnt_aicpu();
+            }
+            SPIN_WAIT_HINT();
+#if PTO2_PROFILING
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+            // _t0_phase advances through idle laps so the next emitted
+            // COMPLETE/DISPATCH bar starts at the iter it actually ran in, not
+            // at the start of the preceding idle stretch. The idle/poll time
+            // itself is not attributed: poll-only iterations show as blank gaps.
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+                _t0_phase = _t1;
+            }
+#endif
+        }
+    }
+
+    // Drain any entries left in the deferred-release batch. The in-loop flush
+    // only fires on idle iterations and on buffer-full; a loop exit while the
+    // last iteration made progress can leave entries un-released. Release them
+    // here so every consumed producer slot completes its on_task_release
+    // regardless of which loop-exit path fired.
+    while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+    }
+
+#if PTO2_PROFILING
+    // Final-drain: emit any pop_hit / pop_miss accrued since the last
+    // dispatch emit (typically the trailing idle loops while waiting for
+    // orchestrator_done_) as a zero-duration synthetic dispatch record so
+    // sum(record.pop_*) reconciles with the run-cumulative counter.
+    // Gate on SCHED_PHASES — at lower levels the phase buffer is never
+    // flushed (see below), so writing this record would be wasted work.
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+        debug_assert(final_pop_hit_delta < (1ULL << 32));
+        debug_assert(final_pop_miss_delta < (1ULL << 32));
+        if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
+            uint64_t t_now = get_sys_cnt_aicpu();
+            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
+            capture_phase_end(phase_end_local, phase_end_shared);
+            l2_swimlane_aicpu_record_sched_phase(
+                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0,
+                static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta),
+                phase_end_shared, phase_end_shared
+            );
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
+        }
+    }
+    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
+#endif
+
+#if PTO2_PROFILING
+    if (l2_swimlane.l2_swimlane_enabled) {
+        l2_swimlane_aicpu_flush(
+            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
+        );
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
+        }
+    }
+#endif
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_flush(thread_idx);
+    }
+#endif
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) {
+        pmu_aicpu_flush_buffers(
+            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
+        );
+    }
+#endif
+
+    return timeout_rc != 0 ? timeout_rc : cur_thread_completed;
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_types.h
new file mode 100644
index 000000000..8e777f32c
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/scheduler/scheduler_types.h
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_TYPES_H
+#define SCHEDULER_TYPES_H
+
+#include <atomic>
+#include <cstdint>
+
+#include "common/core_type.h"
+#include "common/platform_config.h"
+#include "pto_runtime2_types.h"
+#include "spin_hint.h"
+
+// host_build_graph host-orch build: PTO2Runtime embeds PTO2SchedulerState by
+// value, so this header is compiled into the host libhost_runtime.so. The AICPU
+// spin_hint.h that defines PLATFORM_SCHEDULER_TIMEOUT_MS is not on the host
+// include path; supply it here. The value only sizes an on-device scheduler
+// timeout and is never consumed host-side (the scheduler does not run on the
+// host). host_runtime_EXPORTS is CMake's auto-define for the host shared-lib
+// target, so the AICPU/AICore builds keep the real platform constant.
+#ifdef host_runtime_EXPORTS  // host shared-lib build only (rationale in the comment block above)
+constexpr int32_t PLATFORM_SCHEDULER_TIMEOUT_MS = 2000;  // host-side stand-in so this header compiles there
+#endif  // host_runtime_EXPORTS
+
+// =============================================================================
+// Profiling macros (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+#include "aicpu/device_time.h"
+// Per-sub-step accumulation of get_sys_cnt_aicpu() deltas. NOTE(review): raw counter deltas, not obviously ns - confirm.
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1  // declares _t0/_t1 in the caller's scope
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+#else  // profiling disabled: both macros expand to nothing
+#define CYCLE_COUNT_START()
+#define CYCLE_COUNT_LAP(acc)
+#endif  // PTO2_PROFILING
+
+// =============================================================================
+// Scheduler constants
+// =============================================================================
+
+constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;  // scheduler-local alias of the platform limit
+
+// Cadence, counted in idle iterations, at which a thread emits its STALL
+// diagnostic while it makes no progress. Purely an observability knob,
+// independent of the wall-clock timeout below: small enough to fire a few times
+// before the budget expires, large enough not to flood device_log.
+constexpr int32_t STALL_LOG_INTERVAL = 480000;
+constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // check orchestrator error once every N idle iterations
+
+// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
+// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS
+// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread
+// diagnostic cadence.
+//
+// Using wall-clock here is load-bearing for distributed runs: with per-thread
+// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
+// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
+// same iteration count. The fast spinner racing ahead and latching fatal
+// kills the slower-but-correct poller mid-poll — see the distributed
+// startup-skew scenario in issue #897.
+//
+// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h)
+// because the safe value differs per variant: onboard trims it to 2 s so the
+// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight
+// partial output) before STARS reaps the op and poisons the context (chain:
+// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to
+// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant
+// rationale.
+constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =  // the same budget in system-counter counts (freq / 1000 per ms)
+    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+constexpr int32_t STALL_DUMP_READY_MAX = 8;  // presumably caps ready entries listed per STALL dump - confirm at use
+constexpr int32_t STALL_DUMP_WAIT_MAX = 4;   // presumably caps waiting entries listed per STALL dump - confirm at use
+constexpr int32_t STALL_DUMP_CORE_MAX = 8;   // presumably caps cores listed per STALL dump - confirm at use
+constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
+constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+
+// =============================================================================
+// Control flow signal from cold-path helpers back to the main dispatch loop.
+// =============================================================================
+
+enum class LoopAction : int8_t {
+    NONE = 0,    // the cold-path helper had nothing to report; the dispatch loop carries on
+    BREAK_LOOP,  // the cold-path helper asks the caller to 'break' out of its while(true) loop
+};
+
+// =============================================================================
+// Per-core state: one cache line per core to eliminate false sharing
+// and co-locate all hot-path fields for minimal cache misses.
+// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup).
+// =============================================================================
+
+struct alignas(64) CoreExecState {
+    // --- Hot fields (completion + dispatch, every iteration); offsets below are pinned by the static_assert ---
+    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
+    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
+    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
+    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
+    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
+    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
+    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
+    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
+    uint8_t pad0_[2];                       // offset 38: alignment padding
+    // Precomputed COND register pointer, resolved once in handshake, so the
+    // hot completion poll is a single volatile load rather than a recomputation
+    // of reg_base + reg_offset(COND) on every iteration.
+    volatile uint32_t *cond_ptr;  // offset 40: precomputed pointer to COND register
+#if PTO2_PROFILING
+    // --- Profiling fields (dispatch path); they take offsets 48-63 in place of the cold fields below ---
+    uint64_t running_dispatch_timestamp;  // offset 48: AICPU dispatch timestamp for running task
+    uint64_t pending_dispatch_timestamp;  // offset 56: AICPU dispatch timestamp for pending task
+#else
+    // --- Cold fields (init/diagnostics only). NOTE(review): these members do not exist when PTO2_PROFILING is on ---
+    int32_t worker_id;          // offset 48: index in runtime.workers[]
+    uint32_t physical_core_id;  // offset 52: hardware physical core ID
+    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
+    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
+#endif
+};
+static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
+
+// =============================================================================
+// CoreTracker: cluster-based bitmask tracker for idle/running core state.
+//
+// core_states_ encodes per-cluster core idle/running in 3 bits per cluster:
+//   bit i*3   = AIC of cluster i   (1 = idle, 0 = running)
+//   bit i*3+1 = AIV0 of cluster i
+//   bit i*3+2 = AIV1 of cluster i
+// Max 21 clusters per tracker (63 bits in uint64_t).
+// =============================================================================
+
+class alignas(64) CoreTracker {
+public:
+    static inline int32_t MAX_CORE_PER_THREAD = 63;  // NOTE(review): mutable static; could be constexpr if nothing assigns it
+    static constexpr int32_t MAX_CLUSTERS = 63 / 3;  // three state bits (AIC, AIV0, AIV1) per cluster
+
+public:
+    CoreTracker() = default;  // masks, count and id map start zeroed; call init() before dispatching
+
+    class BitStates {
+    public:
+        BitStates() = default;
+
+        explicit BitStates(uint64_t states) :
+            states_(states) {}
+        void init() { states_ = 0; }
+
+        BitStates operator~() const { return BitStates(~states_); }
+        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
+        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
+        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
+        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
+        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
+        void operator&=(const BitStates &other) { states_ &= other.states_; }
+        void operator|=(const BitStates &other) { states_ |= other.states_; }
+        void operator^=(const BitStates &other) { states_ ^= other.states_; }
+
+        bool has_value() const { return states_ != 0; }
+        int32_t count() const { return __builtin_popcountll(states_); }
+        void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); }
+
+        // Extract the lowest set bit from mask, clear it, and return its position.
+        // Returns -1 if mask is empty.
+        int32_t pop_first() {
+            if (states_ == 0) return -1;
+            int32_t pos = __builtin_ctzll(states_);
+            states_ &= states_ - 1;
+            return pos;
+        }
+
+    private:
+        uint64_t states_{0};
+    };
+
+public:
+    void init(int32_t cluster_count) {  // precondition: 0 <= cluster_count <= MAX_CLUSTERS (shifts below use i * 3)
+        cluster_count_ = cluster_count;
+        aic_mask_.init();
+        aiv_mask_.init();
+        pending_occupied_.init();
+        for (int32_t i = 0; i < cluster_count; i++) {
+            aic_mask_ |= BitStates(1ULL << (i * 3));
+            aiv_mask_ |= BitStates(6ULL << (i * 3));
+        }
+        core_states_ = aic_mask_ | aiv_mask_;  // every tracked core starts idle (bit = 1)
+    }
+
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
+        core_id_map_[cluster_idx * 3] = aic_wid;
+        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
+        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
+    }
+
+    int32_t get_cluster_count() const { return cluster_count_; }
+
+    // --- Running core queries ---
+
+    template <CoreType CT>
+    bool has_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).has_value();
+        } else {
+            return ((~core_states_) & aiv_mask_).has_value();
+        }
+    }
+
+    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
+
+    template <CoreType CT>
+    int32_t get_running_count() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).count();
+        } else {
+            return ((~core_states_) & aiv_mask_).count();
+        }
+    }
+
+    // Return an opaque bitmask for iterating running cores of a given type.
+    // Use pop_first() to extract core bit offsets one at a time.
+    template <CoreType CT>
+    BitStates get_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return (~core_states_) & aic_mask_;
+        } else {
+            return (~core_states_) & aiv_mask_;
+        }
+    }
+
+    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
+    BitStates get_cluster_offset_states() const { return aic_mask_; }
+
+    // --- Cluster matching ---
+
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {
+        switch (shape) {
+        case PTO2ResourceShape::AIC:
+            return core_states_ & aic_mask_;
+        case PTO2ResourceShape::AIV:
+            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
+        case PTO2ResourceShape::MIX:
+            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
+        case PTO2ResourceShape::DUMMY:
+            // DUMMY tasks never reach the core-tracker dispatch path; they are
+            // completed inline by resolve_and_dispatch via dummy_ready_queue.
+            return BitStates(0ULL);
+        }
+        return BitStates(0ULL);
+    }
+
+    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }  // offset = AIC bit
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
+
+    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
+
+    bool is_aic_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv0_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv1_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
+    }
+
+    // --- State mutation ---
+
+    // Toggle bit at the given bit offset (running <-> idle)
+    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
+
+    // --- Pending-occupied tracking ---
+    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
+    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
+
+    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
+    void clear_pending_occupied(int32_t bit_offset) {
+        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));  // clears the bit only if it is set
+    }
+
+    // --- Two-phase dispatch queries ---
+
+    // Idle dispatch: returns bit offsets of idle cores for the given shape.
+    // For AIC: 1 bit per cluster (core offset == cluster offset).
+    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
+    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
+    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
+    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
+    // would incorrectly block AIV idle dispatch on the same cluster.
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::AIC) {
+            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        }
+        if (shape == PTO2ResourceShape::AIV) {
+            return core_states_ & aiv_mask_;
+        }
+        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
+    }
+
+    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
+    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
+    // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask.
+    enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT };
+
+    // A MIX block must place all cores named by active_mask the same way:
+    // all idle means running placement, all running means pending placement,
+    // and any mixed state is retried later.
+    MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const {
+        BitStates used(0ULL);
+        if (core_mask & PTO2_SUBTASK_MASK_AIC) {
+            used |= BitStates(1ULL << cluster_offset);
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV0) {
+            used |= BitStates(1ULL << (cluster_offset + 1));
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV1) {
+            used |= BitStates(1ULL << (cluster_offset + 2));
+        }
+        if (!used.has_value() || (pending_occupied_ & used).has_value()) {
+            return MixPlacement::REJECT;
+        }
+
+        BitStates idle = core_states_ & used;
+        if (idle.count() == used.count()) {
+            return MixPlacement::RUNNING;
+        }
+        if (!idle.has_value()) {
+            return MixPlacement::PENDING;
+        }
+        return MixPlacement::REJECT;
+    }
+
+    BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const {
+        BitStates result(0ULL);
+        BitStates candidates = get_cluster_offset_states();
+        while (candidates.has_value()) {
+            int32_t cluster_offset = candidates.pop_first();
+            if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) {
+                result |= BitStates(1ULL << cluster_offset);
+            }
+        }
+        return result;
+    }
+
+    int32_t count_mix_running_clusters(uint8_t core_mask) const {
+        return get_mix_running_cluster_offset_states(core_mask).count();
+    }
+
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::MIX) {
+            // Shape-level query kept conservative for legacy callers/tests.
+            // The real MIX dispatch path applies active_mask in classify_mix_cluster().
+            // Any core without a pending payload can accept a dispatch (idle or running).
+            BitStates available = ~pending_occupied_;
+            BitStates mix_available =
+                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Pending MIX can only reuse a fully-running cluster. Partially-running clusters
+            // could split one MIX block across immediate and pending placement.
+            BitStates running = ~core_states_;
+            BitStates cluster_all_running =
+                (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_);
+            return mix_available & cluster_all_running;
+        }
+        if (shape == PTO2ResourceShape::AIC) {
+            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+        }
+        // AIV
+        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
+    }
+
+    // --- Two-phase dispatch unified query ---
+
+    enum class DispatchPhase : uint8_t { IDLE, PENDING };
+
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
+                                                get_pending_core_offset_states(shape);
+    }
+
+    // --- Bit offset <-> worker_id mapping ---
+
+    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
+
+    const int32_t *core_ids() const { return core_id_map_; }
+    int32_t core_num() const { return cluster_count_ * 3; }
+
+private:
+    int32_t cluster_count_{0};  // zero until init(), so get_cluster_count()/core_num() never read garbage
+    BitStates aic_mask_;
+    BitStates aiv_mask_;
+    BitStates core_states_;
+    BitStates pending_occupied_;
+    int32_t core_id_map_[MAX_CLUSTERS * 3]{};  // bit_position -> worker_id, max 21 clusters * 3; zeroed until set_cluster()
+};
+
+// =============================================================================
+// SlotTransition: pure event signals from a single register poll.
+// true = event occurred, false = no-op (maintain current state).
+// =============================================================================
+
+struct SlotTransition {
+    bool running_done{false};   // the running task finished
+    bool pending_done{false};   // the pending task finished
+    bool running_freed{false};  // the running slot's data should be released
+    bool pending_freed{false};  // pending_occupied may be cleared
+    bool matched{false};        // at least one case was hit (if not, the caller skips apply)
+};
+
+// =============================================================================
+// Profiling counters (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+struct alignas(64) SchedL2SwimlaneCounters {
+    bool l2_swimlane_enabled = false;
+    uint64_t sched_start_ts = 0;
+    uint64_t sched_complete_cycle = 0;
+    uint64_t sched_dispatch_cycle = 0;
+    uint64_t sched_wiring_cycle = 0;
+    uint64_t sched_idle_cycle = 0;
+    uint64_t sched_loop_count = 0;
+    uint32_t phase_complete_count = 0;
+    // Sub-block retires that did NOT finish a slot (SPMD blocks of a multi-block
+    // task retiring one at a time). They are counted on their own so the
+    // Complete-phase emit can fire on poll iterations that only retired sub-blocks;
+    // without this the serial-harvest tail of an SPMD slot is invisible (no slot
+    // completes until the last block, so the scheduler lane stays blank meanwhile).
+    uint32_t phase_subretire_count = 0;
+    uint32_t phase_dispatch_count = 0;
+    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
+    // l2_swimlane_level_ >= SCHED_PHASES.
+    uint64_t pop_hit = 0;
+    uint64_t pop_miss = 0;
+    uint64_t pop_hit_at_last_emit = 0;
+    uint64_t pop_miss_at_last_emit = 0;
+#if PTO2_SCHED_PROFILING
+    uint32_t phase_wiring_count = 0;
+    uint64_t complete_probe_count = 0;
+    uint64_t complete_hit_count = 0;
+    uint64_t sched_complete_perf_cycle = 0;
+    uint64_t sched_dispatch_pop_cycle = 0;
+    uint64_t sched_dispatch_setup_cycle = 0;
+#endif
+    void reset() { *this = SchedL2SwimlaneCounters(); }  // back to the member defaults above
+};
+#endif
+
+// =============================================================================
+// sync_start drain coordination
+// =============================================================================
+
+// When sync_start_pending != 0, all scheduler threads skip dispatch
+// (only process completions) until the drain worker finishes launching all blocks.
+struct alignas(64) SyncStartDrainState {
+    std::atomic<int32_t> sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic<int32_t> drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic<uint32_t> drain_ack_mask{0};       // bit per thread; all-set = all threads reached ack barrier
+    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
+    int32_t _pad[10];  // explicit filler; total size is pinned by the static_assert below
+};
+static_assert(sizeof(SyncStartDrainState) == 64);  // layout guard: the struct must be exactly 64 bytes
+
+#endif  // SCHEDULER_TYPES_H
diff --git a/src/a2a3/runtime/host_build_graph/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/host_build_graph/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..87a6757cf
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+static bool sum_ring_heap_sizes(const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], uint64_t *total) {
+    uint64_t running = 0;  // partial sum over the rings visited so far
+    for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ++ring) {
+        if (std::numeric_limits<uint64_t>::max() - running < heap_sizes[ring]) {  // the next add would wrap
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return false;  // *total is left untouched on failure
+        }
+        running += heap_sizes[ring];
+    }
+    *total = running;
+    return true;
+}
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Reserve room for capacity slots at PTO2_ALIGN_SIZE alignment (a full cache line per the original
+    // note) so MPMC CAS traffic on the first slot cannot false-share with the region in front of it.
+    const auto slots_bytes = capacity * sizeof(PTO2ReadyQueueSlot);
+    return arena.reserve(slots_bytes, PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    if (capacity == 0 || (capacity & (capacity - 1)) != 0) {  // mask = capacity - 1 needs a power of two
+        LOG_ERROR("Ready queue capacity must be a non-zero power of two");
+        return false;
+    }
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));  // local view; queue->slots is wired later
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+    for (uint64_t i = 0; i < capacity; i++) {  // slot i starts with sequence i and no task attached
+        slots_arena[i].sequence.store(static_cast<int64_t>(i), std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));  // resolve slots_off via the arena
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // Nothing is freed here: per the original note the arena owns slots[], so only the pointer is cleared.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base) {
+    // Derive the SM ring header address from sm_dev_base (offset arithmetic per the original
+    // note, no SM load) and reset the per-ring bookkeeping. This function always returns true.
+    ring = pto2_sm_layout::ring_header_addr(sm_dev_base);
+    last_task_alive = 0;
+    advance_lock.store(0, std::memory_order_relaxed);
+#if PTO2_PROFILING
+    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);  // profiling snapshots start at 1, not 0
+    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
+#endif
+
+    // Per-slot SM-side initialization is not done here; per the original note it lives in
+    // PTO2SharedMemoryHandle::init_header_per_ring (AICPU, during SM reset). NOTE(review): that note
+    // also cited reset_for_reuse, which the commit message says was removed - confirm it is current.
+
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }  // clears the pointer only; nothing is freed
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    // Reservations are issued in the original order (per-shape ready queues, dummy queue, early-dispatch
+    // queue, dep pool, wiring SPSC buffer); presumably the arena offsets depend on it - confirm in DeviceArena.
+    PTO2SchedulerLayout result{};
+    result.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    result.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    result.dep_pool_capacity = dep_pool_capacity;
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        result.off_ready_queue_slots[shape] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    result.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    result.off_early_dispatch_queue_slots = ready_queue_reserve_layout(arena, PTO2_EARLY_DISPATCH_QUEUE_SIZE);
+    // The dep pool gets a cache-line base of its own: per the original note scheduler thread 0 is its
+    // sole writer, and its writes must not invalidate adjacent multi-threaded regions (ready_queue.slots).
+    const size_t dep_pool_bytes = static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    result.off_dep_pool_entries = arena.reserve(dep_pool_bytes, PTO2_ALIGN_SIZE);
+    result.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return result;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    // Initializes scheduler data in the original order: SM header address, ring state, ready queues,
+    // dep pool, wiring queue. Returns false as soon as one of the sub-initializers reports failure.
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    tasks_completed.store(0, std::memory_order_relaxed);
+    tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    if (!ring_sched_state.init_data_from_layout(sm_dev_base)) return false;
+
+    // One ready queue per resource shape, then the dummy and early-dispatch queues.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        const bool shape_ok = ready_queue_init_data_from_layout(
+            &ready_queues[shape], arena, layout.off_ready_queue_slots[shape], layout.ready_queue_capacity
+        );
+        if (!shape_ok) return false;
+    }
+    const bool dummy_ok = ready_queue_init_data_from_layout(
+        &dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
+    );
+    if (!dummy_ok) return false;
+    const bool early_ok = ready_queue_init_data_from_layout(
+        &early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots, PTO2_EARLY_DISPATCH_QUEUE_SIZE
+    );
+    if (!early_ok) return false;
+
+    // Zero the dep-pool entries in the arena, then pass them to dep_pool.init with the orch_error_code_addr pointer.
+    auto *orch_error_slot = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    auto *pool_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries));
+    const size_t pool_bytes = static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    memset(pool_entries, 0, pool_bytes);
+    ring_sched_state.dep_pool.init(pool_entries, layout.dep_pool_capacity, orch_error_slot);
+
+    if (!wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    // Wiring batch/backoff bookkeeping starts from zero.
+    wiring.batch_count = 0;
+    wiring.batch_index = 0;
+    wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    // Re-point every arena-backed pointer (ready queues, dep pool base, wiring queue) from the layout offsets.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_wire_arena_pointers(&ready_queues[shape], arena, layout.off_ready_queue_slots[shape]);
+    }
+    ready_queue_wire_arena_pointers(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    ready_queue_wire_arena_pointers(&early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots);
+    auto *dep_pool_region = arena.region_ptr(layout.off_dep_pool_entries);
+    ring_sched_state.dep_pool.base = static_cast<PTO2DepListEntry *>(dep_pool_region);
+    wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    // Drops arena-backed pointers in the original order: ring state, dep pool base, wiring queue, ready queues.
+    ring_sched_state.destroy();
+    ring_sched_state.dep_pool.base = nullptr;
+    wiring.queue.destroy();
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_destroy(&ready_queues[shape]);
+    }
+    ready_queue_destroy(&dummy_ready_queue);
+    ready_queue_destroy(&early_dispatch_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout
+PTO2OrchestratorState::reserve_layout(DeviceArena &arena, int32_t task_window_size, int32_t dep_pool_capacity) {
+    // Each arena.reserve() result is recorded as an offset in the returned layout.
+    PTO2OrchestratorLayout layout{};
+    layout.dep_pool_capacity = dep_pool_capacity;
+    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    const size_t spill_entries = static_cast<size_t>(dep_pool_capacity);
+    const size_t spill_bytes = PTO2_ALIGN_UP(spill_entries * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    layout.off_fanin_pool = arena.reserve(spill_bytes, PTO2_ALIGN_SIZE);
+
+    always_assert(task_window_size > 0 && (task_window_size & (task_window_size - 1)) == 0);
+    const size_t epoch_slots = static_cast<size_t>(task_window_size);
+    const size_t epoch_bytes = PTO2_ALIGN_UP(epoch_slots * sizeof(uint32_t), PTO2_ALIGN_SIZE);
+    layout.off_fanin_seen_epoch = arena.reserve(epoch_bytes, PTO2_ALIGN_SIZE);
+
+    const size_t scope_task_bytes = static_cast<size_t>(layout.scope_tasks_cap) * sizeof(uintptr_t);
+    layout.off_scope_tasks = arena.reserve(scope_task_bytes, alignof(PTO2TaskSlotState *));
+    const size_t scope_begin_bytes = static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t);
+    layout.off_scope_begins = arena.reserve(scope_begin_bytes, alignof(int32_t));
+    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_size);
+    return layout;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    auto *orch = this;
+    *orch = PTO2OrchestratorState{};  // reset to a value-initialized state before filling fields in
+
+    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+    orch->gm_heap_base = gm_heap;
+    orch->gm_heap_size = heap_size;
+    orch->fatal = false;
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);  // handed to the allocator and fanin pool
+    auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_size);
+    auto *slot_states_dev = pto2_sm_layout::ring_slot_states_addr(sm_dev_base, task_window_size);
+    auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base);
+    auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base);
+
+    orch->ring.task_allocator.init(
+        task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, gm_heap, heap_size,
+        orch_err, slot_states_dev
+    );
+
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool));
+    memset(fanin_entries, 0, fanin_pool_bytes);  // clears the aligned size, matching the reservation
+    orch->ring.fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err);
+
+    const size_t seen_epoch_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(layout.tensor_map.task_window_size) * sizeof(uint32_t), PTO2_ALIGN_SIZE);
+    auto *seen_epoch = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch));
+    memset(seen_epoch, 0, seen_epoch_bytes);  // sized from layout.tensor_map.task_window_size
+    orch->fanin_seen_epoch = seen_epoch;  // wire_arena_pointers() assigns this same region again
+
+    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    orch->scope_tasks_size = 0;
+    orch->scope_tasks_capacity = layout.scope_tasks_cap;
+    orch->scope_stack_top = -1;
+    orch->scope_stack_capacity = layout.scope_stack_capacity;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;  // scope_tasks, scope_begins and scheduler are assigned in wire_arena_pointers()
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    // Resolve each layout offset to an arena pointer, then record the scheduler.
+    ring.fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool));
+    fanin_seen_epoch = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch));
+    tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    scheduler = scheduler_arg;
+}
+
+void PTO2OrchestratorState::destroy() {
+    // Nothing is freed here: the members are only cleared.
+    tensor_map.destroy();
+    ring.fanin_pool.base = nullptr;
+    fanin_seen_epoch = nullptr;
+    scope_tasks = nullptr;
+    scope_begins = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    // Broadcast the scalar settings to every ring; this overload passes a heap size of 0 for each ring.
+    uint64_t window_per_ring[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_per_ring[PTO2_MAX_RING_DEPTH] = {};
+    int32_t dep_cap_per_ring[PTO2_MAX_RING_DEPTH];
+    for (int idx = 0; idx < PTO2_MAX_RING_DEPTH; ++idx) {
+        window_per_ring[idx] = task_window_size;
+        dep_cap_per_ring[idx] = dep_pool_capacity;
+    }
+    return runtime_reserve_layout(arena, window_per_ring, heap_per_ring, dep_cap_per_ring);
+}
+
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2RuntimeArenaLayout layout{};
+    // Keep a copy of the per-ring inputs inside the layout.
+    for (int idx = 0; idx < PTO2_MAX_RING_DEPTH; ++idx) {
+        layout.task_window_sizes[idx] = task_window_sizes[idx];
+        layout.heap_sizes[idx] = heap_sizes[idx];
+        layout.dep_pool_capacities[idx] = dep_pool_capacities[idx];
+    }
+
+    // Only index 0 of the per-ring arrays is used for the reservations below.
+    const int32_t window0 = static_cast<int32_t>(task_window_sizes[0]);
+    const int32_t dep_cap0 = dep_pool_capacities[0];
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, window0, dep_cap0);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_cap0);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+    layout.arena_size = arena.total_size();
+    return layout;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    uint64_t per_ring_heap[PTO2_MAX_RING_DEPTH];  // same heap size for every ring
+    for (uint64_t &entry : per_ring_heap) {
+        entry = heap_size;
+    }
+    return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, 0, gm_heap_dev_base, per_ring_heap);
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));  // the runtime object lives inside the arena; start from all-zero bytes
+
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));  // handle storage is only zeroed here, not initialized
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return nullptr;  // sum_ring_heap_sizes() reported failure
+    }
+    rt->gm_heap_size = total_heap_size;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+
+    if (!rt->orchestrator.init_data_from_layout(
+            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes[0], layout.task_window_sizes[0]
+        )) {
+        return nullptr;
+    }
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
+        return nullptr;
+    }
+
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;  // rt->sm_handle and rt->aicore_mailbox are assigned by runtime_wire_arena_pointers()
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);  // also records &rt->scheduler
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // The arena buffer is pooled across runs by DeviceRunner, so it is never freed here.
+    if (rt == nullptr) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->sm_handle = nullptr;
+    rt->aicore_mailbox = nullptr;
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/host_build_graph/runtime/shared/pto_shared_memory.cpp
new file mode 100644
index 000000000..223736478
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/shared/pto_shared_memory.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Implementation
+ *
+ * Implements shared memory allocation, initialization, and management
+ * for Orchestrator-Scheduler communication.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_shared_memory.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include "common/unified_log.h"
+
+// =============================================================================
+// Size Calculation
+// =============================================================================
+
+uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
+    uint64_t per_ring_window[PTO2_MAX_RING_DEPTH];  // same window size for every ring
+    for (uint64_t &window : per_ring_window) {
+        window = task_window_size;
+    }
+    return calculate_size_per_ring(per_ring_window);
+}
+
+uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    // Total SM size = offset just past the ring's slot_states (`.end`), from the single source of
+    // truth for the layout (pto2_sm_layout::ring_segment_offsets). Only task_window_sizes[0] is read.
+    return pto2_sm_layout::ring_segment_offsets(task_window_sizes[0]).end;
+}
+
+// =============================================================================
+// Creation and Destruction
+// =============================================================================
+
+void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    // Segment offsets come from pto2_sm_layout::ring_segment_offsets, the single
+    // source of truth for the layout, so these pointers and the device-address
+    // helpers cannot drift apart. Only task_window_sizes[0] is consulted.
+    const auto segments = pto2_sm_layout::ring_segment_offsets(task_window_sizes[0]);
+
+    // The header is placed at the very start of the shared-memory block.
+    char *sm_bytes = (char *)sm_base;
+    header = (PTO2SharedMemoryHeader *)sm_bytes;
+    header->ring.task_descriptors = (PTO2TaskDescriptor *)(sm_bytes + segments.descriptors);
+    header->ring.task_payloads = (PTO2TaskPayload *)(sm_bytes + segments.payloads);
+    header->ring.slot_states = (PTO2TaskSlotState *)(sm_bytes + segments.slot_states);
+}
+
+void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
+    uint64_t per_ring_window[PTO2_MAX_RING_DEPTH];  // same window size for every ring
+    for (uint64_t &window : per_ring_window) {
+        window = task_window_size;
+    }
+    setup_pointers_per_ring(per_ring_window);
+}
+
+bool PTO2SharedMemoryHandle::init(
+    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
+) {
+    uint64_t per_ring_window[PTO2_MAX_RING_DEPTH];  // scalar settings, one copy per ring
+    uint64_t per_ring_heap[PTO2_MAX_RING_DEPTH];
+    for (int idx = 0; idx < PTO2_MAX_RING_DEPTH; ++idx) {
+        per_ring_window[idx] = task_window_size;
+        per_ring_heap[idx] = heap_size;
+    }
+    return init_per_ring(sm_base_arg, sm_size_arg, per_ring_window, per_ring_heap);
+}
+
+bool PTO2SharedMemoryHandle::init_per_ring(
+    void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    // Reject a null or empty region, or one smaller than the computed layout size.
+    const bool has_region = sm_base_arg != nullptr && sm_size_arg != 0;
+    if (!has_region || sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false;
+    sm_base = sm_base_arg;
+    sm_size = sm_size_arg;
+    is_owner = false;  // destroy() will not free this memory
+    setup_pointers_per_ring(task_window_sizes);
+    init_header_per_ring(task_window_sizes, heap_sizes);
+    return true;
+}
+
+bool PTO2SharedMemoryHandle::attach_populated(
+    void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    const bool has_region = sm_base_arg != nullptr && sm_size_arg != 0;
+    if (!has_region || sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false;
+
+    // Deliberately NO init_header_per_ring: the SM already holds the host
+    // orchestrator's task graph (descriptors, slot states, ring counters).
+    sm_base = sm_base_arg;
+    sm_size = sm_size_arg;
+    is_owner = false;
+    setup_pointers_per_ring(task_window_sizes);
+    return true;
+}
+
+PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) {
+    // Reserve the handle and its buffer in the arena, commit, then zero and initialize both.
+    const uint64_t sm_bytes = calculate_size(PTO2_TASK_WINDOW_SIZE);
+    const size_t handle_off = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    const size_t buffer_off = arena.reserve(static_cast<size_t>(sm_bytes), PTO2_ALIGN_SIZE);
+    if (arena.commit() == nullptr) return nullptr;
+
+    auto *result = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(handle_off));
+    memset(result, 0, sizeof(*result));
+    void *sm_region = arena.region_ptr(buffer_off);
+    memset(sm_region, 0, static_cast<size_t>(sm_bytes));
+    return result->init(sm_region, sm_bytes, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE) ? result : nullptr;
+}
+
+void PTO2SharedMemoryHandle::destroy() {
+    // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
+    // for them this call does nothing, so existing callers stay safe.
+    if (!is_owner || !sm_base) return;
+    // Owning handle: release the buffer first, then the handle object itself.
+    free(sm_base);
+    free(this);
+}
+
+// =============================================================================
+// Initialization
+// =============================================================================
+//
+// Pool data is not initialized here; it is initialized when first used.
+void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) {
+    uint64_t per_ring_window[PTO2_MAX_RING_DEPTH];  // scalar settings, one copy per ring
+    uint64_t per_ring_heap[PTO2_MAX_RING_DEPTH];
+    for (int idx = 0; idx < PTO2_MAX_RING_DEPTH; ++idx) {
+        per_ring_window[idx] = task_window_size;
+        per_ring_heap[idx] = heap_size;
+    }
+    init_header_per_ring(per_ring_window, per_ring_heap);
+}
+
+void PTO2SharedMemoryHandle::init_header_per_ring(
+    const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    // Flow control (starts at 0)
+    header->ring.fc.init();
+
+    header->orchestrator_done.store(0, std::memory_order_relaxed);
+
+    // Ring layout info. The descriptor offset is taken from
+    // pto2_sm_layout::ring_segment_offsets, the same source
+    // setup_pointers_per_ring() uses, so the value recorded in the header
+    // cannot drift from the pointers wired into the ring.
+    const uint64_t window_size = task_window_sizes[0];
+    header->ring.task_window_size = window_size;
+    header->ring.task_window_mask = static_cast<int32_t>(window_size - 1);
+    header->ring.heap_size = heap_sizes[0];
+    header->ring.task_descriptors_offset = pto2_sm_layout::ring_segment_offsets(window_size).descriptors;
+
+    header->total_size = sm_size;
+    header->graph_output_ptr.store(0, std::memory_order_relaxed);
+    header->graph_output_size.store(0, std::memory_order_relaxed);
+
+    // Error reporting
+    header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    header->sched_error_bitmap.store(0, std::memory_order_relaxed);
+    header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    auto &ring = header->ring;
+    for (uint64_t i = 0; i < window_size; i++) {
+        ring.slot_states[i].reset_for_reuse();
+        ring.slot_states[i].fanin_count = 0;
+        ring.slot_states[i].active_mask = ActiveMask{};
+    }
+}
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2SharedMemoryHandle::print_layout() {
+    if (header == nullptr) return;
+
+    // Every loop iteration below reports the same `header->ring` member.
+    const auto &ring = header->ring;
+
+    LOG_INFO_V0("=== PTO2 Shared Memory Layout ===");
+    LOG_INFO_V0("Base address:       %p", sm_base);
+    LOG_INFO_V0("Total size:         %" PRIu64 " bytes", header->total_size);
+    LOG_INFO_V0("Ring depth:         %d", PTO2_MAX_RING_DEPTH);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        LOG_INFO_V0("Ring %d:", ring_idx);
+        LOG_INFO_V0("  task_window_size: %" PRIu64, ring.task_window_size);
+        LOG_INFO_V0("  heap_size:        %" PRIu64 " bytes", ring.heap_size);
+        const auto desc_off = ring.task_descriptors_offset;
+        LOG_INFO_V0("  descriptors_off:  %" PRIu64 " (0x%" PRIx64 ")", desc_off, desc_off);
+        LOG_INFO_V0("  current_task_idx: %d", ring.fc.current_task_index.load(std::memory_order_acquire));
+        LOG_INFO_V0("  last_task_alive:  %d", ring.fc.last_task_alive.load(std::memory_order_acquire));
+    }
+
+    LOG_INFO_V0("orchestrator_done:  %d", header->orchestrator_done.load(std::memory_order_acquire));
+    LOG_INFO_V0("Error state:");
+    LOG_INFO_V0("  orch_error_code:    %d", header->orch_error_code.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_bitmap: 0x%x", header->sched_error_bitmap.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_code:   %d", header->sched_error_code.load(std::memory_order_relaxed));
+    LOG_INFO_V0("  sched_error_thread: %d", header->sched_error_thread.load(std::memory_order_relaxed));
+    LOG_INFO_V0("================================");
+}
+
+bool PTO2SharedMemoryHandle::validate() {
+    // A handle without a base address or a header is never valid.
+    if (sm_base == nullptr || header == nullptr) return false;
+
+    // Every ring id is checked against the single flow-control block.
+    const auto &flow_control = header->ring.fc;
+    for (int ring_id = 0; ring_id < PTO2_MAX_RING_DEPTH; ++ring_id) {
+        if (!flow_control.validate(this, ring_id)) return false;
+    }
+
+    return true;
+}
+
+bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const {
+    // Argument checks: a handle with a header, and an in-range ring id.
+    if (handle == nullptr) return false;
+    if (handle->header == nullptr) return false;
+    if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false;
+
+    const auto &ring = handle->header->ring;
+
+    // The descriptor offset must lie inside the shared-memory block.
+    if (ring.task_descriptors_offset >= handle->header->total_size) return false;
+
+    // The descriptor array pointer must be PTO2_ALIGN_SIZE-aligned.
+    if ((uintptr_t)ring.task_descriptors % PTO2_ALIGN_SIZE != 0) return false;
+
+    // Neither flow-control index may be negative.
+    const int32_t current = current_task_index.load(std::memory_order_acquire);
+    const int32_t last_alive = last_task_alive.load(std::memory_order_acquire);
+    if (current < 0 || last_alive < 0) return false;
+
+    return true;
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/host_build_graph/runtime/shared/pto_tensormap.cpp
new file mode 100644
index 000000000..69570a01b
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/shared/pto_tensormap.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - TensorMap Implementation
+ *
+ * Implements TensorMap with ring buffer pool, lazy invalidation,
+ * and chain truncation optimization.
+ *
+ * Key features:
+ * 1. O(1) insert at bucket head
+ * 2. O(valid_entries) lookup with chain truncation
+ * 3. Automatic stale entry cleanup during lookup
+ * 4. Periodic explicit cleanup for long chains
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_tensormap.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "common/unified_log.h"
+
+// =============================================================================
+// TensorMap Lookup Chain Length Statistics (compile-time toggle)
+// =============================================================================
+#if PTO2_TENSORMAP_PROFILING
+uint64_t g_lookup_chain_total = 0;
+uint64_t g_lookup_count = 0;
+int32_t g_lookup_chain_max = 0;
+uint64_t g_lookup_overlap_checks = 0;
+uint64_t g_lookup_overlap_hits = 0;
+uint64_t g_insert_count = 0;
+#endif
+
+// =============================================================================
+// Initialization and Destruction
+// =============================================================================
+
+PTO2TensorMapLayout PTO2TensorMap::reserve_layout(
+    DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size, int32_t new_task_window_size
+) {
+    // num_buckets must be a positive power of two: the bit test alone also accepts 0 and overflows on INT32_MIN.
+    always_assert(new_num_buckets > 0 && (new_num_buckets & (new_num_buckets - 1)) == 0);
+
+    PTO2TensorMapLayout layout{};
+    layout.num_buckets = new_num_buckets;
+    layout.pool_size = new_pool_size;
+    layout.task_window_size = new_task_window_size;
+
+    layout.off_buckets = arena.reserve(
+        static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
+    );
+    layout.off_entry_pool =
+        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
+    layout.off_free_entry_list =
+        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+    layout.off_task_entry_heads = arena.reserve(
+        static_cast<size_t>(new_task_window_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
+    );
+    return layout;
+}
+
+PTO2TensorMapLayout PTO2TensorMap::reserve_layout_default(DeviceArena &arena, int32_t new_task_window_size) {
+    return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_size);
+}
+
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    // Scalar bookkeeping first; none of it depends on the arena contents.
+    num_buckets = layout.num_buckets;
+    pool_size = layout.pool_size;
+    task_window_size = layout.task_window_size;
+    next_entry_idx = 0;
+    free_num = 0;
+    last_task_alive_cached = 0;
+    last_cleanup = 0;
+
+    // Resolve the arena regions for data writes only; the struct's pointer
+    // fields are deliberately left for wire_arena_pointers to assign.
+    auto *bucket_heads = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    auto *task_heads = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads));
+
+    // Clear buckets[] (empty == nullptr) and the per-task entry heads.
+    for (int32_t b = 0; b < num_buckets; ++b) {
+        bucket_heads[b] = nullptr;
+    }
+
+    for (int32_t t = 0; t < layout.task_window_size; ++t) {
+        task_heads[t] = nullptr;
+    }
+
+    // entry_pool: zero-fill (equivalent to the previous calloc), then set the
+    // persistent "bucket_index == -1 means not linked" invariant explicitly.
+    memset(pool, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    for (int32_t e = 0; e < pool_size; ++e) {
+        PTO2TensorMapEntry &entry = pool[e];
+        entry.bucket_index = -1;
+        entry.next_in_bucket = nullptr;
+        entry.prev_in_bucket = nullptr;
+        entry.next_in_task = nullptr;
+        entry.prev_in_task = nullptr;
+        entry.producer_task_id = PTO2TaskId{};
+    }
+
+    // free_entry_list: zeroed (was calloc'd before); its contents become
+    // meaningful only after entries are freed back.
+    memset(free_list, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+
+    return true;
+}
+
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    task_entry_heads = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads));
+}
+
+void PTO2TensorMap::destroy() {
+    // The backing memory belongs to the arena, so nothing is freed here. The
+    // pointers are merely dropped, so a stray post-destroy access trips a
+    // nullptr dereference instead of reading a recycled allocation.
+    task_entry_heads = nullptr;
+    free_entry_list = nullptr;
+    entry_pool = nullptr;
+    buckets = nullptr;
+}
+
+// =============================================================================
+// Debug Utilities
+// =============================================================================
+
+void PTO2TensorMap::print_stats() {
+    int32_t valid = 0;
+    int32_t stale = 0;
+
+    // Classify every pool entry that is linked into a bucket
+    // (bucket_index != -1) as either valid or stale.
+    for (int32_t i = 0; i < pool_size; ++i) {
+        if (entry_pool[i].bucket_index == -1) continue;
+        if (entry_valid(entry_pool[i])) {
+            ++valid;
+        } else {
+            ++stale;
+        }
+    }
+
+    int32_t empty_buckets = 0;
+    int32_t non_empty_buckets = 0;
+    int32_t max_chain = 0;
+    int64_t total_chain = 0;
+
+    // Walk each bucket chain to collect length statistics.
+    for (int32_t b = 0; b < num_buckets; ++b) {
+        int32_t chain_len = 0;
+        for (auto cur_entry = buckets[b]; cur_entry != nullptr; cur_entry = cur_entry->next_in_bucket) {
+            ++chain_len;
+        }
+
+        if (chain_len == 0) {
+            ++empty_buckets;
+            continue;
+        }
+        ++non_empty_buckets;
+        total_chain += chain_len;
+        if (chain_len > max_chain) {
+            max_chain = chain_len;
+        }
+    }
+
+    const float avg_chain = non_empty_buckets > 0 ? (float)total_chain / non_empty_buckets : 0;
+
+    LOG_INFO_V0("=== TensorMap Statistics ===");
+    LOG_INFO_V0("Pool size:           %d", pool_size);
+    LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx);
+    LOG_INFO_V0("Pool free_num:       %d", free_num);
+    LOG_INFO_V0("Num buckets:         %d", num_buckets);
+    LOG_INFO_V0("Valid entries:       %d", valid);
+    LOG_INFO_V0("Stale entries:       %d", stale);
+    LOG_INFO_V0("Empty buckets:       %d", empty_buckets);
+    LOG_INFO_V0("Max chain len:       %d", max_chain);
+    LOG_INFO_V0("Avg chain len:       %.2f", avg_chain);
+    LOG_INFO_V0("Last task alive:     %d", last_task_alive_cached);
+    LOG_INFO_V0("============================");
+}
+
+int32_t PTO2TensorMap::valid_count() {
+    // Count pool entries that are linked into a bucket and pass entry_valid().
+    int32_t live = 0;
+    for (int32_t i = 0; i < pool_size; ++i) {
+        if (entry_pool[i].bucket_index == -1) continue;
+        if (!entry_valid(entry_pool[i])) continue;
+        ++live;
+    }
+
+    return live;
+}
+
+void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) {
+    auto local_id = task_id.local();
+    sync_validity(sm_last_task_alive);
+
+    // Intent: run cleanup only after last_task_alive advances; else cleanup_retired would empty-loop and spin.
+    // NOTE(review): `overlap` alone also triggers cleanup even with no advance -- confirm this is intended.
+    auto overlap = get_task_local_id_slot(local_id) == get_task_local_id_slot(last_cleanup);
+    if (sm_last_task_alive - last_cleanup >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) {
+        cleanup_retired(last_cleanup, sm_last_task_alive);
+        last_cleanup = sm_last_task_alive;
+    }
+}
+
+// =============================================================================
+// TensorMap Lookup Profiling
+// =============================================================================
+#if PTO2_TENSORMAP_PROFILING
+PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
+    // Copy each global counter into the snapshot, resetting it right after the read.
+    PTO2TensorMapProfilingData snapshot;
+    snapshot.lookup_chain_total = g_lookup_chain_total;
+    g_lookup_chain_total = 0;
+    snapshot.lookup_count = g_lookup_count;
+    g_lookup_count = 0;
+    snapshot.lookup_chain_max = g_lookup_chain_max;
+    g_lookup_chain_max = 0;
+    snapshot.overlap_checks = g_lookup_overlap_checks;
+    g_lookup_overlap_checks = 0;
+    snapshot.overlap_hits = g_lookup_overlap_hits;
+    g_lookup_overlap_hits = 0;
+    snapshot.insert_count = g_insert_count;
+    g_insert_count = 0;
+
+    return snapshot;
+}
+#endif
diff --git a/src/a2a3/runtime/host_build_graph/runtime/shared/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/shared/runtime.cpp
new file mode 100644
index 000000000..79a800181
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/shared/runtime.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Class - Implementation
+ *
+ * Device execution and handshake control.
+ * Task graph construction is handled by PTO2Runtime.
+ */
+
+#include "runtime.h"
+
+#include "common/unified_log.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// =============================================================================
+// Constructor
+// =============================================================================
+
+Runtime::Runtime() {
+    // NOTE: host_api is initialized in InitRuntime() (host-only code)
+    // because the CApi functions don't exist when compiled for device.
+
+    // Zero the handshake buffers and set scheduling defaults (one AICPU thread, default ready-queue shards)
+    memset(workers, 0, sizeof(workers));
+    worker_count = 0;
+    aicpu_thread_num = 1;
+    ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+    memset(aicpu_allowed_cpus, 0, sizeof(aicpu_allowed_cpus));
+    aicpu_allowed_cpu_count = 0;
+    aicpu_launch_count = 0;
+    orch_to_sched = false;
+    host_total_tasks = 0;
+
+    // Shared-memory / orchestration argument plumbing starts out unset (null pointers, empty args, offset 0)
+    gm_sm_ptr_ = nullptr;
+    gm_heap_ptr_ = nullptr;
+    slot_states_ptr_ = nullptr;
+    orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
+
+    active_callable_id_ = -1;  // NOTE(review): presumably "no callable active yet" - confirm against callers
+    dev_orch_so_addr_ = 0;
+    dev_orch_so_size_ = 0;
+    device_orch_func_name_[0] = '\0';    // empty string; the rest of the buffer is not cleared here
+    device_orch_config_name_[0] = '\0';  // empty string; the rest of the buffer is not cleared here
+
+    // Initialize kernel binary tracking (only the count; the id array itself is not cleared here)
+    registered_kernel_count_ = 0;
+
+    // Initialize function address mapping: 0 is what set_function_bin_addr() treats as "not yet registered"
+    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
+        func_id_to_addr_[i] = 0;
+    }
+}
+
+// =============================================================================
+// Shared-memory / orchestration argument plumbing
+// =============================================================================
+
+void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; }
+void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; }
+const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; }
+void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; }
+void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
+void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
+void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
+
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    prebuilt_arena_base_ = arena_base;
+    prebuilt_runtime_offset_ = runtime_off;
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
+
+// Orchestration metadata written by the platform host (DeviceRunner) at
+// callable registration. host_build_graph runs the orchestrator on the host so
+// the device side never reads these back, but the platform registration path is
+// shared with tensormap_and_ringbuffer and still writes them.
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
+    dev_orch_so_addr_ = dev_addr;
+    dev_orch_so_size_ = size;
+}
+
+void Runtime::set_active_callable_id(int32_t callable_id) { active_callable_id_ = callable_id; }
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+void Runtime::set_device_orch_func_name(const char *name) {
+    if (name != nullptr) {
+        std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+        device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';  // strncpy may leave it unterminated
+    } else {
+        device_orch_func_name_[0] = '\0';  // a null name clears the stored symbol
+    }
+}
+
+void Runtime::set_device_orch_config_name(const char *name) {
+    if (name != nullptr) {
+        std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+        device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';  // strncpy may leave it unterminated
+    } else {
+        device_orch_config_name_[0] = '\0';  // a null name clears the stored symbol
+    }
+}
+
+uint64_t Runtime::get_function_bin_addr(int func_id) const {
+    const bool in_range = func_id >= 0 && func_id < RUNTIME_MAX_FUNC_ID;
+    return in_range ? func_id_to_addr_[func_id] : 0;  // out-of-range ids resolve to 0
+}
+
+void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    if (addr != 0 && func_id_to_addr_[func_id] == 0) {  // slot goes from empty to populated: track the id
+        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
+            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
+        } else {
+            LOG_ERROR(
+                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
+                func_id
+            );
+        }
+    }  // NOTE(review): an id cleared to 0 and then set again is appended a second time - confirm callers never do
+    func_id_to_addr_[func_id] = addr;  // the address is stored even when the id could not be tracked
+}
+
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id >= 0 && func_id < RUNTIME_MAX_FUNC_ID) {
+        func_id_to_addr_[func_id] = addr;  // address table only; the registered-kernel list is left untouched
+    } else {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+    }
+}
+
+int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
+
+int Runtime::get_registered_kernel_func_id(int index) const {
+    const bool tracked = index >= 0 && index < registered_kernel_count_;
+    return tracked ? registered_kernel_func_ids_[index] : -1;  // -1 for an index outside the tracked range
+}
+
+void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
+
+// host_build_graph's device image is the whole Runtime object (host-orch builds
+// the graph on the host, but the entire Runtime is still rtMemcpy'd to device).
+size_t runtime_device_copy_size(const Runtime &) { return sizeof(Runtime); }
diff --git a/src/a2a3/runtime/host_build_graph/runtime/tensor_create_info.h b/src/a2a3/runtime/host_build_graph/runtime/tensor_create_info.h
new file mode 100644
index 000000000..912839a34
--- /dev/null
+++ b/src/a2a3/runtime/host_build_graph/runtime/tensor_create_info.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * TensorCreateInfo — submit-time create-info for runtime-allocated outputs.
+ *
+ * Runtime-only: this header (and the materialization helpers below) are NOT
+ * part of the wire/host-facing Tensor in src/common/task_interface/tensor.h.
+ * It carries the metadata required to materialize a fresh contiguous output:
+ * dtype, ndims, shapes, manual_dep, and an optional initial value fill. Its
+ * 64B layout mirrors Tensor cache line 1 so init_tensor_from_create_info() can
+ * copy the whole line with a single memcpy.
+ */
+
+#pragma once
+
+#include <cstring>
+#include <memory.h>
+#include <stdint.h>
+
+#include "data_type.h"
+#include "tensor.h"
+
+class alignas(64) TensorCreateInfo {
+public:
+    TensorCreateInfo(
+        const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false
+    ) :
+        initial_value(0),
+        has_initial_value(false),
+        __pad1__{},  // zeroed so the 64B image copied by init_tensor_from_create_info() carries no stale bytes
+        __pad2__(0),
+        start_offset(0),  // mirrors Tensor::start_offset; pre-zeroed for create-info outputs
+        version(0),
+        ndims(ndims_in),
+        dtype(dtype_in),
+        manual_dep(manual_dep_in),
+        is_contiguous(true),  // mirrors Tensor::is_contiguous; pre-set for create-info outputs
+        __pad_flags__(0) {
+        // shapes[] holds MAX_TENSOR_DIMS entries and ndims_in comes from user-submitted output shapes, so
+        // reject an oversized rank first; the unused tail is zeroed so the 64B image is deterministic.
+        always_assert(ndims_in > 0 && ndims_in <= MAX_TENSOR_DIMS);
+        for (uint32_t i = 0; i < MAX_TENSOR_DIMS; i++) {
+            shapes[i] = (i < ndims_in) ? shapes_in[i] : 0;
+        }
+    }
+
+    void copy(const TensorCreateInfo &other) { memcpy(this, &other, sizeof(other)); }
+
+    template <typename T = uint64_t>
+    void set_initial_value(T value) {
+        has_initial_value = true;
+        initial_value = to_u64(value);
+    }
+
+    uint64_t buffer_size_bytes() const {
+        uint64_t total = 1;
+        for (uint32_t i = 0; i < ndims; i++) {
+            total *= shapes[i];
+        }
+        return total * get_element_size(dtype);
+    }
+
+public:
+    // --- Bytes [0, 32): TensorCreateInfo-only fields ---
+    // These occupy the same positions as Tensor::buffer, Tensor::owner_task_id,
+    // and Tensor::start_offset. The runtime overwrites owner metadata after the
+    // memcpy and recomputes start_offset / stride during payload materialization.
+    uint64_t initial_value;
+    bool has_initial_value;
+    uint8_t __pad1__[7];
+    uint64_t __pad2__;      // → Tensor::owner_task_id (overwritten post-memcpy)
+    uint64_t start_offset;  // mirrors Tensor::start_offset; always 0 for create-info outputs
+
+    // --- Bytes [32, 64): Matches Tensor cache line 1 layout ---
+    int32_t version;  // Always 0 for create-info outputs
+    uint32_t ndims;
+    DataType dtype;
+    bool manual_dep;
+    bool is_contiguous;                // Always true for create-info outputs
+    uint8_t __pad_flags__;             // → Tensor::child_memory (always 0 for create-info outputs)
+    uint32_t shapes[MAX_TENSOR_DIMS];  // → Tensor::shapes
+
+    TensorCreateInfo() = default;
+};
+
+// TensorCreateInfo layout must match Tensor cacheline 1 for memcpy optimization
+static_assert(sizeof(TensorCreateInfo) == 64, "TensorCreateInfo must match Tensor cacheline 1 size (64 bytes)");
+static_assert(offsetof(TensorCreateInfo, start_offset) == offsetof(Tensor, start_offset));
+static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version));
+static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims));
+static_assert(offsetof(TensorCreateInfo, dtype) == offsetof(Tensor, dtype));
+static_assert(offsetof(TensorCreateInfo, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(TensorCreateInfo, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(TensorCreateInfo, __pad_flags__) == offsetof(Tensor, child_memory));
+static_assert(offsetof(TensorCreateInfo, shapes) == offsetof(Tensor, shapes));
+
+// ============================================================================
+// Materialization helpers — operate on a Tensor& through its public members.
+// Factored out of Tensor (which now lives in the wire/host-facing common
+// header) so the create-info dependency stays runtime-only.
+// ============================================================================
+
+/// Fill the entire backing buffer of `t` with `initial_value` (seed one block, then doubling memcpy).
+inline void fill_tensor_initial_value(Tensor &t, uint64_t initial_value) {
+    always_assert(reinterpret_cast<char *>(t.buffer.addr) != nullptr);
+    uint64_t elem_size = get_element_size(t.dtype);
+    char *dst = reinterpret_cast<char *>(t.buffer.addr);
+    constexpr uint64_t blk_size = 64;
+    uint64_t blk = (t.buffer.size < blk_size) ? t.buffer.size : blk_size;
+    always_assert(blk == 0 || (elem_size > 0 && elem_size <= sizeof(initial_value)));  // else spin / over-read
+    for (uint64_t b = 0; b < blk; b += elem_size) {
+        memcpy(dst + b, &initial_value, (blk - b < elem_size) ? (blk - b) : elem_size);  // clamp a partial tail
+    }
+    for (uint64_t filled = blk; filled < t.buffer.size;) {
+        uint64_t copy_size = ((t.buffer.size - filled) < filled) ? (t.buffer.size - filled) : filled;
+        memcpy(dst + filled, dst, copy_size);  // copy_size <= filled, so source and destination never overlap
+        filled += copy_size;
+    }
+}
+
+/// Materialize a TensorCreateInfo into `t` (fresh contiguous output).
+/// One memcpy of the whole create-info covers cache line 1; `ci` pre-initialises start_offset (=0)
+/// and is_contiguous (=true) in its line-1 slots so they need no reset here.
+/// Cache line 2 (stride/extent) is computed from `ci.shapes` in a single reverse pass.
+inline void init_tensor_from_create_info(Tensor &t, const TensorCreateInfo &ci, void *addr, uint64_t buffer_size) {
+    always_assert(ci.ndims > 0 && ci.ndims <= MAX_TENSOR_DIMS);
+    memcpy(&t, &ci, sizeof(ci));  // sizeof(TensorCreateInfo) is pinned to 64 by the static_assert above
+    t.buffer = {reinterpret_cast<uint64_t>(addr), buffer_size};  // overwrites the create-info-only bytes just copied
+    t.owner_task_id = PTO2TaskId::invalid();  // caller (orchestrator) overwrites with actual task_id
+    uint32_t s = 1;  // running product of the trailing dims, used as the stride of the current dim
+    for (int32_t i = static_cast<int32_t>(t.ndims) - 1; i >= 0; --i) {
+        t.strides[i] = s;
+        s *= t.shapes[i];  // NOTE(review): 32-bit product wraps at 2^32 elements - confirm this cannot occur
+    }
+    t.extent_elem_cache = s;
+    if (ci.has_initial_value) {
+        fill_tensor_initial_value(t, ci.initial_value);
+    }
+}
diff --git a/src/a2a3/runtime/host_build_graph/runtime/tensor_info.h b/src/a2a3/runtime/host_build_graph/runtime/tensor_info.h
deleted file mode 100644
index 504d313d0..000000000
--- a/src/a2a3/runtime/host_build_graph/runtime/tensor_info.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_
-#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_
-
-#include <cstdint>
-
-#include "common/platform_config.h"
-#include "data_type.h"
-#include "tensor.h"
-
-struct TensorInfo {
-    DataType dtype;
-    uint8_t ndims;
-    uint16_t reserved;
-    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
-    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
-    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
-};
-
-static_assert(sizeof(TensorInfo) == 64, "TensorInfo must stay compact");
-
-struct TensorAllocationInfo {
-    uint64_t base_addr;
-    uint64_t size_bytes;
-
-    bool contains(uint64_t addr) const { return addr >= base_addr && addr < base_addr + size_bytes; }
-};
-
-static_assert(sizeof(TensorAllocationInfo) == 16, "TensorAllocationInfo must stay compact");
-
-inline TensorInfo make_tensor_info(
-    DataType dtype, uint32_t ndims, const uint32_t *shapes, const uint32_t *raw_shapes = nullptr,
-    const uint32_t *offsets = nullptr
-) {
-    TensorInfo info = {};
-    info.dtype = dtype;
-    info.ndims = static_cast<uint8_t>(ndims);
-    for (uint32_t i = 0; i < ndims && i < PLATFORM_DUMP_MAX_DIMS; i++) {
-        info.shapes[i] = shapes[i];
-        info.raw_shapes[i] = (raw_shapes != nullptr) ? raw_shapes[i] : shapes[i];
-        info.offsets[i] = (offsets != nullptr) ? offsets[i] : 0;
-    }
-    return info;
-}
-
-inline TensorInfo make_tensor_info_from_tensor_arg(const Tensor &tensor) {
-    return make_tensor_info(tensor.dtype, tensor.ndims, tensor.shapes);
-}
-
-#endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_
diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h
index d33d6f74c..c24876a5c 100644
--- a/src/a5/platform/include/common/platform_config.h
+++ b/src/a5/platform/include/common/platform_config.h
@@ -279,6 +279,19 @@ constexpr int PLATFORM_DUMP_READYQUEUE_SIZE = PLATFORM_MAX_AICPU_THREADS * PLATF
  */
 constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30;
 
+/**
+ * Dump-args mask pool dimensions. The pool is keyed by (ring_id, slot) packed
+ * from a PTO2 task_id, so it must span the largest ring depth and task window
+ * any runtime built against this platform can use. The dump infra is shared by
+ * every runtime (device-orch tensormap_and_ringbuffer at ring depth 4 and the
+ * single-ring host-orch host_build_graph), so these are sized to the maximum
+ * rather than coupled to one runtime's pto_runtime2_types.h — a runtime that
+ * lowers its own PTO2_MAX_RING_DEPTH must not shrink the pool other runtimes
+ * rely on (see set_dump_args_task_mask's ring_id bound check).
+ */
+constexpr uint32_t PLATFORM_DUMP_MASK_POOL_MAX_RINGS = 4;
+constexpr uint32_t PLATFORM_DUMP_MASK_POOL_MAX_SLOTS = 16384;
+
 // =============================================================================
 // PMU Profiling Configuration
 // =============================================================================
diff --git a/src/a5/platform/include/common/tensor_dump.h b/src/a5/platform/include/common/tensor_dump.h
index dd68fbe87..ab066f1d3 100644
--- a/src/a5/platform/include/common/tensor_dump.h
+++ b/src/a5/platform/include/common/tensor_dump.h
@@ -48,7 +48,6 @@
 #include <cstdint>
 
 #include "common/platform_config.h"
-#include "host_build_graph/runtime/pto_runtime2_types.h"
 
 // =============================================================================
 // Constants
@@ -88,8 +87,8 @@ using TensorDumpArgMask = uint64_t;
 // Zero preserves legacy "dump all tasks" behavior unless selective mode is enabled.
 constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0;
 constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64;
-constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH;
-constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PLATFORM_DUMP_MASK_POOL_MAX_RINGS;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PLATFORM_DUMP_MASK_POOL_MAX_SLOTS;
 constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1;
 constexpr uint8_t TENSOR_DUMP_RECORD_FLAG_ARG_INDEX_AMBIGUOUS = 1u << 0;
 
diff --git a/src/common/platform/onboard/host/c_api_shared.cpp b/src/common/platform/onboard/host/c_api_shared.cpp
index 20cdf91b4..d7f2793b8 100644
--- a/src/common/platform/onboard/host/c_api_shared.cpp
+++ b/src/common/platform/onboard/host/c_api_shared.cpp
@@ -130,6 +130,25 @@ static void set_retained_temp_buffer(void *addr, size_t size) {
     } catch (...) {}
 }
 
+// SVM map/unmap bridge for host_build_graph's host-side orchestrator. Reaches
+// the per-thread DeviceRunner via current_runner() — NOT routed through the
+// Runtime.host_api struct, so the tensormap_and_ringbuffer / a5 HostApi ABI is
+// untouched. Non-static so the hbg runtime_maker (linked into the same
+// host_runtime .so) can call it directly via an extern declaration.
+void *svm_register_via_runner(void *dev_ptr, size_t size) {
+    try {
+        return current_runner()->svm_register(dev_ptr, size);
+    } catch (...) {
+        return nullptr;  // any exception surfaces to the caller as nullptr
+    }
+}
+
+void svm_unregister_via_runner(void *dev_ptr) {
+    try {
+        current_runner()->svm_unregister(dev_ptr);
+    } catch (...) {}  // best-effort: exceptions are dropped and the caller gets no failure signal
+}
+
 static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     try {
         return current_runner()->upload_chip_callable_buffer(static_cast<const ChipCallable *>(callable));
diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h
index 9f7616f3f..6bc59c91b 100644
--- a/src/common/platform/onboard/host/device_runner_base.h
+++ b/src/common/platform/onboard/host/device_runner_base.h
@@ -99,6 +99,24 @@ class DeviceRunnerBase : public L3L2OrchCommBackend {
     int l3_l2_orch_comm_init(void *control_block, size_t control_block_size);
     int l3_l2_orch_comm_shutdown();
 
+    /**
+     * Map a device buffer into the host address space and return a
+     * host-readable VA (or nullptr on failure); svm_unregister releases it.
+     * Used by host_build_graph, whose orchestrator runs on the host and needs
+     * to read control tensors (e.g. paged_attention's context_lens) whose
+     * buffer.addr is a device address. On a2a3 onboard this wraps
+     * halHostRegister(DEV_SVM_MAP_HOST); the returned VA may differ from
+     * dev_ptr, so callers must use it, not dev_ptr, for host access.
+     * Register/unregister must be paired (unregister before free_tensor).
+     * Base default: unsupported (returns nullptr / no-op); a2a3 overrides.
+     */
+    virtual void *svm_register(void *dev_ptr, std::size_t bytes) {
+        (void)dev_ptr;
+        (void)bytes;
+        return nullptr;
+    }
+    virtual void svm_unregister(void *dev_ptr) { (void)dev_ptr; }
+
     /**
      * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2
      * shared memory, trb prebuilt runtime arena) as three independent
diff --git a/src/common/platform/sim/host/c_api_shared.cpp b/src/common/platform/sim/host/c_api_shared.cpp
index ab26a22a4..307f5e8be 100644
--- a/src/common/platform/sim/host/c_api_shared.cpp
+++ b/src/common/platform/sim/host/c_api_shared.cpp
@@ -127,6 +127,23 @@ static void set_retained_temp_buffer(void *addr, size_t size) {
     } catch (...) {}
 }
 
+// SVM map/unmap bridge for host_build_graph (see onboard c_api_shared for
+// rationale). On sim the "device" pointer is already host-readable, so
+// svm_register is identity. Non-static so hbg runtime_maker can extern-call it.
+void *svm_register_via_runner(void *dev_ptr, size_t size) {
+    try {
+        return current_runner()->svm_register(dev_ptr, size);
+    } catch (...) {
+        return nullptr;  // any exception surfaces to the caller as nullptr
+    }
+}
+
+void svm_unregister_via_runner(void *dev_ptr) {
+    try {
+        current_runner()->svm_unregister(dev_ptr);
+    } catch (...) {}  // best-effort: exceptions are dropped and the caller gets no failure signal
+}
+
 static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     try {
         return current_runner()->upload_chip_callable_buffer(static_cast<const ChipCallable *>(callable));
diff --git a/src/common/platform/sim/host/device_runner_base.h b/src/common/platform/sim/host/device_runner_base.h
index 911ad53e2..625d78a51 100644
--- a/src/common/platform/sim/host/device_runner_base.h
+++ b/src/common/platform/sim/host/device_runner_base.h
@@ -104,6 +104,16 @@ class SimDeviceRunnerBase : public L3L2OrchCommBackend {
     int l3_l2_orch_comm_init(void *control_block, size_t control_block_size);
     int l3_l2_orch_comm_shutdown();
 
+    // SVM map/unmap for host_build_graph's host-side orchestrator. On sim,
+    // allocate_tensor returns a plain host pointer, so the "device" address is
+    // already host-readable — svm_register is identity and svm_unregister a
+    // no-op. Mirrors the onboard DeviceRunnerBase API (separate class trees).
+    void *svm_register(void *dev_ptr, size_t bytes) {
+        (void)bytes;
+        return dev_ptr;
+    }
+    void svm_unregister(void *dev_ptr) { (void)dev_ptr; }
+
     int record_device_orch_callable(
         int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name,
         const char *config_name, std::vector<std::pair<int, uint64_t>> kernel_addrs, std::vector<ArgDirection> signature
diff --git a/tests/st/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/tests/st/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp
index bdd81ac51..fafc05815 100644
--- a/tests/st/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp
+++ b/tests/st/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp
@@ -9,17 +9,24 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Tile-based Matrix Multiplication Kernel (Cube Core)
+ * Tiled GEMM kernel (AIC, submit_task / Tensor* ABI)
  *
- * Computes: output = input_a @ input_b (64x64 tile matmul)
- * Uses TMATMUL instruction
+ * Implements: P = A @ B for a single 64x64 tile.
+ *
+ * Args (Tensor*):
+ *   args[0] = A (INPUT)
+ *   args[1] = B (INPUT)
+ *   args[2] = P (OUTPUT)
  */
 
 #include <cstdint>
+
 #include <pto/pto-inst.hpp>
 #include <pto/common/constants.hpp>
 #include <pto/common/pto_tile.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -41,9 +48,13 @@ AICORE constexpr inline T CeilAlign(T num_1, T num_2) {
 }
 
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ float *input_a = reinterpret_cast<__gm__ float *>(args[0]);
-    __gm__ float *input_b = reinterpret_cast<__gm__ float *>(args[1]);
-    __gm__ float *output = reinterpret_cast<__gm__ float *>(args[2]);
+    __gm__ Tensor *a_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *b_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ float *input_a = reinterpret_cast<__gm__ float *>(a_tensor->buffer.addr) + a_tensor->start_offset;
+    __gm__ float *input_b = reinterpret_cast<__gm__ float *>(b_tensor->buffer.addr) + b_tensor->start_offset;
+    __gm__ float *output = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset;
 
     constexpr int TILE = 64;
     constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float);
diff --git a/tests/st/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/tests/st/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp
index 42cf46e09..f33401edf 100644
--- a/tests/st/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp
+++ b/tests/st/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp
@@ -9,16 +9,22 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Tile-based Element-wise Addition Kernel (Vector Core)
+ * Tiled accumulate-add kernel (AIV, submit_task / Tensor* ABI)
  *
- * Computes: output = input_a + input_b (64x64 tile addition)
- * Uses TADD instruction
+ * Implements: C[i] = C[i] + P[i] in place over a single 64x64 tile.
+ *
+ * Args (Tensor*):
+ *   args[0] = C (INOUT)
+ *   args[1] = P (INPUT)
  */
 
 #include <cstdint>
+
 #include <pto/pto-inst.hpp>
 #include <pto/common/constants.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -32,9 +38,11 @@ using namespace pto;
 #endif
 
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ float *input_a = reinterpret_cast<__gm__ float *>(args[0]);
-    __gm__ float *input_b = reinterpret_cast<__gm__ float *>(args[1]);
-    __gm__ float *output = reinterpret_cast<__gm__ float *>(args[2]);
+    __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+
+    __gm__ float *c_ptr = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset;
+    __gm__ float *p_ptr = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset;
 
     constexpr int TILE = 64;
 
@@ -43,25 +51,24 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
     using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
 
-    TileData aTile(TILE, TILE);
-    TileData bTile(TILE, TILE);
+    TileData cTile(TILE, TILE);
+    TileData pTile(TILE, TILE);
     TileData outTile(TILE, TILE);
-    TASSIGN(aTile, 0x0);
-    TASSIGN(bTile, 0x10000);
+    TASSIGN(cTile, 0x0);
+    TASSIGN(pTile, 0x10000);
     TASSIGN(outTile, 0x20000);
 
-    GlobalData aGlobal(input_a);
-    GlobalData bGlobal(input_b);
-    GlobalData outGlobal(output);
+    GlobalData cGlobal(c_ptr);
+    GlobalData pGlobal(p_ptr);
 
-    TLOAD(aTile, aGlobal);
-    TLOAD(bTile, bGlobal);
+    TLOAD(cTile, cGlobal);
+    TLOAD(pTile, pGlobal);
     set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
     wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TADD(outTile, aTile, bTile);
+    TADD(outTile, cTile, pTile);
     set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
     wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(outGlobal, outTile);
+    TSTORE(cGlobal, outTile);
 
     pipe_sync();
 }
diff --git a/tests/st/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/st/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp
index 927a69e8a..a72e7ca08 100644
--- a/tests/st/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp
+++ b/tests/st/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp
@@ -9,143 +9,90 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * BGEMM Orchestration Function (Host Build Graph Runtime)
+ * BGEMM orchestration — submit_task / TensorMap form
  *
- * Builds the task graph for tiled matrix multiplication: C = A @ B
+ * Tiled C = A @ B, tile 64x64, grid 4x4x4.  Per output tile (m,n), for each k:
+ *   P_k = A[m,k] @ B[k,n]   (gemm_tile, AIC)  — fresh runtime-allocated buffer
+ *   C[m,n] += P_k           (tile_add, AIV, INOUT C tile)
  *
- * Configuration:
- *   - Tile size: 64 x 64
- *   - Grid: 4 x 4 x 4 (GRID_M x GRID_K x GRID_N)
+ * All dependencies are TensorMap-derived:
+ *   - gemm_k -> add_k via the freshly-allocated P_k.
+ *   - add_{k-1} -> add_k via the C[m,n] tile (add_inout overlap).
+ * Allocating a fresh P per k (instead of reusing one P buffer per tile) removes the write-after-read
+ * hazard on P, for which the explicit-edge version needed an extra add_{k-1} -> gemm_k edge.
+ * C arrives zero-initialized (pure OUTPUT), so the accumulation is exact.
  *
- * Memory layout (tile-first):
- *   A: [BATCH, GRID_M, GRID_K, TILE_M, TILE_K]
- *   B: [BATCH, GRID_K, GRID_N, TILE_K, TILE_N]
- *   C: [BATCH, GRID_M, GRID_N, TILE_M, TILE_N]
- *
- * Task graph per output tile:
- *   for k in [0, GRID_K):
- *     P = A[m,k] @ B[k,n]    (gemm_tile on Cube core)
- *     C[m,n] = C[m,n] + P    (tile_add on Vector core)
+ * Arg layout: [A (IN), B (IN), C (OUT)] — flattened 1-D tile-first tensors.
  */
 
-#include <iostream>
-#include <vector>
-
-#include "orchestration_api.h"  // NOLINT(build/include_subdir)
-
-extern "C" {
-
-constexpr int TILE = 64;
-constexpr int GRID_M = 4;
-constexpr int GRID_K = 4;
-constexpr int GRID_N = 4;
-constexpr int BATCH = 1;
+#include <stdint.h>
 
-constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float);
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
 
-int build_bgemm_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
-    // Expected orch_args: [A, B, C] — 3 tensors
-    if (orch_args.tensor_count() < 3) {
-        std::cerr << "build_bgemm_graph: Expected at least 3 tensors, got " << orch_args.tensor_count() << '\n';
-        return -1;
-    }
+#define FUNC_GEMM 0
+#define FUNC_TILE_ADD 1
 
-    void *host_A = orch_args.tensor(0).data_as<void>();
-    void *host_B = orch_args.tensor(1).data_as<void>();
-    void *host_C = orch_args.tensor(2).data_as<void>();
-    size_t size_A = orch_args.tensor(0).nbytes();
-    size_t size_B = orch_args.tensor(1).nbytes();
-    size_t size_C = orch_args.tensor(2).nbytes();
-
-    std::cout << "\n=== build_bgemm_graph ===" << '\n';
-    std::cout << "Grid: " << GRID_M << " x " << GRID_K << " x " << GRID_N << '\n';
-
-    // Allocate device memory and copy inputs
-    void *dev_A = device_malloc(runtime, size_A);
-    if (!dev_A) return -1;
-    copy_to_device(runtime, dev_A, host_A, size_A);
-
-    void *dev_B = device_malloc(runtime, size_B);
-    if (!dev_B) {
-        device_free(runtime, dev_A);
-        return -1;
-    }
-    copy_to_device(runtime, dev_B, host_B, size_B);
+extern "C" {
 
-    void *dev_C = device_malloc(runtime, size_C);
-    if (!dev_C) {
-        device_free(runtime, dev_A);
-        device_free(runtime, dev_B);
-        return -1;
-    }
-    copy_to_device(runtime, dev_C, host_C, size_C);
-    record_tensor_pair(runtime, host_C, dev_C, size_C);
+static constexpr int TILE = 64;
+static constexpr int GRID_M = 4;
+static constexpr int GRID_K = 4;
+static constexpr int GRID_N = 4;
+static constexpr int BATCH = 1;
+static constexpr uint32_t TILE_ELEMS = TILE * TILE;
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,
+    };
+}
 
-    // Allocate intermediate P buffers (one per C tile)
-    constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N;
-    std::vector<void *> dev_P(NUM_P_BUFFERS, nullptr);
-    for (int i = 0; i < NUM_P_BUFFERS; i++) {
-        dev_P[i] = device_malloc(runtime, TILE_BYTES);
-        if (!dev_P[i]) {
-            for (int j = 0; j < i; j++) {
-                device_free(runtime, dev_P[j]);
-            }
-            device_free(runtime, dev_A);
-            device_free(runtime, dev_B);
-            device_free(runtime, dev_C);
-            return -1;
-        }
-    }
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &a = orch_args.tensor(0).ref();
+    const Tensor &b = orch_args.tensor(1).ref();
+    const Tensor &c = orch_args.tensor(2).ref();
 
-    // Track last add task for each C tile (for K accumulation dependency)
-    std::vector<int> last_add_task(BATCH * GRID_M * GRID_N, -1);
+    uint32_t tile_shape[1] = {TILE_ELEMS};
+    TensorCreateInfo p_ci(tile_shape, 1, DataType::FLOAT32);
 
-    // Build task graph: 4-level tiling loop
     for (int batch = 0; batch < BATCH; batch++) {
         for (int m_idx = 0; m_idx < GRID_M; m_idx++) {
             for (int n_idx = 0; n_idx < GRID_N; n_idx++) {
                 for (int k_idx = 0; k_idx < GRID_K; k_idx++) {
-                    // Calculate tile offsets
-                    size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES;
-                    size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES;
-                    size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES;
-
-                    int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx;
-
-                    // Task 1: P = A[m,k] @ B[k,n] (gemm_tile on Cube core)
-                    uint64_t args_gemm[6];
-                    args_gemm[0] = reinterpret_cast<uint64_t>(static_cast<char *>(dev_A) + A_offset);
-                    args_gemm[1] = reinterpret_cast<uint64_t>(static_cast<char *>(dev_B) + B_offset);
-                    args_gemm[2] = reinterpret_cast<uint64_t>(dev_P[c_tile_idx]);
-                    args_gemm[3] = TILE;
-                    args_gemm[4] = TILE;
-                    args_gemm[5] = TILE;
-                    int t_gemm = add_task(runtime, args_gemm, 6, 0, CoreType::AIC);
-
-                    // Task 2: C[m,n] = C[m,n] + P (tile_add on Vector core)
-                    uint64_t args_add[5];
-                    args_add[0] = reinterpret_cast<uint64_t>(static_cast<char *>(dev_C) + C_offset);
-                    args_add[1] = reinterpret_cast<uint64_t>(dev_P[c_tile_idx]);
-                    args_add[2] = reinterpret_cast<uint64_t>(static_cast<char *>(dev_C) + C_offset);
-                    args_add[3] = TILE;
-                    args_add[4] = TILE;
-                    int t_add = add_task(runtime, args_add, 5, 1, CoreType::AIV);
-
-                    // Dependency: gemm must complete before add
-                    add_successor(runtime, t_gemm, t_add);
-
-                    // Dependency: previous add must complete before current gemm (K accumulation)
-                    if (last_add_task[c_tile_idx] >= 0) {
-                        add_successor(runtime, last_add_task[c_tile_idx], t_gemm);
-                    }
-                    last_add_task[c_tile_idx] = t_add;
+                    uint32_t a_off[1] = {
+                        static_cast<uint32_t>((batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_ELEMS)
+                    };
+                    uint32_t b_off[1] = {
+                        static_cast<uint32_t>((batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_ELEMS)
+                    };
+                    uint32_t c_off[1] = {
+                        static_cast<uint32_t>((batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_ELEMS)
+                    };
+
+                    Tensor a_view = a.view(tile_shape, a_off);
+                    Tensor b_view = b.view(tile_shape, b_off);
+                    Tensor c_view = c.view(tile_shape, c_off);
+
+                    // P_k = A[m,k] @ B[k,n]
+                    L0TaskArgs p_gemm;
+                    p_gemm.add_input(a_view);
+                    p_gemm.add_input(b_view);
+                    p_gemm.add_output(p_ci);
+                    TaskOutputTensors p_out = rt_submit_aic_task(FUNC_GEMM, p_gemm);
+                    Tensor p = p_out.get_ref(0);
+
+                    // C[m,n] += P_k
+                    L0TaskArgs p_add;
+                    p_add.add_inout(c_view);
+                    p_add.add_input(p);
+                    rt_submit_aiv_task(FUNC_TILE_ADD, p_add);
                 }
             }
         }
     }
 
-    std::cout << "Created " << get_task_count(runtime) << " tasks\n";
-    return 0;
+    LOG_INFO_V9("[bgemm_orch] Submitted tiled C = A @ B");
 }
 
 }  // extern "C"
diff --git a/tests/st/a2a3/host_build_graph/bgemm/test_bgemm.py b/tests/st/a2a3/host_build_graph/bgemm/test_bgemm.py
index d65851df9..8d33972aa 100644
--- a/tests/st/a2a3/host_build_graph/bgemm/test_bgemm.py
+++ b/tests/st/a2a3/host_build_graph/bgemm/test_bgemm.py
@@ -37,7 +37,7 @@ class TestBgemmHostBuildGraph(SceneTestCase):
     CALLABLE = {
         "orchestration": {
             "source": "kernels/orchestration/bgemm_orch.cpp",
-            "function_name": "build_bgemm_graph",
+            "function_name": "aicpu_orchestration_entry",
             "signature": [D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -60,7 +60,7 @@ class TestBgemmHostBuildGraph(SceneTestCase):
         {
             "name": "default",
             "platforms": ["a2a3sim", "a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 24},
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "params": {},
         },
     ]
diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp
index 2383026bf..53d38ad2f 100644
--- a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp
+++ b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp
@@ -8,10 +8,22 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
+/**
+ * Element-wise addition kernel (submit_task / Tensor* ABI)
+ *
+ * Implements: out[i] = src0[i] + src1[i] over a single 128x128 tile.
+ *
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT)
+ *   args[1] = src1 (INPUT)
+ *   args[2] = out  (OUTPUT)
+ */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -25,9 +37,13 @@ using namespace pto;
 #endif
 
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]);
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]);
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]);
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp
index 7ae104641..6804a2b5a 100644
--- a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp
+++ b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp
@@ -8,10 +8,21 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
+/**
+ * In-place scalar addition kernel (submit_task / Tensor* ABI)
+ *
+ * Implements: inout[i] = inout[i] + scalar over a single 128x128 tile.
+ *
+ * Args:
+ *   args[0] = inout (INOUT, Tensor*)
+ *   args[1] = scalar (float bits packed in uint64_t — tensors precede scalars)
+ */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -25,15 +36,17 @@ using namespace pto;
 #endif
 
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ float *inout = reinterpret_cast<__gm__ float *>(args[0]);
+    __gm__ Tensor *inout_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
 
     union {
         uint64_t u64;
         float f32;
     } converter;
-    converter.u64 = args[1];
+    converter.u64 = static_cast<uint64_t>(args[1]);
     float scalar = converter.f32;
 
+    __gm__ float *inout = reinterpret_cast<__gm__ float *>(inout_tensor->buffer.addr) + inout_tensor->start_offset;
+
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
     constexpr int vRows = 128;
diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp
index 8c8d807c4..5cf8045ca 100644
--- a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp
+++ b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp
@@ -9,65 +9,59 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Dump-tensor interface demo for host_build_graph.
+ * Dump-tensor orchestration — submit_task / TensorMap form
  *
- * Demonstrates the two ways to register tensor metadata for dump:
- *   Task 0 (add):                add_task() + set_tensor_info_to_task()
- *   Task 1 (add_scalar_inplace): add_task_with_tensor_info()
+ * Builds: f = (a + b) + 1
+ *   t0: f = a + b            (kernel_add, AIV)
+ *   t1: f = f + 1 in place   (kernel_add_scalar_inplace, AIV, INOUT)
  *
- * Computation: f = (a + b) + 1  (a=2, b=3 → f=6)
+ * The t0 -> t1 dependency is discovered by the TensorMap: t0 produces f
+ * (add_output), t1 reads-and-writes f (add_inout). Per-task tensor metadata
+ * for the dump subsystem is derived automatically from the Arg tensors.
+ *
+ * Arg layout: [a (IN), b (IN), f (OUT)].
  */
 
-#include "orchestration_api.h"  // NOLINT(build/include_subdir)
-
-extern "C" {
+#include <stdint.h>
 
-int build_dump_tensor_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
-    void *host_a = orch_args.tensor(0).data_as<void>();
-    void *host_b = orch_args.tensor(1).data_as<void>();
-    void *host_f = orch_args.tensor(2).data_as<void>();
-    size_t size_a = orch_args.tensor(0).nbytes();
-    size_t size_b = orch_args.tensor(1).nbytes();
-    size_t size_f = orch_args.tensor(2).nbytes();
-    uint32_t size = orch_args.tensor(0).shapes[0];
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
 
-    TensorInfo ext_a_info = make_tensor_info_from_tensor_arg(orch_args.tensor(0));
-    TensorInfo ext_b_info = make_tensor_info_from_tensor_arg(orch_args.tensor(1));
-    TensorInfo ext_f_info = make_tensor_info_from_tensor_arg(orch_args.tensor(2));
+#define FUNC_ADD 0
+#define FUNC_ADD_SCALAR_INPLACE 1
 
-    void *dev_a = device_malloc(runtime, size_a);
-    copy_to_device(runtime, dev_a, host_a, size_a);
+extern "C" {
 
-    void *dev_b = device_malloc(runtime, size_b);
-    copy_to_device(runtime, dev_b, host_b, size_b);
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,
+    };
+}
 
-    void *dev_f = device_malloc(runtime, size_f);
-    record_tensor_pair(runtime, host_f, dev_f, size_f);
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &a = orch_args.tensor(0).ref();
+    const Tensor &b = orch_args.tensor(1).ref();
+    const Tensor &f = orch_args.tensor(2).ref();
 
-    // Task 0: a + b → f  (add_task + set_tensor_info_to_task)
-    uint64_t args_t0[4] = {
-        reinterpret_cast<uint64_t>(dev_a),
-        reinterpret_cast<uint64_t>(dev_b),
-        reinterpret_cast<uint64_t>(dev_f),
-        size,
-    };
-    int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV);
-    TensorInfo t0_info[] = {ext_a_info, ext_b_info, ext_f_info};
-    set_tensor_info_to_task(runtime, t0, t0_info, 3);
+    // t0: f = a + b
+    L0TaskArgs p0;
+    p0.add_input(a);
+    p0.add_input(b);
+    p0.add_output(f);
+    rt_submit_aiv_task(FUNC_ADD, p0);
 
-    // Task 1: f += 1.0  (add_task_with_tensor_info)
+    // t1: f = f + 1 (in place); INOUT establishes the dependency on t0 via f
     union {
         float f32;
         uint64_t u64;
-    } sc;
-    sc.f32 = 1.0f;
-    uint64_t args_t1[3] = {reinterpret_cast<uint64_t>(dev_f), sc.u64, size};
-    TensorInfo t1_info[] = {ext_f_info};
-    int t1 = add_task_with_tensor_info(runtime, args_t1, 3, 1, CoreType::AIV, t1_info, 1);
-
-    add_successor(runtime, t0, t1);
+    } sconv;
+    sconv.f32 = 1.0f;
+    L0TaskArgs p1;
+    p1.add_inout(f);
+    p1.add_scalar(sconv.u64);
+    rt_submit_aiv_task(FUNC_ADD_SCALAR_INPLACE, p1);
 
-    return 0;
+    LOG_INFO_V9("[dump_tensor_orch] Submitted f = (a + b) + 1");
 }
 
 }  // extern "C"
diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py b/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py
index 531d98f5f..e34ec5714 100644
--- a/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py
+++ b/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py
@@ -27,7 +27,7 @@ class TestDumpTensorExample(SceneTestCase):
     CALLABLE = {
         "orchestration": {
             "source": "kernels/orchestration/dump_tensor_orch.cpp",
-            "function_name": "build_dump_tensor_graph",
+            "function_name": "aicpu_orchestration_entry",
             "signature": [D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -50,7 +50,7 @@ class TestDumpTensorExample(SceneTestCase):
         {
             "name": "default",
             "platforms": ["a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
             "params": {},
         },
     ]
diff --git a/tests/st/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp b/tests/st/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp
index 64b5037ae..a54e02dab 100644
--- a/tests/st/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp
+++ b/tests/st/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp
@@ -9,18 +9,22 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Matrix Multiplication Kernel (AIC)
+ * Matrix multiplication kernel (AIC, submit_task / Tensor* ABI)
  *
- * Implements: out = src0 @ src1 (matrix multiplication)
+ * Implements: out = src0 @ src1.  Half precision inputs, float output.
+ * Single 128x128 tile.  Flow: TLOAD -> TMOV -> TMATMUL -> TSTORE.
  *
- * This kernel performs matrix multiplication on AIC (AI Cube) core.
- * Uses half precision input and float output for compatibility with both sim and NPU.
- * Simplified flow: TLOAD -> TMOV -> TMATMUL -> TSTORE
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT, half)
+ *   args[1] = src1 (INPUT, half)
+ *   args[2] = out  (OUTPUT, float)
  */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -38,29 +42,20 @@ constexpr int validM = 128;
 constexpr int validK = 128;
 constexpr int validN = 128;
 
-// Aligned dimensions (align to 16 for half type)
 constexpr int blockAlign = 16;
 constexpr int M = 128;
 constexpr int K = 128;
 constexpr int N = 128;
 
-/**
- * Matrix multiplication kernel implementation
- *
- * Unified signature: all arguments passed via int64_t array
- * @param args  Argument array:
- *              args[0] = src0 pointer (left matrix, MxK, half)
- *              args[1] = src1 pointer (right matrix, KxN, half)
- *              args[2] = out pointer (output matrix, MxN, float)
- *              args[3] = size (number of elements, unused for matmul)
- */
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    // Unpack arguments - half input, float output
-    __gm__ half *src0 = reinterpret_cast<__gm__ half *>(args[0]);
-    __gm__ half *src1 = reinterpret_cast<__gm__ half *>(args[1]);
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]);
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ half *src0 = reinterpret_cast<__gm__ half *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ half *src1 = reinterpret_cast<__gm__ half *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
-    // Global tensor types
     using GlobalDataSrc0 = GlobalTensor<
         half, Shape<1, 1, 1, validM, validK>, Stride<validM * validK, validM * validK, validM * validK, validK, 1>>;
     using GlobalDataSrc1 = GlobalTensor<
@@ -72,11 +67,9 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     GlobalDataSrc1 src1Global(src1);
     GlobalDataOut dstGlobal(out);
 
-    // L1 buffer tiles for loading data (half precision)
     using TileMatAData = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, validM, validK, SLayout::RowMajor, 512>;
     using TileMatBData = Tile<TileType::Mat, half, K, N, BLayout::ColMajor, validK, validN, SLayout::RowMajor, 512>;
 
-    // Cube tiles for matmul - half * half -> float
     using LeftTile = TileLeft<half, M, K, validM, validK>;
     using RightTile = TileRight<half, K, N, validK, validN>;
     using AccTile = TileAcc<float, M, N, validM, validN>;
@@ -93,27 +86,23 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     TASSIGN(bTile, 0x0);
     TASSIGN(cTile, 0x0);
 
-    // TLOAD: Load from GM to L1
     TLOAD(aMatTile, src0Global);
     TLOAD(bMatTile, src1Global);
 
     set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
     wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
 
-    // TMOV: Move from L1 to Cube
     TMOV(aTile, aMatTile);
     TMOV(bTile, bMatTile);
 
     set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
     wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
 
-    // TMATMUL: Matrix multiplication (half * half -> float)
     TMATMUL(cTile, aTile, bTile);
 
     set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
     wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
 
-    // TSTORE: Store result to GM
     TSTORE(dstGlobal, cTile);
 
     pipe_sync();
diff --git a/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp b/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp
index 6c28a78d8..bebb96c7e 100644
--- a/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp
+++ b/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp
@@ -9,17 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Element-wise Add then Exp Kernel
+ * Element-wise add then exp kernel (submit_task / Tensor* ABI)
  *
- * Implements: out[i] = exp(src0[i] + src1[i])
+ * Implements: out[i] = exp(src0[i] + src1[i]).  Single 128x128 tile.
  *
- * This kernel performs element-wise addition of two tensors followed by
- * exponential operation.
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT, float)
+ *   args[1] = src1 (INPUT, float)
+ *   args[2] = out  (OUTPUT, float)
  */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -32,24 +36,15 @@ using namespace pto;
 #define __aicore__ [aicore]
 #endif
 
-/**
- * Add + Exp kernel implementation
- *
- * Unified signature: all arguments passed via int64_t array
- * @param args  Argument array:
- *              args[0] = src0 pointer (first input tensor)
- *              args[1] = src1 pointer (second input tensor)
- *              args[2] = out pointer (output tensor)
- *              args[3] = size (number of elements)
- */
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    // Unpack arguments
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]);
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]);
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]);
-    int size = static_cast<int>(args[3]);
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
-    // Configuration: float, 128, 128, 128, 128
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
     constexpr int vRows = 128;
@@ -60,7 +55,6 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
     using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
 
-    // Optimized memory: only 3 tiles needed (reuse for intermediate and output)
     TileData src0Tile(vRows, vCols);
     TileData src1Tile(vRows, vCols);
     TileData dstTile(vRows, vCols);
@@ -76,9 +70,7 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     TLOAD(src1Tile, src1Global);
     set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
     wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    // Add: result in src0Tile (reuse)
     TADD(src0Tile, src0Tile, src1Tile);
-    // Exp: result in dstTile
     TEXP(dstTile, src0Tile);
     set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
     wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
diff --git a/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp b/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp
index cf6559b99..4293e3316 100644
--- a/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp
+++ b/tests/st/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp
@@ -9,17 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Element-wise Log then Sqrt Kernel
+ * Element-wise log then sqrt kernel (submit_task / Tensor* ABI)
  *
- * Implements: out[i] = sqrt(log(src[i]))
+ * Implements: out[i] = sqrt(log(src[i])).  Both input and output are half
+ * precision for matmul compatibility.  Single 128x128 tile.
  *
- * This kernel performs element-wise natural logarithm followed by square root.
- * Both input and output are half precision for matmul compatibility.
+ * Args (Tensor*):
+ *   args[0] = src (INPUT, half)
+ *   args[1] = out (OUTPUT, half)
  */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -32,28 +36,18 @@ using namespace pto;
 #define __aicore__ [aicore]
 #endif
 
-/**
- * Log + Sqrt kernel implementation (half precision in/out)
- *
- * Unified signature: all arguments passed via int64_t array
- * @param args  Argument array:
- *              args[0] = src pointer (input tensor, half)
- *              args[1] = out pointer (output tensor, half)
- *              args[2] = size (number of elements)
- */
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    // Unpack arguments
-    __gm__ half *src = reinterpret_cast<__gm__ half *>(args[0]);
-    __gm__ half *out = reinterpret_cast<__gm__ half *>(args[1]);
-    int size = static_cast<int>(args[2]);
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+
+    __gm__ half *src = reinterpret_cast<__gm__ half *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+    __gm__ half *out = reinterpret_cast<__gm__ half *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
-    // Configuration
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
     constexpr int vRows = 128;
     constexpr int vCols = 128;
 
-    // Half types for input and output
     using DynShapeDim5Half = Shape<1, 1, 1, vRows, vCols>;
     using DynStridDim5Half = Stride<1, 1, 1, kTCols_, 1>;
     using GlobalDataHalf = GlobalTensor<half, DynShapeDim5Half, DynStridDim5Half>;
diff --git a/tests/st/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/tests/st/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp
index 96e8e39be..9e6e2e8bb 100644
--- a/tests/st/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp
+++ b/tests/st/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp
@@ -9,169 +9,81 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Matmul Example Orchestration Function Implementation
+ * Matmul diamond orchestration — submit_task / TensorMap form
  *
- * Builds the task graph for formula: F = exp(sqrt(log(A)) @ W1 + sqrt(log(A)) @ W2)
+ * Builds: F = exp(sqrt(log(A)) @ W1 + sqrt(log(A)) @ W2)
  *
- * Task graph (diamond topology):
- *       t0 (sqrt+log, AIV)
+ *       t0 (sqrt(log), AIV)
  *      /  \
  *    t1    t2   (matmul, AIC)
  *      \  /
  *       t3 (add+exp, AIV)
  *
- * This orchestration function:
- * 1. Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes)
- * 2. Allocates device memory via orchestration API helpers
- * 3. Copies input data to device via orchestration API helpers
- * 4. Records output tensor for copy-back during finalize
- * 5. Builds the task graph with 4 tasks (2 AIV + 2 AIC)
+ * Dependencies are discovered by the TensorMap from the add_input/add_output
+ * directions. Intermediate b is FP16 (matmul input); c, d are FP32.
+ *
+ * Arg layout: [a (IN, fp16), w1 (IN, fp16), w2 (IN, fp16), f (OUT, fp32)].
  */
 
-#include <cstdint>
-#include <iostream>
-
-#include "orchestration_api.h"  // NOLINT(build/include_subdir)
-
-extern "C" {
-
-int build_matmul_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
-    // Validate argument count
-    // Expected orch_args: [a, w1, w2, f] — 4 tensors
-    if (orch_args.tensor_count() < 4) {
-        std::cerr << "build_matmul_graph: Expected at least 4 tensors, got " << orch_args.tensor_count() << '\n';
-        return -1;
-    }
-
-    // Extract host pointers and sizes from tensor metadata
-    void *host_a = orch_args.tensor(0).data_as<void>();
-    void *host_w1 = orch_args.tensor(1).data_as<void>();
-    void *host_w2 = orch_args.tensor(2).data_as<void>();
-    void *host_f = orch_args.tensor(3).data_as<void>();
-    size_t size_a = orch_args.tensor(0).nbytes();
-    size_t size_w1 = orch_args.tensor(1).nbytes();
-    size_t size_w2 = orch_args.tensor(2).nbytes();
-    size_t size_f = orch_args.tensor(3).nbytes();
-    uint32_t SIZE = orch_args.tensor(0).shapes[0];
-
-    std::cout << "\n=== build_matmul_graph: Creating Task Runtime ===" << '\n';
-    std::cout << "Formula: F = exp(sqrt(log(A)) @ W1 + sqrt(log(A)) @ W2)\n";
-    std::cout << "SIZE: " << SIZE << " elements\n";
-
-    // Allocate device memory and copy inputs
-    std::cout << "\n=== Allocating Device Memory ===" << '\n';
-
-    void *dev_a = device_malloc(runtime, size_a);
-    if (!dev_a) {
-        std::cerr << "Error: Failed to allocate device memory for A\n";
-        return -1;
-    }
-    copy_to_device(runtime, dev_a, host_a, size_a);
-    std::cout << "Tensor A: " << size_a << " bytes copied to device\n";
-
-    void *dev_w1 = device_malloc(runtime, size_w1);
-    if (!dev_w1) {
-        std::cerr << "Error: Failed to allocate device memory for W1\n";
-        device_free(runtime, dev_a);
-        return -1;
-    }
-    copy_to_device(runtime, dev_w1, host_w1, size_w1);
-    std::cout << "Tensor W1: " << size_w1 << " bytes copied to device\n";
+#include <stdint.h>
 
-    void *dev_w2 = device_malloc(runtime, size_w2);
-    if (!dev_w2) {
-        std::cerr << "Error: Failed to allocate device memory for W2\n";
-        device_free(runtime, dev_a);
-        device_free(runtime, dev_w1);
-        return -1;
-    }
-    copy_to_device(runtime, dev_w2, host_w2, size_w2);
-    std::cout << "Tensor W2: " << size_w2 << " bytes copied to device\n";
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
 
-    void *dev_f = device_malloc(runtime, size_f);
-    if (!dev_f) {
-        std::cerr << "Error: Failed to allocate device memory for F\n";
-        device_free(runtime, dev_a);
-        device_free(runtime, dev_w1);
-        device_free(runtime, dev_w2);
-        return -1;
-    }
-    // Record output tensor for copy-back during finalize
-    record_tensor_pair(runtime, host_f, dev_f, size_f);
-    std::cout << "Tensor F (output): " << size_f << " bytes allocated\n";
+#define FUNC_LOG_SQRT 0
+#define FUNC_MATMUL 1
+#define FUNC_ADD_EXP 2
 
-    // Allocate intermediate tensors (b, c, d)
-    // dev_b is half precision (output of log_sqrt kernel, input to matmul)
-    // dev_c, dev_d are float precision (output of matmul kernels)
-    size_t BYTES_HALF = SIZE * sizeof(uint16_t);        // half = 2 bytes
-    size_t BYTES_FLOAT = SIZE * sizeof(float);          // float = 4 bytes
-    void *dev_b = device_malloc(runtime, BYTES_HALF);   // sqrt(log(A)) - half output
-    void *dev_c = device_malloc(runtime, BYTES_FLOAT);  // B @ W1 - float output
-    void *dev_d = device_malloc(runtime, BYTES_FLOAT);  // B @ W2 - float output
-
-    if (!dev_b || !dev_c || !dev_d) {
-        std::cerr << "Error: Failed to allocate intermediate tensors\n";
-        device_free(runtime, dev_a);
-        device_free(runtime, dev_w1);
-        device_free(runtime, dev_w2);
-        device_free(runtime, dev_f);
-        if (dev_b) device_free(runtime, dev_b);
-        if (dev_c) device_free(runtime, dev_c);
-        if (dev_d) device_free(runtime, dev_d);
-        return -1;
-    }
-
-    std::cout << "Allocated intermediate tensors: B (" << BYTES_HALF << " bytes, half), C (" << BYTES_FLOAT
-              << " bytes, float), D (" << BYTES_FLOAT << " bytes, float)\n";
-
-    // Task 0: B = sqrt(log(A)) (func_id=0: kernel_log_sqrt, AIV)
-    uint64_t args_t0[3];
-    args_t0[0] = reinterpret_cast<uint64_t>(dev_a);  // src
-    args_t0[1] = reinterpret_cast<uint64_t>(dev_b);  // out
-    args_t0[2] = SIZE;                               // size
-    int t0 = add_task(runtime, args_t0, 3, 0, CoreType::AIV);
-
-    // Task 1: C = B @ W1 (func_id=1: kernel_matmul, AIC)
-    uint64_t args_t1[4];
-    args_t1[0] = reinterpret_cast<uint64_t>(dev_b);   // src0 (left matrix)
-    args_t1[1] = reinterpret_cast<uint64_t>(dev_w1);  // src1 (right matrix)
-    args_t1[2] = reinterpret_cast<uint64_t>(dev_c);   // out
-    args_t1[3] = SIZE;                                // size
-    int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIC);
-
-    // Task 2: D = B @ W2 (func_id=1: kernel_matmul, AIC)
-    uint64_t args_t2[4];
-    args_t2[0] = reinterpret_cast<uint64_t>(dev_b);   // src0 (left matrix)
-    args_t2[1] = reinterpret_cast<uint64_t>(dev_w2);  // src1 (right matrix)
-    args_t2[2] = reinterpret_cast<uint64_t>(dev_d);   // out
-    args_t2[3] = SIZE;                                // size
-    int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIC);
-
-    // Task 3: F = exp(C + D) (func_id=2: kernel_add_exp, AIV)
-    uint64_t args_t3[4];
-    args_t3[0] = reinterpret_cast<uint64_t>(dev_c);  // src0
-    args_t3[1] = reinterpret_cast<uint64_t>(dev_d);  // src1
-    args_t3[2] = reinterpret_cast<uint64_t>(dev_f);  // out
-    args_t3[3] = SIZE;                               // size
-    int t3 = add_task(runtime, args_t3, 4, 2, CoreType::AIV);
-
-    // Add dependencies (diamond: t0→t1→t3, t0→t2→t3)
-    add_successor(runtime, t0, t1);  // t0 → t1
-    add_successor(runtime, t0, t2);  // t0 → t2
-    add_successor(runtime, t1, t3);  // t1 → t3
-    add_successor(runtime, t2, t3);  // t2 → t3
-
-    std::cout << "\nTasks:\n";
-    std::cout << "  task" << t0 << ": B = sqrt(log(A))   [AIV]\n";
-    std::cout << "  task" << t1 << ": C = B @ W1         [AIC]\n";
-    std::cout << "  task" << t2 << ": D = B @ W2         [AIC]\n";
-    std::cout << "  task" << t3 << ": F = exp(C + D)     [AIV]\n";
-    std::cout << "Dependencies: t0→t1→t3, t0→t2→t3 (diamond)\n";
+extern "C" {
 
-    std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n";
-    print_runtime(runtime);
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // the returned config is fixed; the arguments are not inspected
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 4,  // a, w1, w2, f
+    };
+}
 
-    return 0;
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &a = orch_args.tensor(0).ref();   // input A
+    const Tensor &w1 = orch_args.tensor(1).ref();  // right-hand matrix of task1
+    const Tensor &w2 = orch_args.tensor(2).ref();  // right-hand matrix of task2
+    const Tensor &f = orch_args.tensor(3).ref();  // external output, written in place
+
+    uint32_t SIZE = a.shapes[0];  // extent of a's first dimension; reused for b, c and d
+    uint32_t shapes[1] = {SIZE};  // 1-D shape shared by every intermediate tensor
+    TensorCreateInfo b_ci(shapes, 1, DataType::FLOAT16);  // sqrt(log(A)) — half
+    TensorCreateInfo cd_ci(shapes, 1, DataType::FLOAT32);  // matmul results c and d — float
+
+    // task0 (AIV): b = sqrt(log(a)); b is the task's output, described by b_ci
+    L0TaskArgs p0;
+    p0.add_input(a);
+    p0.add_output(b_ci);
+    TaskOutputTensors b_out = rt_submit_aiv_task(FUNC_LOG_SQRT, p0);
+    Tensor b = b_out.get_ref(0);
+
+    // task1 (AIC): c = b @ w1 — takes task0's output b as an input
+    L0TaskArgs p1;
+    p1.add_input(b);
+    p1.add_input(w1);
+    p1.add_output(cd_ci);
+    TaskOutputTensors c_out = rt_submit_aic_task(FUNC_MATMUL, p1);
+    Tensor c = c_out.get_ref(0);
+
+    // task2 (AIC): d = b @ w2 — second consumer of b, same kernel as task1
+    L0TaskArgs p2;
+    p2.add_input(b);
+    p2.add_input(w2);
+    p2.add_output(cd_ci);
+    TaskOutputTensors d_out = rt_submit_aic_task(FUNC_MATMUL, p2);
+    Tensor d = d_out.get_ref(0);
+
+    // task3 (AIV): f = exp(c + d) — consumes both matmul results and writes the caller's f
+    L0TaskArgs p3;
+    p3.add_input(c);
+    p3.add_input(d);
+    p3.add_output(f);
+    rt_submit_aiv_task(FUNC_ADD_EXP, p3);  // returned output handles are not needed
+
+    LOG_INFO_V9("[matmul_orch] Submitted 4-task diamond");
 }
 
 }  // extern "C"
diff --git a/tests/st/a2a3/host_build_graph/matmul/test_matmul.py b/tests/st/a2a3/host_build_graph/matmul/test_matmul.py
index e503629fe..e2d053a04 100644
--- a/tests/st/a2a3/host_build_graph/matmul/test_matmul.py
+++ b/tests/st/a2a3/host_build_graph/matmul/test_matmul.py
@@ -29,7 +29,7 @@ class TestMatmulHostBuildGraph(SceneTestCase):
     CALLABLE = {
         "orchestration": {
             "source": "kernels/orchestration/matmul_orch.cpp",
-            "function_name": "build_matmul_graph",
+            "function_name": "aicpu_orchestration_entry",
             "signature": [D.IN, D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -58,7 +58,7 @@ class TestMatmulHostBuildGraph(SceneTestCase):
         {
             "name": "default",
             "platforms": ["a2a3sim", "a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
             "params": {},
         },
     ]
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
index 0288f96a5..0220a6bbb 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -21,6 +21,8 @@
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -34,19 +36,19 @@ using namespace pto;
 #endif
 
 template <int M, int K, int N>
-static __aicore__ void pv_matmul_impl(__gm__ uint8_t *pij_raw, __gm__ uint8_t *vj_raw, __gm__ uint8_t *oi_raw) {
-    __gm__ bfloat16_t *pij = reinterpret_cast<__gm__ bfloat16_t *>(pij_raw);
-    __gm__ bfloat16_t *vj = reinterpret_cast<__gm__ bfloat16_t *>(vj_raw);
-    __gm__ float *oi = reinterpret_cast<__gm__ float *>(oi_raw);
+static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) {
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr);
+    __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
 
     // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32
     using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
     using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
     using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
 
-    GlobalA pijGlobal(pij);
-    GlobalB vjGlobal(vj);
-    GlobalOut oiGlobal(oi);
+    GlobalA pijGlobal(pij_addr + pij->start_offset);
+    GlobalB vjGlobal(vj_addr + vj->start_offset);
+    GlobalOut oiGlobal(oi_addr + oi->start_offset);
 
     // L1 Mat tiles: standard ND pattern for both A and B
     using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
@@ -69,15 +71,17 @@ static __aicore__ void pv_matmul_impl(__gm__ uint8_t *pij_raw, __gm__ uint8_t *v
     TASSIGN(bTile, 0x0);
     TASSIGN(cTile, 0x0);
 
-    // Load pij and vj to L1
+    // Load pij and vj to L1 with separate events for pipeline overlap
     TLOAD(aMatTile, pijGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
     TLOAD(bMatTile, vjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
 
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    // Move A to L0A as soon as A load completes (B may still be loading)
     wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    // Move to L0A/L0B
     TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
     TMOV(bTile, bMatTile);
 
     set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
@@ -95,16 +99,12 @@ static __aicore__ void pv_matmul_impl(__gm__ uint8_t *pij_raw, __gm__ uint8_t *v
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *pij = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    __gm__ uint8_t *vj = reinterpret_cast<__gm__ uint8_t *>(args[1]);
-    __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-    int q_tile_size = static_cast<int>(args[3]);
-    // args[4] = block_size, args[5] = head_dim
-
-    int block_size = static_cast<int>(args[4]);
-    int head_dim = static_cast<int>(args[5]);
+    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    uint64_t q_tile_size = static_cast<uint64_t>(pij->shapes[0]);
 
-    if (q_tile_size == 16 && block_size == 16) {
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
         pv_matmul_impl<16, 16, 16>(pij, vj, oi_new);
     } else if (q_tile_size == 16) {
         pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
index 6fcb4f04a..efd423bd6 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -21,6 +21,8 @@
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -34,10 +36,10 @@ using namespace pto;
 #endif
 
 template <int M, int K, int N>
-static __aicore__ void qk_matmul_impl(__gm__ uint8_t *qi_raw, __gm__ uint8_t *kj_raw, __gm__ uint8_t *sij_raw) {
-    __gm__ bfloat16_t *qi = reinterpret_cast<__gm__ bfloat16_t *>(qi_raw);
-    __gm__ bfloat16_t *kj = reinterpret_cast<__gm__ bfloat16_t *>(kj_raw);
-    __gm__ float *sij = reinterpret_cast<__gm__ float *>(sij_raw);
+static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) {
+    __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr);
+    __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr);
+    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
 
     // qi (M, K) bf16 in ND (row-major) layout
     using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
@@ -45,9 +47,9 @@ static __aicore__ void qk_matmul_impl(__gm__ uint8_t *qi_raw, __gm__ uint8_t *kj
     using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
     using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
 
-    GlobalA qiGlobal(qi);
-    GlobalB kjGlobal(kj);
-    GlobalOut sijGlobal(sij);
+    GlobalA qiGlobal(qi_addr + qi->start_offset);
+    GlobalB kjGlobal(kj_addr + kj->start_offset);
+    GlobalOut sijGlobal(sij_addr + sij->start_offset);
 
     // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor)
     using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
@@ -70,21 +72,23 @@ static __aicore__ void qk_matmul_impl(__gm__ uint8_t *qi_raw, __gm__ uint8_t *kj
     TASSIGN(bTile, 0x0);
     TASSIGN(cTile, 0x0);
 
-    // Load qi and kj to L1
+    // Load A and B to L1 with separate events for pipeline overlap
     TLOAD(aMatTile, qiGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
     TLOAD(bMatTile, kjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
 
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    // Move A to L0A as soon as A load completes (B may still be loading)
     wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    // Move to L0A/L0B
     TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
     TMOV(bTile, bMatTile);
 
     set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
     wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
 
-    // Single matmul: (M,K) x (K,N) -> (M,N)
+    // Matmul
     TMATMUL(cTile, aTile, bTile);
 
     set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
@@ -96,16 +100,12 @@ static __aicore__ void qk_matmul_impl(__gm__ uint8_t *qi_raw, __gm__ uint8_t *kj
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *qi = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    __gm__ uint8_t *kj = reinterpret_cast<__gm__ uint8_t *>(args[1]);
-    __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-    int q_tile_size = static_cast<int>(args[3]);
-    // args[4] = head_dim (128), args[5] = block_size
-
-    int head_dim = static_cast<int>(args[4]);
-    int block_size = static_cast<int>(args[5]);
+    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
 
-    if (q_tile_size == 16 && head_dim == 16) {
+    if (q_tile_size == 16 && qi->shapes[1] <= 16) {
         qk_matmul_impl<16, 16, 16>(qi, kj, sij);
     } else if (q_tile_size == 16) {
         qk_matmul_impl<16, 128, 128>(qi, kj, sij);
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
index dfa61577b..ded4dcad8 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
+++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -14,15 +14,17 @@
 //   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
 //   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
 //
-// Scalar layout strategy:
-//   M scalar floats stored contiguously in GM can be loaded as either:
-//   - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD)
-//   - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV)
-//   Conversion between layouts uses GM round-trip: ND TSTORE → DN TLOAD.
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -37,22 +39,17 @@ using namespace pto;
 
 template <int M, int N>
 static __aicore__ void online_update_impl(
-    __gm__ uint8_t *mij_raw, __gm__ uint8_t *lij_raw, __gm__ uint8_t *oi_new_raw, __gm__ uint8_t *mi_raw,
-    __gm__ uint8_t *li_raw, __gm__ uint8_t *oi_raw, int is_first, int is_last, __gm__ uint8_t *dst_raw
+    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
+    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
 ) {
-    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij_raw);
-    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij_raw);
-    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new_raw);
-    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi_raw);
-    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li_raw);
-    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi_raw);
-    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst_raw);
-
-    // Scalar tile dimensions for RowMajor layout:
-    // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block)
-    // kScalarRows = M / 8 (M=16 → 2 rows, M=64 → 8 rows)
-    constexpr int kScalarCols = 32 / sizeof(float);
-    constexpr int kScalarRows = M / kScalarCols;
+    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
+    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
+    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
+    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
+    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
+    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
+
     // Aligned rows for ColMajor DN tiles (32-byte alignment)
     constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
 
@@ -61,77 +58,84 @@ static __aicore__ void online_update_impl(
     // Data (M, N) RowMajor
     using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
 
-    // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
     using GlobalScalarND =
         GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
 
-    // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor
-    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
-
     // --- GlobalTensor instances ---
 
-    GlobalDataMxN oiNewGlobal(oi_new_ptr);
-    GlobalDataMxN oiGlobal(oi_ptr);
-    GlobalDataMxN dstGlobal(dst_ptr);
+    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
+    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
+    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
 
-    // ND globals for scalar element-wise operations
-    GlobalScalarND mijGlobalND(mij_ptr);
-    GlobalScalarND lijGlobalND(lij_ptr);
-    GlobalScalarND miGlobalND(mi_ptr);
-    GlobalScalarND liGlobalND(li_ptr);
+    // DN globals for loading scalars as ColMajor
+    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
+    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
+    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
 
-    // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE)
-    GlobalScalarDN mijGlobalDN(mij_ptr);
-    GlobalScalarDN lijGlobalDN(lij_ptr);
-    GlobalScalarDN liGlobalDN(li_ptr);
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
 
     // --- Tile types ---
 
     using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
     using TileScalarND =
         Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
-    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
 
     // --- UB memory layout ---
 
     constexpr int kDataBytes = M * N * sizeof(float);
-    constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float);
     constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
 
     // Data tiles
     TileDataMxN oiNewTile;
     TileDataMxN oiTile;
 
-    // Scalar ND tiles for element-wise arithmetic
-    TileScalarND mijND, lijND, miND, liND;
-    TileScalarND miNewND, alphaND, betaND, tmpND;
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
 
-    // Scalar DN tiles for TROWEXPAND operations
-    TileScalarDN alphaDN, betaDN, liDN;
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
 
     TASSIGN(oiNewTile, 0);
     TASSIGN(oiTile, kDataBytes);
-    TASSIGN(mijND, 2 * kDataBytes);
-    TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes);
-    TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes);
-    TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes);
-    TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes);
-    TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes);
-    TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes);
-    TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes);
-    TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes);
-    TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes);
-    TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
 
     if (is_first) {
         // --- First block: copy inputs to accumulators ---
         TLOAD(oiNewTile, oiNewGlobal);
-        TLOAD(mijND, mijGlobalND);
-        TLOAD(lijND, lijGlobalND);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
 
-        // Passthrough to MTE3 (no V compute needed)
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to the same UB as DN tiles for storing as ND format
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
         set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
         wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
         TSTORE(miGlobalND, mijND);    // mi = mij
@@ -140,13 +144,10 @@ static __aicore__ void online_update_impl(
 
         if (is_last) {
             // Single block: normalize dst = oi_new / lij
-            // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV
-            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-            TLOAD(liDN, liGlobalDN);
-            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-            TROWEXPANDDIV(oiNewTile, oiNewTile, liDN);
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
             set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
             wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
             TSTORE(dstGlobal, oiNewTile);
@@ -154,96 +155,98 @@ static __aicore__ void online_update_impl(
     } else {
         // --- Subsequent blocks: accumulate ---
 
-        // Phase 1: Load all inputs
+        // Load all inputs
         TLOAD(oiNewTile, oiNewGlobal);
         TLOAD(oiTile, oiGlobal);
-        TLOAD(mijND, mijGlobalND);
-        TLOAD(lijND, lijGlobalND);
-        TLOAD(miND, miGlobalND);
-        TLOAD(liND, liGlobalND);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
 
-        // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols)
-        // pipe_barrier(PIPE_V) required between each dependent vector operation
-        // to resolve RAW hazards on shared UB tiles.
-        TMAX(miNewND, miND, mijND);  // mi_new = max(mi, mij)
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);  // mi_new = max(mi, mij)
         pipe_barrier(PIPE_V);
-        TSUB(alphaND, miND, miNewND);  // alpha = mi - mi_new
+        TSUB(alphaRow, miRow, miNewRow);  // alpha_exp = mi - mi_new
         pipe_barrier(PIPE_V);
-        TEXP(alphaND, alphaND);  // alpha = exp(mi - mi_new)
+        TEXP(alphaRow, alphaRow);  // alpha = exp(mi - mi_new)
         pipe_barrier(PIPE_V);
-        TSUB(betaND, mijND, miNewND);  // beta = mij - mi_new
+        TSUB(betaRow, mijRow, miNewRow);  // beta_exp = mij - mi_new
         pipe_barrier(PIPE_V);
-        TEXP(betaND, betaND);  // beta = exp(mij - mi_new)
+        TEXP(betaRow, betaRow);  // beta = exp(mij - mi_new)
         pipe_barrier(PIPE_V);
-        TMUL(liND, alphaND, liND);  // li = alpha * li
+        TMUL(tmpRow, alphaRow, liRow);  // alpha * li
         pipe_barrier(PIPE_V);
-        TMUL(tmpND, betaND, lijND);  // tmp = beta * lij
+        TMUL(liNewRow, betaRow, lijRow);  // beta * lij
         pipe_barrier(PIPE_V);
-        TADD(liND, liND, tmpND);  // li = alpha * li + beta * lij (= li_new)
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
 
-        // Phase 3: Store scalar results to GM (ND format)
-        // mi_new → mi accumulator, li_new → li accumulator
-        // alpha → mij buffer (reuse), beta → lij buffer (reuse)
-        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        TSTORE(miGlobalND, miNewND);   // persist mi_new
-        TSTORE(liGlobalND, liND);      // persist li_new
-        TSTORE(mijGlobalND, alphaND);  // temp: alpha to mij buffer
-        TSTORE(lijGlobalND, betaND);   // temp: beta to lij buffer
-
-        // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN
-        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-        TLOAD(alphaDN, mijGlobalDN);  // alpha from mij buffer as DN
-        TLOAD(betaDN, lijGlobalDN);   // beta from lij buffer as DN
-        if (is_last) {
-            TLOAD(liDN, liGlobalDN);  // li_new from li buffer as DN
-        }
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
 
-        // Phase 5: Scale data tiles using row-broadcast multiply
+        // Scale data tiles using row-broadcast multiply
         TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
         TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
         pipe_barrier(PIPE_V);
         TADD(oiTile, oiTile, oiNewTile);  // oi = alpha*oi + beta*oi_new
 
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
         if (is_last) {
-            // Phase 6: Normalize and output
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
             pipe_barrier(PIPE_V);
-            TROWEXPANDDIV(oiTile, oiTile, liDN);  // dst = oi / li_new
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
             TSTORE(dstGlobal, oiTile);
         } else {
-            // Phase 6: Store updated accumulators
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
             TSTORE(oiGlobal, oiTile);
         }
     }
-
     pipe_sync();
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[1]);
-    __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-    __gm__ uint8_t *mi = reinterpret_cast<__gm__ uint8_t *>(args[3]);
-    __gm__ uint8_t *li = reinterpret_cast<__gm__ uint8_t *>(args[4]);
-    __gm__ uint8_t *oi = reinterpret_cast<__gm__ uint8_t *>(args[5]);
-    int is_first = static_cast<int>(args[6]);
-    int is_last = static_cast<int>(args[7]);
-    __gm__ uint8_t *dst = reinterpret_cast<__gm__ uint8_t *>(args[8]);
-    int q_tile_size = static_cast<int>(args[9]);
-    // args[10] = head_dim (128)
-
-    int head_dim = static_cast<int>(args[10]);
-
-    if (q_tile_size == 16 && head_dim == 16) {
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
+    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
+    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
+    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+
+    if (q_tile_size == 16 && oi_new->shapes[1] <= 16) {
         online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
     } else if (q_tile_size == 16) {
         online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
index f91b1d71b..8f0c41775 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -28,6 +28,8 @@
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -42,13 +44,13 @@ using namespace pto;
 
 template <int M, int N>
 static __aicore__ void softmax_prepare_impl(
-    __gm__ uint8_t *sij_raw, float scale_value, __gm__ uint8_t *pij_raw, __gm__ uint8_t *mij_raw,
-    __gm__ uint8_t *lij_raw, int valid_len
+    __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij
 ) {
-    __gm__ float *sij = reinterpret_cast<__gm__ float *>(sij_raw);
-    __gm__ bfloat16_t *pij = reinterpret_cast<__gm__ bfloat16_t *>(pij_raw);
-    __gm__ float *mij = reinterpret_cast<__gm__ float *>(mij_raw);
-    __gm__ float *lij = reinterpret_cast<__gm__ float *>(lij_raw);
+    uint64_t valid_len = static_cast<uint64_t>(sij->shapes[1]);
+    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
+    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
 
     constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
 
@@ -56,10 +58,10 @@ static __aicore__ void softmax_prepare_impl(
     using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
     using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
 
-    GlobalDataMxN sijGlobal(sij);
-    GlobalDataMxN_bf16 pijGlobal(pij);
-    GlobalScalarDN mijGlobal(mij);
-    GlobalScalarDN lijGlobal(lij);
+    GlobalDataMxN sijGlobal(sij_addr + sij->start_offset);
+    GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset);
+    GlobalScalarDN mijGlobal(mij_addr + mij->start_offset);
+    GlobalScalarDN lijGlobal(lij_addr + lij->start_offset);
 
     // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary
     using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
@@ -90,6 +92,7 @@ static __aicore__ void softmax_prepare_impl(
     TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
 
     // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks
+    // Columns past valid_len are expected to be masked afterwards (see the dynamic-cols tile / TFILLPAD).
     TLOAD(sijTile, sijGlobal);
     set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
     wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
@@ -100,45 +103,54 @@ static __aicore__ void softmax_prepare_impl(
     pipe_barrier(PIPE_V);
 
     TMULS(sijTile, sijTile, scale_value);
+    pipe_barrier(PIPE_V);
     TROWMAX(maxTile, sijTile, tmpTile);
+    pipe_barrier(PIPE_V);
     TROWEXPANDSUB(pijTile, sijTile, maxTile);
+    pipe_barrier(PIPE_V);
     TEXP(pijTile, pijTile);
-    // Truncate pij to bf16 first, then compute lij from truncated values (matches golden)
+    // Truncate pij to bf16 first
+    pipe_barrier(PIPE_V);
     TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);  // pij bf16 ready, can store early
+
+    // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel
+    pipe_barrier(PIPE_V);
     TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
+    pipe_barrier(PIPE_V);
     TROWSUM(sumTile, pijTile, tmpTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);  // sum ready
 
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    // Store pij (overlaps with TCVT + TROWSUM above)
     wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(pijGlobal, pijBf16Tile);
+
+    // Store max and sum
     TSTORE(mijGlobal, maxTile);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
     TSTORE(lijGlobal, sumTile);
-    TSTORE(pijGlobal, pijBf16Tile);
 
     pipe_sync();
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[0]);
+    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]);  // args[0..3] are Tensor descriptors
+    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
     union {
         uint64_t u;
         float f;
     } scale_conv;
-    scale_conv.u = static_cast<uint64_t>(args[1]);
+    scale_conv.u = static_cast<uint64_t>(args[4]);  // args[4]: scale bits, read back as float via the union
     float scale_value = scale_conv.f;
-    __gm__ uint8_t *pij = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-    __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[3]);
-    __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[4]);
-    int q_tile_size = static_cast<int>(args[5]);
-    // args[6] = block_size
-    int valid_len = static_cast<int>(args[7]);
-
-    int block_size = static_cast<int>(args[6]);
+    uint64_t q_tile_size = static_cast<uint64_t>(sij->shapes[0]);  // tile rows come from sij's first dimension
 
-    if (q_tile_size == 16 && block_size == 16) {
-        softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij, valid_len);
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {  // pick the <M, N> instantiation from the tensor shapes
+        softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij);
     } else if (q_tile_size == 16) {
-        softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij, valid_len);
+        softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij);
     } else {
-        softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij, valid_len);
+        softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij);  // any other row count uses the 64x64 path
     }
 }
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 142921213..2ed86cdf2 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -9,247 +9,284 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Paged Attention Orchestration - Production Scale
+ * Paged Attention Orchestration Function (host_build_graph variant)
  *
- * Supports production-scale paged attention with:
- *   Query: (batch, q_head_num, head_dim) bf16
- *   Key:   (total_blocks, block_size, kv_head_num, head_dim) bf16 (NOT transposed)
- *   Value: (total_blocks, block_size, kv_head_num, head_dim) bf16
- *   Output: (batch, q_head_num, head_dim) float32
+ * Tile sizes are derived at run time from tensor metadata (q_tile = min(num_heads, 128)).
+ * Each KV block contributes one QK matmul producing a (q_tile, block_size) score tile.
  *
- * Head tiling: q_tile_size = min(num_heads, 128)
- * GQA: kv_head_num can differ from q_head_num
- *
- * ChipStorageTaskArgs layout: tensors=[query, key_cache, value_cache, block_table, context_lens, out], scalars=[scale]
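+ * Memory Layout (dimensions are read from tensor metadata at run time):
+ *   Query: (batch, num_heads, head_dim) - flattened to (batch * num_heads, head_dim)
+ *   Key:   (total_blocks, block_size, ...) - flattened to (total_blocks * block_size, head_dim)
+ *   Value: same flattened 2D view as Key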
  */
 
 #include <algorithm>
+#include <cinttypes>
+#include <cstdint>
 #include <cstring>
-#include <iostream>
 
-#include "orchestration_api.h"  // NOLINT(build/include_subdir)
+#include "pto_orchestration_api.h"
 
 #define FUNC_QK_MATMUL 0
 #define FUNC_SOFTMAX_PREPARE 1
 #define FUNC_PV_MATMUL 2
 #define FUNC_ONLINE_UPDATE 3
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
 
-extern "C" {
-
-int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
-    if (orch_args.tensor_count() < 6) {
-        std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n';
-        return -1;
-    }
-
-    // Extract host pointers from tensor metadata
-    void *host_query = orch_args.tensor(0).data_as<void>();
-    void *host_key_cache = orch_args.tensor(1).data_as<void>();
-    void *host_value_cache = orch_args.tensor(2).data_as<void>();
-    int *host_block_table = orch_args.tensor(3).data_as<int>();
-    int *host_context_lens = orch_args.tensor(4).data_as<int>();
-    void *host_out = orch_args.tensor(5).data_as<void>();
-
-    // Extract sizes from tensor metadata
-    size_t query_size = orch_args.tensor(0).nbytes();
-    size_t key_cache_size = orch_args.tensor(1).nbytes();
-    size_t value_cache_size = orch_args.tensor(2).nbytes();
-    size_t out_size = orch_args.tensor(5).nbytes();
-
-    // Read dimensions from tensor shapes
-    // query: (batch, num_heads, head_dim)
-    uint32_t batch = orch_args.tensor(0).shapes[0];
-    uint32_t num_heads = orch_args.tensor(0).shapes[1];
-    uint32_t head_dim = orch_args.tensor(0).shapes[2];
-
-    // key_cache: (total_blocks, block_size, kv_head_num, head_dim)
-    uint32_t block_size = orch_args.tensor(1).shapes[1];
-    uint32_t kv_head_num = orch_args.tensor(1).shapes[2];
-
-    // block_table: (batch, max_num_blocks_per_req)
-    uint32_t max_num_blocks = orch_args.tensor(3).shapes[1];
-
-    // scale: first scalar argument (reinterpret uint64_t bits as float)
-    uint64_t scale_value_bits = orch_args.scalar(0);
-    float scale_value;
-    memcpy(&scale_value, &scale_value_bits, sizeof(float));
-
-    uint32_t q_tile_size = std::min(num_heads, 128u);
-    uint32_t num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size;
-
-    std::cout << "\n=== build_paged_attention_graph ===" << '\n';
-    std::cout << "batch=" << batch << ", num_heads=" << num_heads << ", kv_head_num=" << kv_head_num
-              << ", head_dim=" << head_dim << '\n';
-    std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n';
-    std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n';
-
-    // Allocate device memory for inputs/outputs
-    void *dev_query = device_malloc(runtime, query_size);
-    void *dev_key_cache = device_malloc(runtime, key_cache_size);
-    void *dev_value_cache = device_malloc(runtime, value_cache_size);
-    void *dev_out = device_malloc(runtime, out_size);
-
-    if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) {
-        std::cerr << "Error: Failed to allocate device memory\n";
-        return -1;
-    }
+inline double cycles_to_us(uint64_t cycles) {  // converts a counter tick count to microseconds
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;  // ticks -> seconds -> us
+}
 
-    copy_to_device(runtime, dev_query, host_query, query_size);
-    copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size);
-    copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size);
-    record_tensor_pair(runtime, host_out, dev_out, out_size);
-
-    // Buffer sizes depend on q_tile_size and block_size
-    size_t sij_size = static_cast<size_t>(q_tile_size) * block_size * sizeof(float);
-    size_t pij_size = static_cast<size_t>(q_tile_size) * block_size * sizeof(uint16_t);
-    size_t mij_size = static_cast<size_t>(q_tile_size) * sizeof(float);
-    size_t lij_size = mij_size;
-    size_t oi_new_size = static_cast<size_t>(q_tile_size) * head_dim * sizeof(float);
-
-    // Per-batch-per-block intermediate buffers
-    uint32_t total_buffers = batch * max_num_blocks;
-    void **dev_sij_arr = new void *[total_buffers];
-    void **dev_pij_arr = new void *[total_buffers];
-    void **dev_mij_arr = new void *[total_buffers];
-    void **dev_lij_arr = new void *[total_buffers];
-    void **dev_oi_new_arr = new void *[total_buffers];
-
-    for (uint32_t i = 0; i < total_buffers; i++) {
-        dev_sij_arr[i] = device_malloc(runtime, sij_size);
-        dev_pij_arr[i] = device_malloc(runtime, pij_size);
-        dev_mij_arr[i] = device_malloc(runtime, mij_size);
-        dev_lij_arr[i] = device_malloc(runtime, lij_size);
-        dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size);
-    }
+inline uint64_t get_sys_cnt_aicpu() {
+#if defined(__aarch64__)
+    uint64_t counter;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(counter));  // read the AArch64 cntvct_el0 counter register
+    return counter;
+#else
+    // No counter is read on other targets (x86_64 included), so every lap
+    // measured through this helper accumulates zero cycles there.
+    return 0;
+#endif
+}
 
-    // Per-(batch, head_tile) accumulators
-    uint32_t total_accums = batch * num_head_tiles;
-    size_t mi_size = static_cast<size_t>(q_tile_size) * sizeof(float);
-    size_t li_size = mi_size;
-    size_t oi_size = static_cast<size_t>(q_tile_size) * head_dim * sizeof(float);
+#ifdef ENABLE_PROFILING
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1  // declares lap timestamps _t0 / _t1
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)  // adds the ticks since the previous lap to acc, then restarts the lap
+#define PROF_INC(counter, n) (counter) += (n)
+#else  // profiling disabled: each macro expands to a no-op expression
+#define CYCLE_COUNT_START() (void)0
+#define CYCLE_COUNT_LAP(acc) (void)0
+#define PROF_INC(counter, n) (void)0
+#endif  // ENABLE_PROFILING
 
-    void **dev_mi_arr = new void *[total_accums];
-    void **dev_li_arr = new void *[total_accums];
-    void **dev_oi_arr = new void *[total_accums];
+extern "C" {
 
-    for (uint32_t i = 0; i < total_accums; i++) {
-        dev_mi_arr[i] = device_malloc(runtime, mi_size);
-        dev_li_arr[i] = device_malloc(runtime, li_size);
-        dev_oi_arr[i] = device_malloc(runtime, oi_size);
-    }
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // the configuration is fixed; the arguments are not inspected
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,  // presumably 6 tensors + 1 scalar, as read by aicpu_orchestration_entry
+    };
+}
 
-    std::cout << "Allocated " << total_buffers << " per-block buffers\n";
-    std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n";
-
-    int total_tasks = 0;
-
-    for (uint32_t b_idx = 0; b_idx < batch; b_idx++) {
-        int cur_seq = host_context_lens[b_idx];
-        uint32_t bn_this_batch = (static_cast<uint32_t>(cur_seq) + block_size - 1) / block_size;
-
-        for (uint32_t ht = 0; ht < num_head_tiles; ht++) {
-            uint32_t cur_offset = ht * q_tile_size;
-
-            // Query: (batch, q_head_num, head_dim) bf16
-            // qi points to heads [cur_offset .. cur_offset+q_tile_size) for batch b_idx
-            uint8_t *qi_ptr = reinterpret_cast<uint8_t *>(dev_query) +
-                              static_cast<int64_t>(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t);
-
-            // Output: (batch * q_head_num, head_dim) float32
-            uint8_t *out_ptr = reinterpret_cast<uint8_t *>(dev_out) +
-                               static_cast<int64_t>(b_idx * num_heads + cur_offset) * head_dim * sizeof(float);
-
-            // GQA: which kv_head this head tile maps to
-            uint32_t kv_head_idx = cur_offset / (num_heads / kv_head_num);
-
-            // Per-(batch, head_tile) accumulators
-            uint32_t accum_idx = b_idx * num_head_tiles + ht;
-            void *dev_mi = dev_mi_arr[accum_idx];
-            void *dev_li = dev_li_arr[accum_idx];
-            void *dev_oi = dev_oi_arr[accum_idx];
-
-            int t_up_prev = -1;
-
-            for (uint32_t bn = 0; bn < bn_this_batch; bn++) {
-                int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn];
-                int valid_len = std::min(static_cast<int>(block_size), cur_seq - static_cast<int>(bn * block_size));
-
-                // Key: (total_blocks, block_size, kv_head_num, head_dim) bf16
-                uint8_t *kj_ptr = reinterpret_cast<uint8_t *>(dev_key_cache) +
-                                  (static_cast<int64_t>(cur_block_idx) * block_size * kv_head_num + kv_head_idx) *
-                                      head_dim * sizeof(uint16_t);
-
-                // Value: (total_blocks, block_size, kv_head_num, head_dim) bf16
-                uint8_t *vj_ptr = reinterpret_cast<uint8_t *>(dev_value_cache) +
-                                  (static_cast<int64_t>(cur_block_idx) * block_size * kv_head_num + kv_head_idx) *
-                                      head_dim * sizeof(uint16_t);
-
-                uint32_t buf_idx = b_idx * max_num_blocks + bn;
-                void *dev_sij = dev_sij_arr[buf_idx];
-                void *dev_pij = dev_pij_arr[buf_idx];
-                void *dev_mij = dev_mij_arr[buf_idx];
-                void *dev_lij = dev_lij_arr[buf_idx];
-                void *dev_oi_new = dev_oi_new_arr[buf_idx];
-
-                // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N)
-                uint64_t qk_args[6] = {reinterpret_cast<uint64_t>(qi_ptr),  reinterpret_cast<uint64_t>(kj_ptr),
-                                       reinterpret_cast<uint64_t>(dev_sij), static_cast<uint64_t>(q_tile_size),
-                                       static_cast<uint64_t>(head_dim),     static_cast<uint64_t>(block_size)};
-                int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC);
-                total_tasks++;
-
-                // SF: scale, rowmax, exp, rowsum -> pij, mij, lij
-                uint64_t sf_args[8] = {reinterpret_cast<uint64_t>(dev_sij), scale_value_bits,
-                                       reinterpret_cast<uint64_t>(dev_pij), reinterpret_cast<uint64_t>(dev_mij),
-                                       reinterpret_cast<uint64_t>(dev_lij), static_cast<uint64_t>(q_tile_size),
-                                       static_cast<uint64_t>(block_size),   static_cast<uint64_t>(valid_len)};
-                int t_sf = add_task(runtime, sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV);
-                total_tasks++;
-
-                // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N')
-                uint64_t pv_args[6] = {reinterpret_cast<uint64_t>(dev_pij),    reinterpret_cast<uint64_t>(vj_ptr),
-                                       reinterpret_cast<uint64_t>(dev_oi_new), static_cast<uint64_t>(q_tile_size),
-                                       static_cast<uint64_t>(block_size),      static_cast<uint64_t>(head_dim)};
-                int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC);
-                total_tasks++;
-
-                add_successor(runtime, t_qk, t_sf);
-                add_successor(runtime, t_sf, t_pv);
-
-                // Online Update: serialized across blocks (each depends on previous)
-                int is_first = (bn == 0) ? 1 : 0;
-                int is_last = (bn == bn_this_batch - 1) ? 1 : 0;
-
-                uint64_t up_args[11] = {reinterpret_cast<uint64_t>(dev_mij),    reinterpret_cast<uint64_t>(dev_lij),
-                                        reinterpret_cast<uint64_t>(dev_oi_new), reinterpret_cast<uint64_t>(dev_mi),
-                                        reinterpret_cast<uint64_t>(dev_li),     reinterpret_cast<uint64_t>(dev_oi),
-                                        static_cast<uint64_t>(is_first),        static_cast<uint64_t>(is_last),
-                                        reinterpret_cast<uint64_t>(out_ptr),    static_cast<uint64_t>(q_tile_size),
-                                        static_cast<uint64_t>(head_dim)};
-                int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV);
-                total_tasks++;
-
-                add_successor(runtime, t_pv, t_up);
-                if (t_up_prev >= 0) {
-                    add_successor(runtime, t_up_prev, t_up);
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+#ifdef ENABLE_PROFILING
+    uint64_t prof_param_extract = 0;
+    uint64_t prof_ext_tensor = 0;
+    uint64_t prof_scope = 0;
+    uint64_t prof_make_tensor = 0;
+    uint64_t prof_tensor_view = 0;
+    uint64_t prof_param_setup = 0;
+    uint64_t prof_submit_task = 0;
+    int prof_submit_count = 0;
+    int prof_make_count = 0;
+    int prof_view_count = 0;
+#endif
+
+    CYCLE_COUNT_START();
+
+    // Read dimensions from tensor metadata
+    uint64_t batch = orch_args.tensor(0).ref().shapes[0];
+    uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
+    uint64_t head_dim = orch_args.tensor(0).ref().shapes[2];
+    DataType data_type = orch_args.tensor(0).ref().dtype;
+
+    uint64_t block_size = orch_args.tensor(1).ref().shapes[1];
+    uint64_t block_num = orch_args.tensor(3).ref().shapes[1];
+
+    uint64_t scale_value = orch_args.scalar(0);
+
+    uint64_t q_head_num = num_heads;
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
+    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
+    CYCLE_COUNT_LAP(prof_param_extract);
+
+    LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);
+
+    // Reshape tensors for kernel consumption (2D flattened)
+    void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
+    void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
+    void *vc_ptr = orch_args.tensor(2).ref().data_as<void>();
+    void *out_ptr = orch_args.tensor(5).ref().data_as<void>();
+
+    uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0];
+
+    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
+    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
+    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type);
+    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type);
+    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
+    CYCLE_COUNT_LAP(prof_ext_tensor);
+
+    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
+    Tensor block_table =
+        make_tensor_external(orch_args.tensor(3).ref().data_as<void>(), bt_shapes, 2, DataType::INT32, false);
+    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
+    Tensor context_lens =
+        make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
+
+    // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size
+    uint32_t tile2d_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
+    uint32_t scalar_shapes[1] = {static_cast<uint32_t>(q_tile)};
+    uint32_t sij_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(block_size)};
+    TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32);
+    TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32);
+    TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type);
+
+    PROF_INC(prof_make_count, 4);
+    CYCLE_COUNT_LAP(prof_make_tensor);
+
+    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
+        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
+        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
+        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            PTO2_SCOPE() {
+                CYCLE_COUNT_LAP(prof_scope);
+                uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
+
+                uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor qi = query.view(tile2d_shapes, qi_offsets);
+                uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
+                Tensor out_view = out.view(tile2d_shapes, out_view_offsets);
+                PROF_INC(prof_view_count, 2);
+                CYCLE_COUNT_LAP(prof_tensor_view);
+
+                CYCLE_COUNT_LAP(prof_param_setup);
+                TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
+                const Tensor &oi = alloc_outs.get_ref(0);
+                const Tensor &li_update = alloc_outs.get_ref(1);
+                const Tensor &mi_update = alloc_outs.get_ref(2);
+                PROF_INC(prof_submit_count, 1);
+                CYCLE_COUNT_LAP(prof_submit_task);
+
+                for (uint64_t bn = 0; bn < bn_this_batch; bn++) {
+                    PTO2_SCOPE_GUARD();
+
+                    uint32_t bt_idx[2] = {static_cast<uint32_t>(b_idx), static_cast<uint32_t>(bn)};
+                    uint64_t cur_block_idx = static_cast<uint64_t>(get_tensor_data<int32_t>(block_table, 2, bt_idx));
+                    uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size);
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    uint32_t kv_shapes[2] = {static_cast<uint32_t>(block_size), static_cast<uint32_t>(head_dim)};
+                    uint32_t kv_offsets[2] = {static_cast<uint32_t>(cur_block_idx * block_size), 0};
+                    Tensor kj = key_cache.view(kv_shapes, kv_offsets);
+                    Tensor vj = value_cache.view(kv_shapes, kv_offsets);
+                    PROF_INC(prof_view_count, 2);
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
+                    L0TaskArgs params_qk;
+                    params_qk.add_input(qi);
+                    params_qk.add_input(kj);
+                    params_qk.add_output(sij_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
+                    const Tensor &sij = qk_outs.get_ref(0);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    uint32_t sij_valid_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(valid_len)};
+                    uint32_t sij_valid_offsets[2] = {0, 0};
+                    Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets);
+                    PROF_INC(prof_view_count, 1);
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
+                    L0TaskArgs params_sf;
+                    params_sf.add_input(sij_valid);
+                    params_sf.add_output(pij_f16_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_output(scalar_ci);
+                    params_sf.add_scalar(scale_value);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
+                    const Tensor &pij_f16 = sf_outs.get_ref(0);
+                    const Tensor &mi = sf_outs.get_ref(1);
+                    const Tensor &li = sf_outs.get_ref(2);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    L0TaskArgs params_pv;
+                    params_pv.add_input(pij_f16);
+                    params_pv.add_input(vj);
+                    params_pv.add_output(tile2d_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
+                    const Tensor &oi_tmp = pv_outs.get_ref(0);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
+
+                    uint64_t is_first = (bn == 0) ? 1 : 0;
+                    uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
+                    L0TaskArgs params_up;
+                    params_up.add_input(mi);
+                    params_up.add_input(li);
+                    params_up.add_input(oi_tmp);
+                    params_up.add_inout(mi_update);
+                    params_up.add_inout(li_update);
+                    params_up.add_inout(oi);
+                    params_up.add_inout(out_view);
+                    params_up.add_scalar(is_first);
+                    params_up.add_scalar(is_last);
+                    CYCLE_COUNT_LAP(prof_param_setup);
+                    rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                    PROF_INC(prof_submit_count, 1);
+                    CYCLE_COUNT_LAP(prof_submit_task);
                 }
-                t_up_prev = t_up;
             }
+            CYCLE_COUNT_LAP(prof_scope);
         }
     }
 
-    delete[] dev_sij_arr;
-    delete[] dev_pij_arr;
-    delete[] dev_mij_arr;
-    delete[] dev_lij_arr;
-    delete[] dev_oi_new_arr;
-    delete[] dev_mi_arr;
-    delete[] dev_li_arr;
-    delete[] dev_oi_arr;
-
-    std::cout << "Created " << total_tasks << " tasks\n";
-    print_runtime(runtime);
-
-    return 0;
-}
+#ifdef ENABLE_PROFILING
+    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
+                     prof_submit_task + prof_scope;
+    LOG_INFO_V9(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
+        prof_make_count, prof_view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_INFO_V9(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
+            prof_param_extract * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
+            prof_make_tensor * 100.0 / total,
+            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
+            prof_tensor_view * 100.0 / total,
+            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
+        );
+        LOG_INFO_V9(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
+        );
+        LOG_INFO_V9("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
+        LOG_INFO_V9(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
+            prof_submit_task * 100.0 / total,
+            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
+        );
+    }
+#endif
 }
+
+}  // extern "C"
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py
index 232b68b29..d8fba334a 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py
+++ b/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py
@@ -31,7 +31,7 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
     CALLABLE = {
         "orchestration": {
             "source": "kernels/orchestration/paged_attention_orch.cpp",
-            "function_name": "build_paged_attention_graph",
+            "function_name": "aicpu_orchestration_entry",
             "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -68,9 +68,16 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
 
     CASES = [
         {
+            # Marked manual for host_build_graph: this batch=256 case submits
+            # ~64K tasks, and host orchestration builds the whole task graph
+            # before the device starts scheduling, so the ring/heap cannot
+            # reclaim mid-orchestration and must hold the entire graph at once.
+            # That exceeds the default ring window / GM heap. To run it, select
+            # it explicitly with a large PTO2_RING_TASK_WINDOW / PTO2_RING_HEAP.
             "name": "Case1",
             "platforms": ["a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 24},
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
             "params": {
                 "batch": 256,
                 "num_heads": 16,
@@ -85,7 +92,7 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
         {
             "name": "Case2",
             "platforms": ["a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 24},
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "manual": True,
             "params": {
                 "batch": 64,
@@ -101,7 +108,7 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
         {
             "name": "small1",
             "platforms": ["a2a3sim", "a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
             "params": {
                 "batch": 1,
                 "num_heads": 16,
@@ -116,7 +123,7 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
         {
             "name": "small2",
             "platforms": ["a2a3sim", "a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
             "manual": True,
             "params": {
                 "batch": 1,
diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
index 0bbe3feb1..629e9c796 100644
--- a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -59,7 +59,7 @@ class TestPreparedCallableHbg(SceneTestCase):
     CALLABLE = {
         "orchestration": {
             "source": f"{_VECTOR_KERNELS}/orchestration/example_orch.cpp",
-            "function_name": "build_example_graph",
+            "function_name": "aicpu_orchestration_entry",
             "signature": [D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -84,7 +84,7 @@ class TestPreparedCallableHbg(SceneTestCase):
         ],
     }
 
-    _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3}
+    _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3}
     _PLATFORMS = ["a2a3sim", "a2a3"]
 
     CASES = [
diff --git a/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp
index 83316981f..53d38ad2f 100644
--- a/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp
+++ b/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp
@@ -9,19 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Element-wise Tensor Addition Kernel
+ * Element-wise addition kernel (submit_task / Tensor* ABI)
  *
- * Implements: out[i] = src0[i] + src1[i]
+ * Implements: out[i] = src0[i] + src1[i] over a single 128x128 tile.
  *
- * This kernel performs element-wise addition of two tensors. It's compiled
- * separately as a standalone kernel and linked with the dispatcher using
- * function pointers, demonstrating the separation pattern used in production
- * systems where kernel binaries are loaded dynamically.
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT)
+ *   args[1] = src1 (INPUT)
+ *   args[2] = out  (OUTPUT)
  */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -34,24 +36,15 @@ using namespace pto;
 #define __aicore__ [aicore]
 #endif
 
-/**
- * Element-wise addition kernel implementation
- *
- * Unified signature: all arguments passed via int64_t array
- * @param args  Argument array:
- *              args[0] = src0 pointer (first input tensor)
- *              args[1] = src1 pointer (second input tensor)
- *              args[2] = out pointer (output tensor)
- *              args[3] = size (number of elements)
- */
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    // Unpack arguments (order matches runtimemaker.cpp)
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]);
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]);
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]);
-    int size = static_cast<int>(args[3]);
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
-    // Configuration: float, 128, 128, 128, 128
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
     constexpr int vRows = 128;
diff --git a/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
index f7e5af7c8..6ebdb6a8a 100644
--- a/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
+++ b/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
@@ -9,19 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Scalar Addition Kernel
+ * Scalar addition kernel (submit_task / Tensor* ABI)
  *
- * Implements: out[i] = src[i] + scalar
+ * Implements: out[i] = src[i] + scalar over a single 128x128 tile.
  *
- * This kernel adds a scalar value to each element of a tensor. It's compiled
- * separately as a standalone kernel and linked with the dispatcher using
- * function pointers, demonstrating the separation pattern used in production
- * systems where kernel binaries are loaded dynamically.
+ * Args:
+ *   args[0] = src (INPUT, Tensor*)
+ *   args[1] = out (OUTPUT, Tensor*)
+ *   args[2] = scalar (float bits packed in uint64_t — tensors precede scalars)
  */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -34,32 +36,20 @@ using namespace pto;
 #define __aicore__ [aicore]
 #endif
 
-/**
- * Scalar addition kernel implementation
- *
- * Unified signature: all arguments passed via int64_t array
- * @param args  Argument array:
- *              args[0] = src pointer (input tensor)
- *              args[1] = scalar value (as uint64_t, needs conversion to float)
- *              args[2] = out pointer (output tensor)
- *              args[3] = size (number of elements)
- */
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    // Unpack arguments
-    __gm__ float *src = reinterpret_cast<__gm__ float *>(args[0]);
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
 
-    // Convert scalar from uint64_t to float
     union {
         uint64_t u64;
         float f32;
     } converter;
-    converter.u64 = args[1];
+    converter.u64 = static_cast<uint64_t>(args[2]);
     float scalar = converter.f32;
 
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]);
-    int size = static_cast<int>(args[3]);
+    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
-    // Configuration: float, 128, 128, 128, 128
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
     constexpr int vRows = 128;
diff --git a/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
index 2b7661e8c..8ad9a9826 100644
--- a/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
+++ b/tests/st/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
@@ -9,19 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Element-wise Tensor Multiplication Kernel
+ * Element-wise multiplication kernel (submit_task / Tensor* ABI)
  *
- * Implements: out[i] = src0[i] * src1[i]
+ * Implements: out[i] = src0[i] * src1[i] over a single 128x128 tile.
  *
- * This kernel performs element-wise multiplication of two tensors. It's
- * compiled separately as a standalone kernel and linked with the dispatcher
- * using function pointers, demonstrating the separation pattern used in
- * production systems where kernel binaries are loaded dynamically.
+ * Args (Tensor*):
+ *   args[0] = src0 (INPUT)
+ *   args[1] = src1 (INPUT)
+ *   args[2] = out  (OUTPUT)
  */
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
+#include "tensor.h"
+
 using namespace pto;
 
 #include "pipe_sync.h"
@@ -34,24 +36,15 @@ using namespace pto;
 #define __aicore__ [aicore]
 #endif
 
-/**
- * Element-wise multiplication kernel implementation
- *
- * Unified signature: all arguments passed via int64_t array
- * @param args  Argument array:
- *              args[0] = src0 pointer (first input tensor)
- *              args[1] = src1 pointer (second input tensor)
- *              args[2] = out pointer (output tensor)
- *              args[3] = size (number of elements)
- */
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    // Unpack arguments (order matches runtimemaker.cpp)
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]);
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]);
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]);
-    int size = static_cast<int>(args[3]);
+    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+
+    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
+    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
 
-    // Configuration: float, 128, 128, 128, 128
     constexpr int kTRows_ = 128;
     constexpr int kTCols_ = 128;
     constexpr int vRows = 128;
diff --git a/tests/st/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/tests/st/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp
index 82555d9ef..e13d57d22 100644
--- a/tests/st/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp
+++ b/tests/st/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp
@@ -9,152 +9,83 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * Example Orchestration Function Implementation
+ * Example Orchestration Function — submit_task / TensorMap form
  *
- * Builds the task graph for formula: (a + b + 1)(a + b + 2)
+ * Builds the task graph for: f = (a + b + 1) * (a + b + 2)
  *
- * This orchestration function:
- * 1. Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes)
- * 2. Allocates device memory via orchestration API helpers
- * 3. Copies input data to device via orchestration API helpers
- * 4. Records output tensor for copy-back during finalize
- * 5. Builds the task graph
+ * Dependencies are discovered automatically by the TensorMap from the
+ * add_input/add_output directions: c is produced by task0 and consumed by
+ * task1/task2; d and e are produced by task1/task2 and consumed by task3.
+ *
+ * Arg layout: [a (IN), b (IN), f (OUT)] — 3 external tensors. The intermediate
+ * tensors c, d, e are allocated by the runtime from the HeapRing.
  */
 
-#include <iostream>
-
-#include "orchestration_api.h"  // NOLINT(build/include_subdir)
-
-extern "C" {
-
-int build_example_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
-    // Validate argument count
-    // Expected orch_args: [a, b, f] — 3 tensors
-    if (orch_args.tensor_count() < 3) {
-        std::cerr << "build_example_graph: Expected at least 3 tensors, got " << orch_args.tensor_count() << '\n';
-        return -1;
-    }
+#include <stdint.h>
 
-    // Extract host pointers, sizes, and element count from tensor metadata
-    void *host_a = orch_args.tensor(0).data_as<void>();
-    void *host_b = orch_args.tensor(1).data_as<void>();
-    void *host_f = orch_args.tensor(2).data_as<void>();
-    size_t size_a = orch_args.tensor(0).nbytes();
-    size_t size_b = orch_args.tensor(1).nbytes();
-    size_t size_f = orch_args.tensor(2).nbytes();
-    uint32_t SIZE = orch_args.tensor(0).shapes[0];
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
 
-    std::cout << "\n=== build_example_graph: Creating Task Runtime ===" << '\n';
-    std::cout << "Formula: (a + b + 1)(a + b + 2)\n";
-    std::cout << "SIZE: " << SIZE << " elements\n";
+#define FUNC_ADD 0         // out = src0 + src1   (task0: c = a + b)
+#define FUNC_ADD_SCALAR 1  // out = src + scalar  (task1: d = c + 1, task2: e = c + 2)
+#define FUNC_MUL 2         // out = src0 * src1   (task3: f = d * e)
 
-    // Allocate device memory and copy inputs
-    std::cout << "\n=== Allocating Device Memory ===" << '\n';
-
-    void *dev_a = device_malloc(runtime, size_a);
-    if (!dev_a) {
-        std::cerr << "Error: Failed to allocate device memory for a\n";
-        return -1;
-    }
-    copy_to_device(runtime, dev_a, host_a, size_a);
-    std::cout << "Tensor a: " << size_a << " bytes copied to device\n";
-
-    void *dev_b = device_malloc(runtime, size_b);
-    if (!dev_b) {
-        std::cerr << "Error: Failed to allocate device memory for b\n";
-        device_free(runtime, dev_a);
-        return -1;
-    }
-    copy_to_device(runtime, dev_b, host_b, size_b);
-    std::cout << "Tensor b: " << size_b << " bytes copied to device\n";
-
-    void *dev_f = device_malloc(runtime, size_f);
-    if (!dev_f) {
-        std::cerr << "Error: Failed to allocate device memory for f\n";
-        device_free(runtime, dev_a);
-        device_free(runtime, dev_b);
-        return -1;
-    }
-    // Record output tensor for copy-back during finalize
-    record_tensor_pair(runtime, host_f, dev_f, size_f);
-    std::cout << "Tensor f (output): " << size_f << " bytes allocated\n";
+extern "C" {
 
-    // Allocate intermediate tensors (c, d, e)
-    size_t BYTES = SIZE * sizeof(float);
-    void *dev_c = device_malloc(runtime, BYTES);
-    void *dev_d = device_malloc(runtime, BYTES);
-    void *dev_e = device_malloc(runtime, BYTES);
+__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) {
+    (void)orch_args;  // unused: the returned config is a constant
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,  // a (IN), b (IN), f (OUT)
+    };
+}
 
-    if (!dev_c || !dev_d || !dev_e) {
-        std::cerr << "Error: Failed to allocate intermediate tensors\n";
-        device_free(runtime, dev_a);
-        device_free(runtime, dev_b);
-        device_free(runtime, dev_f);
-        if (dev_c) device_free(runtime, dev_c);
-        if (dev_d) device_free(runtime, dev_d);
-        if (dev_e) device_free(runtime, dev_e);
-        return -1;
-    }
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) {
+    const Tensor &a = orch_args.tensor(0).ref();  // external input
+    const Tensor &b = orch_args.tensor(1).ref();  // external input
+    const Tensor &f = orch_args.tensor(2).ref();  // external output, written in place
 
-    std::cout << "Allocated intermediate tensors c, d, e\n";
+    uint32_t SIZE = a.shapes[0];  // intermediates are 1-D, sized by a's first dimension
+    uint32_t inter_shapes[1] = {SIZE};
+    TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32);  // reused for c, d and e
 
-    // Helper union to encode float scalar as uint64_t
     union {
         float f32;
         uint64_t u64;
-    } scalar_converter;
-
-    // Task 0: c = a + b (func_id=0: kernel_add, AIV)
-    uint64_t args_t0[4];
-    args_t0[0] = reinterpret_cast<uint64_t>(dev_a);  // src0
-    args_t0[1] = reinterpret_cast<uint64_t>(dev_b);  // src1
-    args_t0[2] = reinterpret_cast<uint64_t>(dev_c);  // out
-    args_t0[3] = SIZE;                               // size
-    int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV);
-
-    // Task 1: d = c + 1 (func_id=1: kernel_add_scalar, AIV)
-    uint64_t args_t1[4];
-    args_t1[0] = reinterpret_cast<uint64_t>(dev_c);  // src
-    scalar_converter.f32 = 1.0f;
-    args_t1[1] = scalar_converter.u64;               // scalar=1.0
-    args_t1[2] = reinterpret_cast<uint64_t>(dev_d);  // out
-    args_t1[3] = SIZE;                               // size
-    int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIV);
-
-    // Task 2: e = c + 2 (func_id=1: kernel_add_scalar, AIV)
-    uint64_t args_t2[4];
-    args_t2[0] = reinterpret_cast<uint64_t>(dev_c);  // src
-    scalar_converter.f32 = 2.0f;
-    args_t2[1] = scalar_converter.u64;               // scalar=2.0
-    args_t2[2] = reinterpret_cast<uint64_t>(dev_e);  // out
-    args_t2[3] = SIZE;                               // size
-    int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIV);
-
-    // Task 3: f = d * e (func_id=2: kernel_mul, AIV)
-    uint64_t args_t3[4];
-    args_t3[0] = reinterpret_cast<uint64_t>(dev_d);  // src0
-    args_t3[1] = reinterpret_cast<uint64_t>(dev_e);  // src1
-    args_t3[2] = reinterpret_cast<uint64_t>(dev_f);  // out
-    args_t3[3] = SIZE;                               // size
-    int t3 = add_task(runtime, args_t3, 4, 2, CoreType::AIV);
-
-    // Add dependencies
-    add_successor(runtime, t0, t1);  // t0 → t1
-    add_successor(runtime, t0, t2);  // t0 → t2
-    add_successor(runtime, t1, t3);  // t1 → t3
-    add_successor(runtime, t2, t3);  // t2 → t3
-
-    std::cout << "\nTasks:\n";
-    std::cout << "  task" << t0 << ": c = a + b\n";
-    std::cout << "  task" << t1 << ": d = c + 1\n";
-    std::cout << "  task" << t2 << ": e = c + 2\n";
-    std::cout << "  task" << t3 << ": f = d * e\n";
-    std::cout << "Dependencies: t0→t1, t0→t2, t1→t3, t2→t3\n";
-
-    std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n";
-    print_runtime(runtime);
-
-    return 0;
+    } sconv;  // carries a float's bit pattern in the uint64_t scalar argument
+
+    // task0: c = a + b  (c is a new intermediate described by inter_ci)
+    L0TaskArgs p_add;
+    p_add.add_input(a);
+    p_add.add_input(b);
+    p_add.add_output(inter_ci);
+    TaskOutputTensors c_out = rt_submit_aiv_task(FUNC_ADD, p_add);
+    Tensor c = c_out.get_ref(0);
+
+    // task1: d = c + 1  (takes c as input, the output of task0)
+    L0TaskArgs p_d;
+    p_d.add_input(c);
+    p_d.add_output(inter_ci);
+    sconv.f32 = 1.0f;
+    p_d.add_scalar(sconv.u64);
+    TaskOutputTensors d_out = rt_submit_aiv_task(FUNC_ADD_SCALAR, p_d);
+    Tensor d = d_out.get_ref(0);
+
+    // task2: e = c + 2  (takes c as input, the output of task0)
+    L0TaskArgs p_e;
+    p_e.add_input(c);
+    p_e.add_output(inter_ci);
+    sconv.f32 = 2.0f;
+    p_e.add_scalar(sconv.u64);
+    TaskOutputTensors e_out = rt_submit_aiv_task(FUNC_ADD_SCALAR, p_e);
+    Tensor e = e_out.get_ref(0);
+
+    // task3: f = d * e  (takes d and e as inputs; writes the external output tensor f)
+    L0TaskArgs p_mul;
+    p_mul.add_input(d);
+    p_mul.add_input(e);
+    p_mul.add_output(f);
+    rt_submit_aiv_task(FUNC_MUL, p_mul);
+
+    LOG_INFO_V9("[example_orch] Submitted 4 tasks for f = (a + b + 1) * (a + b + 2)");
 }
 
 }  // extern "C"
diff --git a/tests/st/a2a3/host_build_graph/vector_example/test_vector_example.py b/tests/st/a2a3/host_build_graph/vector_example/test_vector_example.py
index 48fd44fae..fc209da16 100644
--- a/tests/st/a2a3/host_build_graph/vector_example/test_vector_example.py
+++ b/tests/st/a2a3/host_build_graph/vector_example/test_vector_example.py
@@ -29,7 +29,7 @@ class TestVectorExampleHostBuildGraph(SceneTestCase):
     CALLABLE = {
         "orchestration": {
             "source": "kernels/orchestration/example_orch.cpp",
-            "function_name": "build_example_graph",
+            "function_name": "aicpu_orchestration_entry",
             "signature": [D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -58,7 +58,7 @@ class TestVectorExampleHostBuildGraph(SceneTestCase):
         {
             "name": "default",
             "platforms": ["a2a3sim", "a2a3"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
             "params": {},
         },
     ]