From 30df9098ae941a12d82369fe51d5f314abdaaf8b Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 25 Jun 2026 12:55:35 +0200
Subject: [PATCH 01/14] =?UTF-8?q?Polling=20scheduler=20design=20=E2=80=94?=
 =?UTF-8?q?=20rebase=20squash=20onto=20upstream/main?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consolidates the polling-redesign series (originally 6 commits on
polling-pr-minimal at base fcc33bcb) onto current upstream/main as a
single squash. Resolves 24 file conflicts using the recipe in
RECONCILIATION_NOTES.md:

- Polling-redesigned files (pto_scheduler.h, ring_buffer, shared_memory,
  runtime, tensormap, runtime2_types, orchestrator.cpp, scheduler_*.cpp):
  take theirs (polling).
- Upstream-evolved files with feature additions (aicpu_executor.cpp,
  runtime_maker.cpp, orchestration/, platform_aicpu_affinity.cpp):
  take ours (upstream/main).

Cross-cutting adapter overloads added so upstream call sites compile
against polling structs:
- PTO2RuntimeArenaLayout: task_window_sizes[]/heap_sizes[]/dep_pool_capacities[]
  per-ring arrays (was single scalars).
- runtime_reserve_layout: per-ring overload + single-size broadcast adapter.
- runtime_init_data_from_layout: heap_sizes[] per-ring overload + scalar
  adapter.
- runtime_destroy(rt, arena): 2-arg overload forwarding to single-arg.
- PTO2SharedMemoryHandle::init_per_ring: forwards to init_header_per_ring.
- PTO2OrchestratorState::l2_swimlane_level: L2SwimlaneLevel field for
  upstream aicpu_executor's orchestrator-to-scheduler bridge.
- SchedulerContext::on_orchestration_done: thread_idx overload.

Renames applied (commit 10a7680b's surface):
- bare `Arg` -> `L0TaskArgs` in pto_runtime2.h, pto_orchestrator.h,
  pto_dep_compute.h, pto_runtime2_types.h.
- `TensorRef::ptr` -> `.ref()` (returns reference, callers take address).
- `.create_info` -> `.create_info()` method call.

dep_gen_aicpu_record_submit signature: pass args.launch_spec.block_num()
(matches upstream a5's call).

The needs_copy_back D2H optimization is already in upstream/main, so the
polling-pr-minimal commit 3a1bc17e that restored it on the old base is
unnecessary here and dropped.

Compile-clean after this commit. Three further bug fixes follow.
---
 RECONCILIATION_NOTES.md                       |  134 ++
 .../common/intrinsic.h                        |    4 +-
 .../docs/MULTI_RING.md                        |   93 +-
 .../docs/RUNTIME_LOGIC.md                     |    8 +-
 .../docs/SCALAR_DATA_ACCESS.md                |    2 +-
 .../docs/device_log_profiling.md              |    2 +-
 .../docs/profiling_levels.md                  |    6 +-
 .../host/dep_gen_replay.cpp                   |    2 +-
 .../orchestration/common.cpp                  |    7 +
 .../runtime/aicore_completion_mailbox.h       |  102 +-
 .../runtime/aicore_completion_mailbox_types.h |   28 +-
 .../backend/sdma/sdma_completion_kernel.h     |   83 +-
 .../backend/sdma/sdma_completion_scheduler.h  |   25 +-
 .../runtime/pto2_dispatch_payload.h           |   61 +-
 .../runtime/pto_async_kernel_api.h            |   81 +-
 .../runtime/pto_async_wait.h                  |  209 +-
 .../runtime/pto_completion_token.h            |   15 +-
 .../runtime/pto_dep_compute.h                 |  143 +-
 .../runtime/pto_orchestrator.cpp              | 1141 +----------
 .../runtime/pto_orchestrator.h                |  701 +++++--
 .../runtime/pto_ring_buffer.cpp               |  176 +-
 .../runtime/pto_ring_buffer.h                 |  743 +------
 .../runtime/pto_runtime2.cpp                  |  304 +--
 .../runtime/pto_runtime2.h                    |  560 ++++--
 .../runtime/pto_runtime2_types.h              |  419 +---
 .../runtime/pto_shared_memory.h               |  441 +++--
 .../runtime/pto_submit_types.h                |  137 +-
 .../runtime/pto_tensormap.h                   |  624 +++---
 .../runtime/scheduler/pto_scheduler.cpp       |  102 +-
 .../runtime/scheduler/pto_scheduler.h         | 1731 +++++------------
 .../runtime/scheduler/scheduler_cold_path.cpp | 1101 +----------
 .../scheduler/scheduler_completion.cpp        |  606 +-----
 .../runtime/scheduler/scheduler_context.h     | 1625 +++++++++++++---
 .../runtime/scheduler/scheduler_dispatch.cpp  | 1476 +-------------
 .../runtime/scheduler/scheduler_types.h       |  491 ++---
 .../runtime/shared/pto_runtime2_init.cpp      |  603 +-----
 .../runtime/shared/pto_shared_memory.cpp      |  242 +--
 .../runtime/shared/pto_tensormap.cpp          |  286 +--
 .../runtime/shared/runtime.cpp                |  108 +-
 39 files changed, 4152 insertions(+), 10470 deletions(-)
 create mode 100644 RECONCILIATION_NOTES.md

diff --git a/RECONCILIATION_NOTES.md b/RECONCILIATION_NOTES.md
new file mode 100644
index 000000000..7307af7a4
--- /dev/null
+++ b/RECONCILIATION_NOTES.md
@@ -0,0 +1,134 @@
+# Polling PR — Rebase Reconciliation Notes
+
+State of `polling-pr-minimal` (HEAD `188be7e4`) vs `upstream/main`
+(currently `ecfb1663`, 14 commits ahead of the PR's base `fcc33bcb`).
+
+## TL;DR
+
+- **`git rebase upstream/main` produces 15 file conflicts.** Mechanical
+  resolution (take "theirs" for files we rewrote, take "ours" for files
+  with upstream feature additions, rename `Arg→L0TaskArgs` and `.ptr→.ref()`
+  throughout) gets the tree to **compile clean**.
+- **Compile-clean tree still hangs at runtime** with the now-familiar
+  507018 AICore op-timeout. The hang is a **protocol-level mismatch**
+  between upstream's evolved init/dispatch handshake and the polling-side
+  SHM/scheduler — not a few-more-renames-away fix. Estimated 1-2 days of
+  targeted protocol alignment + re-benchmarking to land cleanly.
+- Decision (24 Jun 2026): pause the rebase. Either the reviewer/maintainer
+  rebases as part of the merge, or this PR will be rebased in a future session.
+
+## Upstream commits since `fcc33bcb`
+
+| Commit | Touches polling design? | Why |
+|---|---|---|
+| `10a7680b` `Refactor: tensormap L0/L2TaskArgs arg hierarchy` | **Yes — heavy** | `Arg` → `Arg<MaxT,MaxS>` template; `L0TaskArgs = Arg<32,16>` for core submit; `TensorRef::ptr` → `.ref()`/`.create_info()` accessors. Renames propagate through every submit signature. |
+| `c6354842` `feat(runtime): unify runtime_env ring sizing` | **Yes — heavy** | `PTO2RuntimeArenaLayout` gains `task_window_sizes[PTO2_MAX_RING_DEPTH]`/`heap_sizes[]`/`dep_pool_capacities[]` arrays. New `init_per_ring` on `PTO2SharedMemoryHandle`. `runtime_init_data_from_layout` per-ring overload. |
+| `6dd8a5dc` `consolidate profiling init into SchedulerContext::init()` | Yes — medium | `SchedulerContext::init()` signature changed; `l2_swimlane_level` moved from `PTO2OrchestratorState` to `SchedulerContext`. `runtime_destroy(rt, arena)` 2-arg signature. |
+| `6c3a9e49` `consumed/reuse deadlock fix` | **No** | Fixes interaction between `fanout_refcount` / `fanout_count` / `task_state=CONSUMED` / `scope_end` producer-release — all four mechanisms removed by the polling design. |
+| `11f0bf40` `AICPU callable prewarm` | Yes — light | Adds `aicpu_prewarm_callable` C entry to `aicpu_executor.cpp`. |
+| `4725ef7b` `dispatcher fresh-process retry` | Yes — light | Adds retry path in `device_runner.cpp`. |
+| `78b123e7` `rename init-claim flag to init_claimed_` | Trivial | Field rename in scheduler. |
+| `ae59a8e9` `in-place card recovery` | No | `device_runner.cpp` only. |
+| `3aa94a99` `close unpublished sim host orchestration handles` | No | Sim host only. |
+| `e2112e9f` `restore SDMA async completion demo` | No | Example. |
+| Others (`ecfb1663`, `cce30871`, `2f77399a`, `e583b8a0`) | Trivial | CI / docs / examples. |
+
+## Per-file conflict matrix
+
+After `git rebase upstream/main`, 15 files conflict. Recommended
+resolution + work needed:
+
+| File | Recommended action | Status |
+|---|---|---|
+| `runtime/pto_runtime2_types.h` | take theirs (polling) | ✓ compile-fixed |
+| `runtime/pto_runtime2.h` | take theirs + add per-ring overloads | ✓ compile-fixed (added `runtime_reserve_layout` and `runtime_init_data_from_layout` per-ring overloads; added `runtime_destroy(rt, arena)` overload) |
+| `runtime/pto_runtime2.cpp` | take theirs (stub) | ✓ |
+| `runtime/pto_orchestrator.cpp` | take theirs (stub) | ✓ |
+| `runtime/pto_orchestrator.h` | take theirs + rename `Arg → L0TaskArgs` + `.create_info → .create_info()` + `.ptr → &.ref()` + add `l2_swimlane_level` field | ✓ compile-fixed |
+| `runtime/pto_dep_compute.h` | take theirs + `inputs.tensors[i].ptr → &inputs.tensors[i].ref()` | ✓ compile-fixed |
+| `runtime/scheduler/pto_scheduler.h` | take theirs (polling) | ✓ |
+| `runtime/scheduler/scheduler_context.h` | take theirs + add `thread_idx` to `on_orchestration_done` signature | ✓ compile-fixed |
+| `runtime/scheduler/scheduler_cold_path.cpp` | take theirs (stub) | ✓ |
+| `runtime/scheduler/scheduler_dispatch.cpp` | take theirs (stub) | ✓ |
+| `runtime/pto_shared_memory.h` | take theirs (polling) + add `init_per_ring` method (broadcast to scalar init) | ✓ compile-fixed |
+| `runtime/runtime.h` | add `needs_copy_back` to `TensorPair` (upstream-API compat) | ✓ compile-fixed |
+| `aicpu/aicpu_executor.cpp` | take ours (upstream — has prewarm, profiling consolidation, deadlock-fix-related changes) | ✓ compile-fixed via signature adapters above |
+| `host/runtime_maker.cpp` | take ours (upstream — has per-ring env parsing #1128) | ✓ compile-fixed |
+| `orchestration/pto_arg_with_deps.h` | take ours (upstream) | ✓ trivial |
+| `orchestration/pto_orchestration_api.h` | take ours (upstream) | ✓ trivial |
+| `docs/MULTI_RING.md` | take theirs (updated for polling) | ✓ trivial |
+
+## Runtime hang — root cause hypothesis
+
+After the compile-clean tree above runs `paged_attention` Case1, AICore
+times out at 507018 with no orchestration log past the `simpler-dispatcher`
+init. Suspect chain:
+
+1. **`init_per_ring` is a stub**. My implementation broadcasts
+   `task_window_sizes[0]` to the old scalar `init_header` /
+   `setup_pointers`. If upstream's `aicpu_executor` writes
+   `prebuilt_layout.task_window_sizes[r]` for r > 0 with different values
+   than [0], the SHM layout's per-ring offsets diverge from what the
+   AICPU expects → wrong pointers → silent corruption or hang.
+2. **`PTO2OrchestratorState::l2_swimlane_level`** is back as a field, but
+   upstream's `SchedulerContext::init` may now own that state. Adding
+   the field in two places creates a tearing concern only if both writers
+   actually fire — unlikely to be the hang root cause but worth checking.
+3. **`runtime_destroy(rt, arena)`**: my overload calls the 1-arg form,
+   but upstream's `arena` parameter may be used for staged teardown
+   (e.g., scope finalize). The polling design's destroy doesn't need it
+   but the *order* of teardown might matter for upstream's aicpu_executor
+   loop. Not the boot-time hang, but a leak/reset issue downstream.
+4. **AICPU dispatch handshake**: upstream's aicpu_executor may have
+   ordering expectations around when the polling design's wiring queue
+   is initialized vs when the AICore handshake fires. The polling
+   scheduler initializes wiring lazily in `init_data_from_layout`; if
+   upstream's executor handshakes AICore *before* the wiring queue is
+   ready, AICore spins for tasks that never arrive.
+
+The fix path: thread true per-ring sizes through `PTO2SharedMemoryHandle`
+(currently the polling code uses a uniform per-ring layout — needs to
+honor the array), then add a runtime trace point at the boundary
+between aicpu_executor's `init_per_ring` call and the scheduler's first
+`drain_wiring_queue` to confirm where the AICore handshake is firing
+vs when the wiring becomes ready.
+
+## What to do next session
+
+1. `git rebase upstream/main`, apply the resolutions above (the order is
+   mechanical now that this doc records them).
+2. Build (should compile clean as documented).
+3. Run `paged_attention` Case1 to confirm the runtime hang reproduces.
+4. Add device-side `LOG_INFO_V0` traces at:
+   - `PTO2SharedMemoryHandle::init_per_ring` entry/exit (per ring)
+   - `AicpuExecutor::run` immediately before / after the first scheduler
+     `drain_wiring_queue` call
+   - `SchedulerContext::on_orchestration_done` entry
+5. Diagnose the gap revealed by the traces; align the polling SHM /
+   wiring init order with upstream's handshake.
+6. Re-run the 26-test benchmark sweep (the one in `PR_NOTES.md`) and
+   confirm parity with the pre-rebase result.
+
+## Quick repro recipe
+
+```bash
+git checkout polling-pr-minimal             # HEAD = 188be7e4
+git rebase upstream/main                    # 15 conflicts
+
+# Take theirs (polling) for files we rewrote:
+git checkout --theirs \
+  src/a2a3/runtime/tensormap_and_ringbuffer/runtime/{pto_runtime2_types.h,pto_runtime2.cpp,pto_runtime2.h,pto_orchestrator.cpp,pto_orchestrator.h,pto_dep_compute.h,scheduler/pto_scheduler.h,scheduler/scheduler_context.h,scheduler/scheduler_cold_path.cpp,scheduler/scheduler_dispatch.cpp} \
+  src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
+
+# Take ours (upstream) for files where upstream adds features:
+git checkout --ours \
+  src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp \
+  src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp \
+  src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/{pto_arg_with_deps.h,pto_orchestration_api.h}
+
+git add -u src/
+
+# Apply compile-fixes (see "Per-file conflict matrix" for details).
+# Build is clean after these. Runtime hangs — see "Runtime hang — root
+# cause hypothesis" above for the next investigation steps.
+```
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h b/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
index 768e6a612..ba83a8b5c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
@@ -63,7 +63,7 @@
  *     compiled, ran without error, and produced wrong output. Use
  *     `get_sub_block_id(args)` instead, which reads from the runtime's
  *     `GlobalContext.sub_block_id` that the scheduler initializes per
- *     AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`.
+ *     AIV core in `scheduler_context.h::SchedulerContext::init`.
  *
  *   - `get_block_idx()` and `get_block_num()` are not redirected to
  *     simpler's LocalContext either — use the `(args)` variants below
@@ -97,7 +97,7 @@ static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2;
 
 /**
  * Args[] suffix indices for context pointers.
- * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16).
+ * Derived from MAX_TENSOR_ARGS(16) + MAX_SCALAR_ARGS(32).
  * Users should not depend on these values; use the Get* functions below.
  */
 static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
index dbbfb5cd0..0ec9b155f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
@@ -179,8 +179,9 @@ Each ring's `last_task_alive` advances independently:
 
 ```text
 advance_ring_pointers(ring_id):  // protected by per-ring advance_lock
-    la = ring->fc.last_task_alive
-    while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED:
+    watermark = ring->completed_watermark
+    la = last_task_alive
+    while la <= watermark and watermark >= slot[la].last_consumer_local_id:
         reset slot for reuse
         la++
     sync_to_sm()  // release-store last_task_alive
@@ -235,91 +236,25 @@ AICore uses `last_reg_val` to detect new dispatches — identical values cause s
 | `PTO2_HEAP_SIZE` | 256 MB | 1 GB |
 | `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 |
 
-### 7.2 Runtime Overrides
-
-Each ring resource (`ring_task_window` / `ring_heap` / `ring_dep_pool`) is a
-single `CallConfig.runtime_env` field that accepts **either** a scalar (broadcast
-to every ring) **or** a list of four per-ring values. Precedence is resolved
-independently for each resource and ring:
-
-```text
-per-ring CallConfig entry (a scalar is broadcast to every entry)
-  > per-ring PTO2_RING_* env value
-  > scalar PTO2_RING_* env value
-  > compile-time default
-```
-
-`ring_id` is the scope-depth ring selected by the runtime:
-
-```text
-scope depth 0 -> ring 0
-scope depth 1 -> ring 1
-scope depth 2 -> ring 2
-scope depth >=3 -> ring 3
-```
+### 7.2 Runtime Environment Overrides
 
-Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can
-each carry their own sizes. Invalid values raise at submit time (`validate()`).
-Assign a scalar to size every ring the same:
-
-```python
-cfg = CallConfig()
-cfg.runtime_env.ring_task_window = 128   # power of 2, >= 4
-cfg.runtime_env.ring_heap = 262144       # bytes/ring, >= 1024
-cfg.runtime_env.ring_dep_pool = 256      # 4 .. INT32_MAX
-orchestrator.submit_next_level(handle, args, cfg)
-```
-
-Assign a four-entry list to tune the scope-depth rings independently. The list
-must contain exactly four entries; use `0` for an entry that should fall through
-to the next precedence tier. All `CallConfig` values are integer byte/count
-values, and each field always reads back as a four-entry list.
-
-```python
-cfg = CallConfig()
-cfg.runtime_env.ring_task_window = [8192, 16384, 131072, 524288]
-cfg.runtime_env.ring_heap = [
-    128 * 1024 * 1024,
-    256 * 1024 * 1024,
-    384 * 1024 * 1024,
-    512 * 1024 * 1024,
-]
-cfg.runtime_env.ring_dep_pool = [4096, 8192, 16384, 32768]
-orchestrator.submit_next_level(handle, args, cfg)
-```
-
-Scene tests set the same keys under a nested `runtime_env` block in the
-per-case `config` dict — each value is a scalar or a four-entry list:
-
-```python
-"config": {
-    "runtime_env": {
-        "ring_task_window": [8192, 16384, 131072, 524288],
-        "ring_heap": [134217728, 268435456, 402653184, 536870912],
-        "ring_dep_pool": 256,  # scalar broadcasts to every ring
-    }
-}
-```
-
-Process-wide env fallback accepts either one scalar value or exactly four
-comma-separated per-ring values. Invalid env values are logged and ignored, then
-fall through to defaults. `PTO2_RING_HEAP` values are integer bytes:
+Uniform (applies to all rings):
 
 ```bash
-# Uniform, old behavior:
 PTO2_RING_TASK_WINDOW=1024
 PTO2_RING_HEAP=1048576
 PTO2_RING_DEP_POOL=1024
-
-# Per-ring, indexed by ring_id 0..3:
-PTO2_RING_TASK_WINDOW=8192,16384,131072,524288
-PTO2_RING_HEAP=134217728,268435456,402653184,536870912
-PTO2_RING_DEP_POOL=4096,8192,16384,32768
 ```
 
-Use `--enable-scope-stats` to confirm the effective values for a real run. The
-first line of `scope_stats/scope_stats.jsonl` includes `task_window_max`,
-`heap_max`, and `dep_pool_max`, indexed by `ring`.
+In `kernel_config.py`:
+
+```python
+RUNTIME_ENV = {
+    "PTO2_RING_TASK_WINDOW": "128",
+    "PTO2_RING_HEAP": "262144",
+    "PTO2_RING_DEP_POOL": "256",
+}
+```
 
 ### 7.3 Sizing Guidelines
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index ef59d2e98..66e41cd38 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -538,7 +538,7 @@ This is protected by a per-ring try-lock (`advance_lock`) in `RingSchedState`, e
 
 ### 8.5 SchedulerContext
 
-All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and the orchestration SO handle, and delegates everything else to `SchedulerContext`.
+All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and the orchestration SO handle, and delegates everything else to `SchedulerContext`.
 
 Public surface (called from `AicpuExecutor::init/run/deinit`):
 
@@ -552,11 +552,7 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):
 | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
 | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` |
 
-Private internals are split across three .cpp files by responsibility:
-
-- `scheduler_completion.cpp` — completion polling, drain protocol
-- `scheduler_dispatch.cpp` — task dispatch loop and helpers
-- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done`
+Private internals all live inline in `scheduler_context.h`, covering completion polling, drain protocol, task dispatch loop and helpers, exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`.
 
 `AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
index ef1de83b4..94cc8a569 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
@@ -32,7 +32,7 @@ addr null-check → TensorMap lookup → spin-wait producer COMPLETED → comput
 
 - **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0
 - **TensorMap lookup**: find producer task by `buffer.addr`
-- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED`
+- **spin-wait**: wait until producer's `completion_flags[local_id & mask] == 1`
 - **No producer** (lookup callback never fires): skip waiting, read immediately
 
 ### 3.2 set_tensor_data Flow
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
index af661d440..a5aa05bdd 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
@@ -52,7 +52,7 @@ Thread 3: PTO2 total submitted tasks = 16704
 
 ### Field Reference
 
-| Field | Source (`pto_orchestrator.cpp`) | Description |
+| Field | Source (`pto_orchestrator.h`) | Description |
 | ----- | ------------------------------- | ----------- |
 | **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead |
 | **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks |
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index b49025e11..e6c70c06b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -48,7 +48,7 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - Debug/diagnostic logs (always present)
 - Progress tracking (`PTO2 progress: completed=...`)
-- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget)
+- Stall detection and dump (triggered only after `MAX_IDLE_ITERATIONS` idle loops)
 - Deadlock/livelock detection (`diagnose_stuck_state`, called on stall)
 
 **What's NOT compiled:**
@@ -255,7 +255,7 @@ Identity fields the AICPU side used to write at level 1 (`func_id`,
   collector (`L2SwimlaneCollector::set_core_types`).
 
 AICore buffer rotation no longer piggy-backs on `complete_task`. AICPU
-counts dispatches per core in the dispatch path (scheduler_dispatch in
+counts dispatches per core in the dispatch path (scheduler_context in
 tensormap_and_ringbuffer; aicpu_executor in host_build_graph) and rotates
 the AICore buffer when the count is about to cross a
 `PLATFORM_AICORE_BUFFER_SIZE` boundary — strictly before
@@ -428,7 +428,7 @@ definitions to runtime headers.
 ### Code Locations
 
 - Macro defaults and validation: `src/common/task_interface/profiling_config.h`
-- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
+- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h`
 - Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
 - TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 779f92b58..16938562d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -556,7 +556,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         // `explicit_dep_count` / `over->dep_count` originate from device
         // shared memory and are bounded by the writer to the array sizes, but
         // we clamp on read too so a corrupted record never drives an OOB read
-        // off the end of rec.explicit_deps[64] / over->deps[582].
+        // off the end of rec.explicit_deps[64] / over->deps[326].
         const uint64_t *deps_data;
         int32_t dc;
         if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
index c4878a1c2..8768359de 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
@@ -10,6 +10,13 @@
  */
 #include "common.h"
 
+// LOG_ERROR can't be pulled from common/unified_log.h here because that header
+// would re-#define LOG_INFO_V0..V9 already provided by pto_orchestration_api.h
+// (orchestration routes them through the runtime ops table). For the limited
+// use inside this file, write directly to stderr.
+#include <cstdio>
+#define LOG_ERROR(fmt, ...) std::fprintf(stderr, "[ERROR] " fmt "\n", ##__VA_ARGS__)
+
 #ifdef __linux__
 #include <cxxabi.h>
 #include <dlfcn.h>
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
index 0f73a043a..d2eb173c2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
@@ -19,21 +19,10 @@
 #include "pto_constants.h"
 #include "pto_task_id.h"
 
-// AICPU-only MPSC ring used to convey deferred-completion observations from
-// FIN-handling scheduler threads to the dispatch thread. Producers push under
-// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList::
-// busy) drains in seq order. Kernel-side code never touches this struct —
-// AICore writes go into DeferredCompletionSlab (see
-// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens
-// into messages here, and forwards.
-
 #define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u
 #define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)
 
-static_assert(
-    (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0,
-    "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two"
-);
+static_assert((AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0, "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two");
 
 // Mailbox message discriminator. CONDITION carries one deferred-completion
 // observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE
@@ -45,16 +34,10 @@ static_assert(
 #define MSG_KIND_CONDITION 0u
 #define MSG_KIND_TASK_NORMAL_DONE 1u
 
-struct AICoreCompletionMailboxMessage {
-    // Per-slot ready flag. Producer publishes `tail+1` after filling the rest
-    // of the slot with a release store; consumer waits for the matching seq
-    // value with an acquire load. The release-acquire pair publishes all
-    // other fields below as a side effect, so they stay plain.
+struct AICoreCompletionMailboxMessage
+{
     std::atomic<uint64_t> seq;
     PTO2TaskId task_token;
-    // CONDITION: completion observation addr (counter / SDMA event record).
-    // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer
-    //   so it can finalize the AsyncWaitEntry.slot_state binding.
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -64,19 +47,11 @@ struct AICoreCompletionMailboxMessage {
 };
 
 static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift");
-static_assert(
-    sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
-    "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold"
-);
-static_assert(
-    std::atomic<uint64_t>::is_always_lock_free,
-    "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target"
-);
-
-// POD view of a drained message. `seq` is the ring's publication flag, not
-// payload, so try_pop copies out only the fields below (and seq is not even
-// copyable — it is a std::atomic).
-struct AICoreCompletionMsgView {
+static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold");
+static_assert(std::atomic<uint64_t>::is_always_lock_free, "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target");
+
+struct AICoreCompletionMsgView
+{
     PTO2TaskId task_token{PTO2TaskId::invalid()};
     uint64_t addr{0};
     uint32_t expected_value{0};
@@ -85,7 +60,8 @@ struct AICoreCompletionMsgView {
     uint32_t kind{0};
 };
 
-struct AICoreCompletionMailbox {
+struct AICoreCompletionMailbox
+{
     // head and tail live on their own cache lines so producer CAS contention
     // on head can't false-share with the consumer's tail updates.
     alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> head;
@@ -96,32 +72,21 @@ struct AICoreCompletionMailbox {
 
     // Cheap, lock-free pending hint. Callers may invoke this outside the
     // consumer lock; a stale answer only over/under-triggers a drain attempt.
-    bool has_pending() { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); }
-
-    // MPSC push for a CONDITION message. Returns false when the ring is full
-    // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry.
-    // Lock-free: CAS the shared head to claim a slot, write the fields, then
-    // release-store seq so the single consumer observes the publication.
-    //
-    // The head CAS is relaxed: head is a pure ticket counter and carries no
-    // data to the consumer — publication is solely the seq release-store, and
-    // slot-reuse safety rests on the acquire load of tail. The relaxed failure
-    // order is likewise sufficient since a lost CAS just re-reads head and
-    // retries. compare_exchange_weak is used because this loop already re-reads
-    // head and re-checks fullness, so masking LL/SC spurious failures (what
-    // _strong adds on aarch64) would only be a redundant inner retry.
-    //
-    // Safe to call concurrently from any number of producers; structurally
-    // independent of the AsyncWaitList::busy lock.
-    bool try_push_condition(
-        PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type
-    ) {
-        while (true) {
+    bool has_pending()  // True when tail < head, i.e. at least one slot has been claimed by a producer and not yet popped.
+    {  // A claimed slot may not be published (seq) yet, so a true result does not guarantee that try_pop() succeeds.
+        return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire);
+    }
+
+    bool try_push_condition(PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type)
+    {
+        while (true)
+        {
             uint64_t h = head.load(std::memory_order_relaxed);
             uint64_t t = tail.load(std::memory_order_acquire);
             if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
             uint64_t new_head = h + 1;
-            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed))
+            {
                 AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
                 slot->task_token.raw = task_token.raw;
                 slot->addr = addr;
@@ -136,16 +101,16 @@ struct AICoreCompletionMailbox {
         }
     }
 
-    // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState
-    // pointer in the `addr` field so the consumer can finish binding the
-    // AsyncWaitEntry.slot_state without going back to the FIN-handling thread.
-    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) {
-        while (true) {
+    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr)
+    {
+        while (true)
+        {
             uint64_t h = head.load(std::memory_order_relaxed);
             uint64_t t = tail.load(std::memory_order_acquire);
             if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
             uint64_t new_head = h + 1;
-            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed))
+            {
                 AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
                 slot->task_token.raw = task_token.raw;
                 slot->addr = slot_state_addr;
@@ -159,13 +124,8 @@ struct AICoreCompletionMailbox {
         }
     }
 
-    // Single-consumer transport-level dequeue (caller holds the consumer lock).
-    // Returns false at the first not-yet-published slot (gap) or when empty;
-    // otherwise copies the next message in tail order into `out`, advances
-    // tail, and returns true. tail is consumer-only-written (relaxed read);
-    // head bounds the scan (relaxed); the seq acquire is the real publication
-    // gate; the tail release publishes "slot free" to reusing producers.
-    bool try_pop(AICoreCompletionMsgView &out) {
+    bool try_pop(AICoreCompletionMsgView &out)
+    {
         uint64_t t = tail.load(std::memory_order_relaxed);
         uint64_t h = head.load(std::memory_order_relaxed);
         if (t >= h) return false;
@@ -182,8 +142,6 @@ struct AICoreCompletionMailbox {
     }
 };
 
-static_assert(
-    sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned"
-);
+static_assert(sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
index da0d89ad7..5617cd6d4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
@@ -16,16 +16,6 @@
 
 #include "pto_constants.h"
 
-// Types shared across the AICore↔AICPU boundary.
-//
-// This header is reachable from AICore-side translation units (via
-// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h)
-// and must stay parseable by every AICore toolchain configuration: no
-// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct.
-//
-// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in
-// aicore_completion_mailbox.h, which is AICPU-only.
-
 inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
 
 #define COMPLETION_ENGINE_SDMA 0u
@@ -36,14 +26,8 @@ inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
 #define COMPLETION_TYPE_COUNTER 0
 #define COMPLETION_TYPE_SDMA_EVENT_RECORD 1
 
-// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch
-// area that AICore writes into to record "this completion has to be observed
-// before the task can retire." The FIN-handling scheduler thread reads the
-// slab, flattens entries into AICoreCompletionMailbox messages, and forwards
-// them to the dispatch thread. `volatile` here is load-bearing: writers live
-// on AICore and readers on AICPU, so the qualifier is the correct way to
-// pin the compiler against caching / reordering on either side.
-struct DeferredCompletionEntry {
+struct DeferredCompletionEntry
+{
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -53,15 +37,13 @@ struct DeferredCompletionEntry {
 
 static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift");
 
-struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab {
+struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab  // Per-task scratch area: AICore writes completions into it, AICPU reads them back.
+{  // count/error_code are volatile on purpose: writer and reader sit on different processors, so neither side may cache them.
     volatile uint32_t count;
     volatile int32_t error_code;
     DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK];
 };
 
-static_assert(
-    sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0,
-    "DeferredCompletionSlab size must preserve array element cache-line boundaries"
-);
+static_assert(sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0, "DeferredCompletionSlab size must preserve array element cache-line boundaries");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
index 0ff21908f..eff33dba6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
@@ -31,24 +31,15 @@
 // <pto/npu/comm/async/sdma/sdma_types.hpp> just to spell their scratch tile.
 inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE;
 
-enum class SdmaOp : uint8_t {
+enum class SdmaOp : uint8_t  // Transfer kind stored in SdmaRequestDescriptor::op.
+{  // send_request_entry issues TGET_ASYNC for TGET and TPUT_ASYNC for any other value.
     TGET = 0,
     TPUT = 1,
 };
 
-// SdmaRequestDescriptor bundles everything send_request_entry needs to drive
-// one SDMA transfer + completion registration. It is a template because the
-// destination / source / scratch types carry tensor shape & stride at compile
-// time; the SdmaTget() / SdmaTput() helpers below let callers skip the
-// template arguments.
-//
-// sync_id selects which event-record slot inside the workspace the engine
-// writes into. Concurrent dispatches must use distinct sync_ids; today every
-// caller submits one request per kernel invocation so passing 0 is safe.
-// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2)
-// will fold sync_id allocation into the adapter.
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-struct SdmaRequestDescriptor {
+struct SdmaRequestDescriptor
+{
     SdmaOp op;
     DstTensor dst;
     SrcTensor src;
@@ -58,45 +49,38 @@ struct SdmaRequestDescriptor {
 };
 
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(
-    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
-    uint32_t sync_id = 0
-) {
-    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst,       src,
-                                                                     scratch,      workspace, sync_id};
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, uint32_t sync_id = 0)
+{  // Deduction helper: builds a TGET descriptor so callers need not spell the template arguments.
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst, src, scratch, workspace, sync_id};  // sync_id picks the workspace event-record slot; concurrent dispatches need distinct ids.
 }
 
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(
-    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
-    uint32_t sync_id = 0
-) {
-    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst,       src,
-                                                                     scratch,      workspace, sync_id};
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, uint32_t sync_id = 0)
+{  // Deduction helper: builds a TPUT descriptor so callers need not spell the template arguments.
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst, src, scratch, workspace, sync_id};  // sync_id picks the workspace event-record slot; concurrent dispatches need distinct ids.
 }
 
 namespace pto2::detail {
 
-inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) {
-    CompletionToken token{
-        reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0
-    };
+inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr)
+{  // Registers record_addr on ctx as a COMPLETION_TYPE_SDMA_EVENT_RECORD condition; the registration result is discarded.
+    CompletionToken token{reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0};  // NOTE(review): second field is presumably the expected value (unused for event records) — confirm against CompletionToken.
     (void)register_completion_condition(ctx, token);
 }
 
 template <typename PtoAsyncEvent, typename PtoAsyncSession>
-inline __aicore__ void
-register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) {
-    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+inline __aicore__ void register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session)
+{
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr)
+    {
         (void)event.Wait(session);
         return;
     }
-    if (event.handle == 0) {
-        return;
-    }
+    if (event.handle == 0) return;
 
     const uint32_t engine = static_cast<uint32_t>(event.engine);
-    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA)) {
+    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA))
+    {
         defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return;
     }
@@ -105,38 +89,29 @@ register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsy
     uint32_t sync_id = 0;
     __gm__ uint8_t *recv_workspace = nullptr;
     uint32_t queue_num = 0;
-    if (!::pto::comm::sdma::detail::PrepareEventCheck(
-            session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num
-        )) {
+    if (!::pto::comm::sdma::detail::PrepareEventCheck(session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num))
+    {
         defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return;
     }
-    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) {
-        register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
-    }
+    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
 }
 
 }  // namespace pto2::detail
 
-// SDMA overload of the runtime's send_request_entry. Submits the descriptor
-// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the
-// AsyncCtx deferred-wait slab and flushes. Returns false on submit/session
-// failure (also records the error in ctx.completion_error_code).
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ bool
-send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc) {
+inline __aicore__ bool send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc)
+{
     pto::comm::AsyncSession session;
-    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) {
+    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id))
+    {
         pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return false;
     }
 
     pto::comm::AsyncEvent event;
-    if (desc.op == SdmaOp::TGET) {
-        event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
-    } else {
-        event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
-    }
+    if (desc.op == SdmaOp::TGET) event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
+    else event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
     pto2::detail::register_pto_async_event(ctx, event, session);
     pto2::detail::defer_flush(ctx);
     return true;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
index 689219c35..577e5138d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
@@ -19,10 +19,8 @@
 #include "pto_completion_token.h"
 #include "pto_runtime_status.h"
 
-// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only
-// allowed holder of this ABI knowledge; the generic scheduler dispatches into
-// the helpers below through the completion ops table.
-struct SdmaEventRecord {
+struct SdmaEventRecord
+{
     uint32_t flag;
     uint32_t sq_tail;
     uint64_t channel_info;
@@ -31,25 +29,24 @@ struct SdmaEventRecord {
 static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift");
 static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift");
 
-inline uintptr_t sdma_completion_cache_line(const volatile void *addr) {
+inline uintptr_t sdma_completion_cache_line(const volatile void *addr)
+{  // Rounds addr down to a multiple of PTO2_ALIGN_SIZE (the mask assumes a power-of-two size — TODO confirm).
     return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
 }
 
-inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) {
-    if (record_addr == 0) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
-    volatile SdmaEventRecord *record =
-        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr)
+{  // Polls one SdmaEventRecord: READY once its flag is non-zero, PENDING otherwise, FAILED for a zero address.
+    if (record_addr == 0) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};  // a zero address is reported as invalid instead of being dereferenced
+    volatile SdmaEventRecord *record = reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));  // the aligned line holding the record is invalidated below before flag is loaded
     cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
     uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE);
     return {flag != 0 ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
 }
 
-inline void retire_sdma_event_record(uint64_t record_addr) {
+inline void retire_sdma_event_record(uint64_t record_addr)
+{
     if (record_addr == 0) return;
-    volatile SdmaEventRecord *record =
-        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    volatile SdmaEventRecord *record = reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
     cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
     uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE);
     uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
index e1bb3465e..bd9b1adb8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
@@ -9,29 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * @file pto2_dispatch_payload.h
- * @brief Per-core dispatch payload for AICore kernel execution
- *
- * PTO2DispatchPayload holds the kernel function address, a per-core args[]
- * array, and embedded SPMD context (LocalContext + GlobalContext).  AICPU
- * maintains a static array of these (one per core).
- *
- * GlobalContext (sub_block_id) is initialized once at runtime startup via
- * init_global_context() and never modified afterwards.
- *
- * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload()
- * before each dispatch.  Both context struct pointers are written into the
- * args[] suffix on every dispatch (since args[] is rebuilt entirely each time).
- *
- * AICore caches a pointer to its per-core slot at startup and reads from
- * it on each dispatch.  The struct is cache-line aligned to avoid false
- * sharing across concurrently dispatched cores.
- *
- * The DATA_MAIN_BASE register protocol is unchanged from the base runtime:
- * a monotonically increasing reg_task_id signals new work to AICore.
- */
-
 #pragma once
 
 #include <stdint.h>
@@ -39,7 +16,6 @@
 #include "arg_direction.h"
 #include "intrinsic.h"
 
-/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */
 #ifndef PTO2_DISPATCH_MAX_ARGS
 #define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT)
 #endif
@@ -49,36 +25,16 @@
 #endif
 
 // Verify hardcoded indices in intrinsic.h match the computed values.
-static_assert(
-    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h"
-);
-static_assert(
-    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX,
-    "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h"
-);
+static_assert((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h");
+static_assert((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX, "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h");
 
-/**
- * Per-core dispatch payload: function address + args[] + SPMD context.
- *
- * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER].
- * AICore caches a pointer to its per-core slot at startup (via Handshake.task)
- * and reads from it on each dispatch.
- *
- * The struct is cache-line aligned to prevent false sharing across
- * concurrently dispatched cores.
- */
-struct alignas(64) PTO2DispatchPayload {
-    uint64_t function_bin_addr;            /**< Kernel entry address in GM (set by Scheduler) */
-    uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */
+struct alignas(64) PTO2DispatchPayload
+{
+    uint64_t function_bin_addr;
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS];
 
-    /** Per-dispatch context: block_idx and block_num.
-     *  Written by build_payload() before each dispatch.
-     *  args[SPMD_LOCAL_CONTEXT_INDEX] points here. */
     LocalContext local_context;
 
-    /** Per-core global context: sub_block_id (AIV lane identity).
-     *  Initialized once by init_global_context() at runtime startup.
-     *  args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */
     GlobalContext global_context;
 
     /** Speculative early-dispatch gate. 0 = ready: AICore executes on pickup.
@@ -88,10 +44,7 @@ struct alignas(64) PTO2DispatchPayload {
     uint8_t reserved_payload_abi_pad[4];
 
     static_assert(sizeof(args[0]) == 8);
-    static_assert(
-        PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) ==
-        (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0])
-    );
+    static_assert(PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) == (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]));
 };
 
 static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift");
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
index cf6eb4790..357a1fdcf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
@@ -29,13 +29,10 @@
 #define __gm__
 #endif
 
-// Public surface: get_async_ctx, async_ctx_is_deferred,
-// register_completion_condition, send_notification,
-// save_expected_notification_counter. Everything else lives in
-// pto2::detail and is reserved for backend adapters / internal use.
 namespace pto2::detail {
 
-inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
+inline __aicore__ void defer_load_slab(AsyncCtx &ctx)
+{
     if (ctx.completion_count == nullptr) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
@@ -45,41 +42,33 @@ inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
 #endif
 }
 
-inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) {
-    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) {
-        *ctx.completion_error_code = error_code;
-    }
+inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code)
+{  // Stores error_code through ctx.completion_error_code; does nothing when the task token is invalid or the pointer is null.
+    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) *ctx.completion_error_code = error_code;
 }
 
-inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) {
+inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes)
+{  // Issues dcci(CACHELINE_OUT) on every PTO2_ALIGN_SIZE line overlapping [addr, addr + size_bytes); a no-op when none of the AICore macros is defined.
     if (addr == nullptr || size_bytes == 0) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
-    uintptr_t end =
-        (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
-    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) {
-        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
-    }
+    uintptr_t end = (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);  // addr + size_bytes rounded up to the next line boundary
+    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);  // one dcci per aligned line in [start, end)
 #else
     (void)addr;
     (void)size_bytes;
 #endif
 }
 
-inline __aicore__ void defer_flush(AsyncCtx &ctx) {
+inline __aicore__ void defer_flush(AsyncCtx &ctx)
+{
     if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uint32_t count = *ctx.completion_count;
-    if (count > ctx.completion_capacity) {
-        count = ctx.completion_capacity;
-    }
+    if (count > ctx.completion_capacity) count = ctx.completion_capacity;
     uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count));
-    if (ctx.completion_error_code != nullptr) {
-        flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
-    }
-    if (ctx.completion_entries != nullptr) {
-        flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
-    }
+    if (ctx.completion_error_code != nullptr) flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
+    if (ctx.completion_entries != nullptr) flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
     defer_flush_range(ctx.completion_count, flush_bytes);
 #if defined(__CPU_SIM)
     dsb(0);
@@ -95,9 +84,9 @@ inline __aicore__ void defer_flush(AsyncCtx &ctx) {
 
 }  // namespace pto2::detail
 
-inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
-    __gm__ LocalContext *lc =
-        reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
+inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args)
+{
+    __gm__ LocalContext *lc = reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
     AsyncCtx ctx{};
     ctx.completion_count = lc->async_ctx.completion_count;
     ctx.completion_error_code = lc->async_ctx.completion_error_code;
@@ -108,23 +97,19 @@ inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
     return ctx;
 }
 
-inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); }
+inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx)
+{  // A context counts as deferred exactly when its task_token is valid.
+    return ctx.task_token.is_valid();
+}
 
-// Canonical writer: backend submit handlers build a CompletionToken and pass
-// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and
-// bumps completion_count. Returns false on overflow (also stores
-// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is
-// not currently a deferred context.
-inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) {
-    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
-        return false;
-    }
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token)
+{
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) return false;
 
     uint32_t idx = *ctx.completion_count;
-    if (idx >= ctx.completion_capacity) {
-        if (ctx.completion_error_code != nullptr) {
-            *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
-        }
+    if (idx >= ctx.completion_capacity)
+    {
+        if (ctx.completion_error_code != nullptr) *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
         return false;
     }
 
@@ -138,18 +123,16 @@ inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const Comple
     return true;
 }
 
-inline __aicore__ void
-send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) {
+inline __aicore__ void send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op)
+{  // Wraps remote_counter_addr in a pto::comm::Signal (as an int32_t counter) and issues TNOTIFY with value and notify_op.
     __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr));
     pto::comm::Signal signal(counter);
     pto::comm::TNOTIFY(signal, value, notify_op);
 }
 
-inline __aicore__ void
-save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) {
-    CompletionToken token{
-        reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0
-    };
+inline __aicore__ void save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value)
+{  // Registers a COMPLETION_TYPE_COUNTER condition (counter_addr, expected_value) on ctx, then calls defer_flush(ctx).
+    CompletionToken token{reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0};  // the registration result below is discarded; the flush runs either way
     (void)register_completion_condition(ctx, token);
     pto2::detail::defer_flush(ctx);
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
index 42a947418..d4c55765a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
@@ -29,12 +29,8 @@ struct CompletionStats;
 
 inline constexpr int32_t MAX_ASYNC_WAITS = 64;
 
-// The mailbox transport (has_pending / try_push_condition /
-// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member
-// functions in aicore_completion_mailbox.h. This file only holds the
-// application layer: translating drained messages into wait-list state.
-
-inline uintptr_t mailbox_cache_line(const volatile void *addr) {
+inline uintptr_t mailbox_cache_line(const volatile void *addr)
+{  // Rounds addr down to a multiple of PTO2_ALIGN_SIZE (the mask assumes a power-of-two size — TODO confirm).
     return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
 }
 
@@ -43,12 +39,14 @@ struct CompletionCondition;
 using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &);
 using CompletionRetireFn = void (*)(CompletionCondition &);
 
-struct CompletionBackendOps {
+struct CompletionBackendOps  // Poll/retire function-pointer pair for one completion type, looked up via completion_backend_ops_for().
+{  // CompletionCondition::test()/retire() treat a null pointer in either slot as "not available".
     CompletionPollFn poll;
     CompletionRetireFn retire;
 };
 
-struct CompletionCondition {
+struct CompletionCondition
+{
     AsyncEngine engine{ASYNC_ENGINE_SDMA};
     int32_t completion_type{COMPLETION_TYPE_COUNTER};
     bool satisfied{false};
@@ -61,28 +59,27 @@ struct CompletionCondition {
     void retire();
 };
 
-// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in
-// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin
-// glue mapping CompletionCondition.addr into the backend's raw-addr helpers.
-inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) {
-    if (cond.counter_addr == nullptr) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
-    return {
-        *cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING,
-        PTO2_ERROR_NONE
-    };
+inline CompletionPollResult counter_poll_op(const CompletionCondition &cond)
+{  // Poll op for COMPLETION_TYPE_COUNTER: READY once *counter_addr >= expected_value, PENDING below it.
+    if (cond.counter_addr == nullptr) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};  // a null counter is reported as invalid instead of being dereferenced
+    return {*cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
 }
 
-inline void counter_retire_op(CompletionCondition & /*cond*/) {}
+inline void counter_retire_op(CompletionCondition &)  // Retire op for COMPLETION_TYPE_COUNTER.
+{}  // Intentionally empty: the argument is unused and no retire-time work is done for counters.
 
-inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) {
+inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond)
+{  // Thin glue: forwards cond.addr to the SDMA backend's raw-address poll helper.
     return poll_sdma_event_record(cond.addr);
 }
 
-inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); }
+inline void sdma_event_record_retire_op(CompletionCondition &cond)
+{  // Thin glue: forwards cond.addr to the SDMA backend's raw-address retire helper.
+    retire_sdma_event_record(cond.addr);
+}
 
-inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) {
+inline const CompletionBackendOps *completion_backend_ops_for(int completion_type)
+{
     static const CompletionBackendOps kOps[] = {
         {counter_poll_op, counter_retire_op},                      // COMPLETION_TYPE_COUNTER = 0
         {sdma_event_record_poll_op, sdma_event_record_retire_op},  // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1
@@ -92,27 +89,24 @@ inline const CompletionBackendOps *completion_backend_ops_for(int completion_typ
     return &kOps[completion_type];
 }
 
-inline CompletionPollResult CompletionCondition::test() const {
-    if (satisfied) {
-        return {CompletionPollState::READY, PTO2_ERROR_NONE};
-    }
+inline CompletionPollResult CompletionCondition::test() const
+{
+    if (satisfied) return {CompletionPollState::READY, PTO2_ERROR_NONE};
     const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
-    if (ops == nullptr || ops->poll == nullptr) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
+    if (ops == nullptr || ops->poll == nullptr) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
     return ops->poll(*this);
 }
 
-inline void CompletionCondition::retire() {
+inline void CompletionCondition::retire()
+{
     if (retired) return;
     const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
-    if (ops != nullptr && ops->retire != nullptr) {
-        ops->retire(*this);
-    }
+    if (ops != nullptr && ops->retire != nullptr) ops->retire(*this);
     retired = true;
 }
 
-struct AsyncWaitEntry {
+struct AsyncWaitEntry
+{
     PTO2TaskSlotState *slot_state{nullptr};
     PTO2TaskId task_token{PTO2TaskId::invalid()};
     CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK];
@@ -121,14 +115,17 @@ struct AsyncWaitEntry {
     bool normal_done{false};
 };
 
-struct AsyncPollResult {
+struct AsyncPollResult
+{
     int32_t completed{0};
     int32_t error_code{PTO2_ERROR_NONE};
     PTO2TaskSlotState *failed_slot_state{nullptr};
 };
 
-inline const char *async_engine_name(AsyncEngine engine) {
-    switch (engine) {
+inline const char *async_engine_name(AsyncEngine engine)
+{
+    switch (engine)
+    {
     case ASYNC_ENGINE_SDMA:
         return "SDMA";
     case ASYNC_ENGINE_ROCE:
@@ -142,81 +139,69 @@ inline const char *async_engine_name(AsyncEngine engine) {
     }
 }
 
-struct AsyncWaitList {
+struct AsyncWaitList
+{
     std::atomic<int32_t> busy{0};
     AsyncWaitEntry entries[MAX_ASYNC_WAITS];
     int32_t count{0};
-    // Diagnostic: counts every FIN-side try_push that hit a full mailbox.
-    // Expected to stay zero on real workloads (ring is 4096 entries); a
-    // non-zero value means consumers are too slow or the ring is undersized.
-    // Read by scheduler shutdown / l2 perf summary; not on the hot path.
     std::atomic<uint64_t> mpsc_skipped_count{0};
 
-    void reset_for_reuse() {
+    void reset_for_reuse()
+    {
         busy.store(0, std::memory_order_relaxed);
         count = 0;
         mpsc_skipped_count.store(0, std::memory_order_relaxed);
     }
 
-    bool try_lock() {
+    bool try_lock()
+    {
         int32_t expected = 0;
         return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed);
     }
 
-    void unlock() { busy.store(0, std::memory_order_release); }
+    void unlock()
+    {
+        busy.store(0, std::memory_order_release);
+    }
 
-    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) {
-        for (int32_t i = 0; i < count; i++) {
+    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token)
+    {
+        for (int32_t i = 0; i < count; i++)
             if (entries[i].task_token == token) return &entries[i];
-        }
         return nullptr;
     }
 
-    // Captures the side-channel a scheduler-aware drain needs to complete
-    // NotDeferred tasks inline (without storing a transient entry in
-    // entries[]).
-    struct DrainCompletionSink {
+    struct DrainCompletionSink
+    {
         PTO2SchedulerState *sched{nullptr};
-        PTO2LocalReadyBuffer *local_bufs{nullptr};
-        PTO2TaskSlotState **deferred_release_slot_states{nullptr};
-        int32_t *deferred_release_count{nullptr};
-        int32_t deferred_release_capacity{0};
         int32_t inline_completed{0};
-#if PTO2_SCHED_PROFILING
-        int32_t thread_idx{0};
-#endif
 
-        bool can_inline_complete() const { return sched != nullptr; }
+        bool can_inline_complete() const
+        {
+            return sched != nullptr;
+        }
     };
 
-    // Inline-complete a NotDeferred task during drain. Returns false on
-    // deferred_release_slot_states overflow.
+    // Inline-complete a NotDeferred task during drain. The mailbox drain below discards the returned bool.
     bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
 
-    // Single-consumer drain: pop each published message in tail order and
-    // translate it into wait-list state. An empty sink (sched == nullptr) just
-    // materializes entries; a sched-aware sink additionally inline-completes
-    // lonely NotDeferred NORMAL_DONEs without ever growing entries[].
-    int32_t drain_aicore_completion_mailbox_locked(
-        AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code
-    ) {
+    int32_t drain_aicore_completion_mailbox_locked(AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code)
+    {
         error_code = PTO2_ERROR_NONE;
         if (aicore_mailbox == nullptr) return 0;
 
         int32_t drained = 0;
         AICoreCompletionMsgView msg;
-        // try_pop is the transport layer (seq-gated, in-order dequeue); this
-        // loop is the application layer (translate each message into wait-list
-        // state). try_pop returns false at the first gap or when empty.
-        while (aicore_mailbox->try_pop(msg)) {
+        while (aicore_mailbox->try_pop(msg))
+        {
             drained++;
-            if (msg.kind == MSG_KIND_CONDITION) {
+            if (msg.kind == MSG_KIND_CONDITION)
+            {
                 AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
-                if (entry == nullptr) {
-                    // First message for this task — materialize the entry here.
-                    // slot_state stays null until the matching TASK_NORMAL_DONE
-                    // sentinel arrives.
-                    if (count >= MAX_ASYNC_WAITS) {
+                if (entry == nullptr)
+                {
+                    if (count >= MAX_ASYNC_WAITS)
+                    {
                         error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
                         return drained;
                     }
@@ -227,28 +212,21 @@ struct AsyncWaitList {
                     entry->waiting_completion_count = 0;
                     entry->normal_done = false;
                 }
-                if (!append_condition_locked(
-                        *entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type,
-                        error_code
-                    )) {
-                    return drained;
-                }
-            } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) {
-                PTO2TaskSlotState *slot_state_ptr =
-                    reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
+                if (!append_condition_locked(*entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type, error_code)) return drained;
+            }
+            else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE)
+            {
+                PTO2TaskSlotState *slot_state_ptr = reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
                 AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
-                if (entry == nullptr) {
-                    // Producers strictly order: all CONDITIONs for token T are
-                    // pushed before the matching NORMAL_DONE (the acq_rel on
-                    // on_subtask_complete enforces this across producers). So
-                    // observing NORMAL_DONE first => the task registered no
-                    // conditions => NotDeferred. Complete it inline when the
-                    // sink allows; otherwise fall back to the entry-store path.
-                    if (sink.can_inline_complete()) {
+                if (entry == nullptr)
+                {
+                    if (sink.can_inline_complete())
+                    {
                         (void)try_inline_complete_locked(sink, *slot_state_ptr);
                         continue;
                     }
-                    if (count >= MAX_ASYNC_WAITS) {
+                    if (count >= MAX_ASYNC_WAITS)
+                    {
                         error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
                         return drained;
                     }
@@ -258,13 +236,15 @@ struct AsyncWaitList {
                     entry->condition_count = 0;
                     entry->waiting_completion_count = 0;
                     entry->normal_done = true;
-                } else {
-                    if (entry->slot_state == nullptr) {
-                        entry->slot_state = slot_state_ptr;
-                    }
+                }
+                else
+                {
+                    if (entry->slot_state == nullptr) entry->slot_state = slot_state_ptr;
                     entry->normal_done = true;
                 }
-            } else {
+            }
+            else
+            {
                 error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
                 return drained;
             }
@@ -272,11 +252,10 @@ struct AsyncWaitList {
         return drained;
     }
 
-    bool append_condition_locked(
-        AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type,
-        int32_t &error_code
-    ) {
-        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) {
+    bool append_condition_locked(AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type, int32_t &error_code)
+    {
+        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK)
+        {
             error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
             return false;
         }
@@ -286,24 +265,14 @@ struct AsyncWaitList {
         cond.satisfied = false;
         cond.retired = false;
         cond.addr = addr;
-        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ?
-                                reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) :
-                                nullptr;
+        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ? reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) : nullptr;
         cond.expected_value = expected_value;
         entry.waiting_completion_count++;
         return true;
     }
 
     template <bool Profiling>
-    AsyncPollResult poll_and_complete(
-        AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
-        PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count,
-        int32_t deferred_release_capacity
-#if PTO2_SCHED_PROFILING
-        ,
-        int thread_idx
-#endif
-    );
+    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched);
 };
 
 #endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
index c5a8c345f..d017f8597 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
@@ -17,13 +17,8 @@
 #include "aicore_completion_mailbox_types.h"
 #include "pto_runtime_status.h"
 
-// CompletionToken is the runtime-internal POD that backend submit handlers
-// produce and the generic register_completion_condition() consumes. It is the
-// ABI contract for "this is one completion to wait on" — independent of which
-// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's
-// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by
-// completion_type.
-struct CompletionToken {
+struct CompletionToken
+{
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -31,13 +26,15 @@ struct CompletionToken {
     uint64_t backend_cookie;
 };
 
-enum class CompletionPollState : uint8_t {
+enum class CompletionPollState : uint8_t
+{
     PENDING = 0,
     READY = 1,
     FAILED = 2,
 };
 
-struct CompletionPollResult {
+struct CompletionPollResult
+{
     CompletionPollState state{CompletionPollState::PENDING};
     int32_t error_code{PTO2_ERROR_NONE};
 };
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
index f8392dfbf..5373b20f2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
@@ -9,37 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * @file pto_dep_compute.h
- * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay.
- *
- * Two header-only template entry points:
- *
- *   compute_task_fanin     — STEP 3 in submit_task: per-tensor creator retention (Step A)
- *                            + tensormap.lookup for INPUT/INOUT (Step B). Calls back into
- *                            user-supplied `emit` for each producer it identifies.
- *
- *   register_task_outputs  — STEP 4 in submit_task: tensormap.insert for INOUT and
- *                            OUTPUT_EXISTING tensors. No callbacks.
- *
- * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its
- * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the
- * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would
- * require two emit semantics or a marginal behavior change in transients — not worth
- * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own.
- *
- * The Emit callback contract:
- *   bool emit(PTO2TaskId producer);
- *     - return true to continue (whether or not the producer was actually recorded —
- *       producer-not-alive / dedup-hit / etc. all return true silently)
- *     - return false to signal fatal (e.g. fanin spill overflow); caller bails
- *
- * Performance: Emit is a template parameter, not std::function. Both runtime
- * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge
- * vector) instantiate at the call site and inline through. Do NOT replace with
- * std::function — it would break the inlining and add ~5 ns/call to the orch hot path.
- */
-
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
 
@@ -50,14 +19,8 @@
 #include "pto_types.h"  // TensorRef
 #include "tensor.h"
 
-/**
- * View struct for inputs to compute_task_fanin / register_task_outputs.
- *
- * Both runtime and replay assemble one of these from their own data sources
- * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All
- * pointer arrays must remain valid for the duration of the call.
- */
-struct DepInputs {
+struct DepInputs
+{
     int32_t tensor_count;
     const TensorRef *tensors;        // length = tensor_count (union; OUTPUT slots' .ptr is unused)
     const TensorArgType *arg_types;  // length = tensor_count
@@ -65,28 +28,16 @@ struct DepInputs {
     const PTO2TaskId *explicit_deps;  // length = explicit_dep_count (validity checked by caller)
 };
 
-/**
- * Compute fanin for a task being submitted (STEP 3: Step A creator retention +
- * Step B tensormap modifier lookup).
- *
- * For each non-OUTPUT tensor:
- *   - If owner_task_id is valid, emit(owner)
- *   - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit
- *     each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry).
- *
- * @return true on success (or producer-skipped-silently); false if emit signaled
- *         fatal — caller should propagate (after any fatal bookkeeping done by emit).
- */
 template <typename Emit>
-[[nodiscard]] inline bool
-compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) {
-    if (in_manual_scope) {
-        return true;
-    }
+[[nodiscard]] inline bool compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit)
+{
+    if (in_manual_scope) return true;
 
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+    for (int32_t i = 0; i < inputs.tensor_count; i++)
+    {
         TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::OUTPUT) {
+        if (ptype == TensorArgType::OUTPUT)
+        {
             // Runtime-created OUTPUT tensors are not looked up in the TensorMap since
             // they have no dependencies.
             continue;
@@ -96,84 +47,42 @@ compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_m
 
         // Step A: creator retention — all existing tensors extend their creator lifetime.
         PTO2TaskId owner = tensor->owner_task_id;
-        if (owner.is_valid()) {
-            if (!emit(owner)) {
-                return false;
-            }
+        if (owner.is_valid())
+        {
+            if (!emit(owner)) return false;
         }
 
         // Step B: only INPUT/INOUT need modifier dependency lookup.
-        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
-            continue;
-        }
-        if (tensor->manual_dep) {
-            continue;
-        }
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) continue;
+        if (tensor->manual_dep) continue;
 
         bool fatal = false;
         tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
-            if (!emit(entry.producer_task_id)) {
+            if (!emit(entry.producer_task_id))
+            {
                 fatal = true;
                 return false;  // stop iteration
             }
-            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
-                tensor_map.remove_entry(entry);
-            }
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) tensor_map.remove_entry(entry);
             return true;
         });
-        if (fatal) {
-            return false;
-        }
+        if (fatal) return false;
     }
     return true;
 }
 
-/**
- * Register a task's outputs in the tensormap (STEP 4 in submit_task).
- *
- * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the
- * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer.
- *
- * No-op when in_manual_scope.
- */
-inline void
-register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) {
-    if (in_manual_scope) {
-        return;
-    }
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+inline void register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope)
+{
+    if (in_manual_scope) return;
+    for (int32_t i = 0; i < inputs.tensor_count; i++)
+    {
         TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
+        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING)
+        {
             const Tensor *tensor = &inputs.tensors[i].ref();
-            if (!tensor->manual_dep) {
-                tensor_map.insert(*tensor, task_id);
-            }
-        }
-    }
-}
-
-/**
- * Count the tensormap entries register_task_outputs() will insert for this task.
- *
- * Mirrors register_task_outputs()'s selection exactly (INOUT / OUTPUT_EXISTING,
- * excluding manual_dep), so the returned value is the precise number of
- * new_entry() calls that step makes. The orchestrator uses it to reserve pool
- * capacity before inserting. Returns 0 in a manual scope (no registration).
- */
-inline int32_t count_registrable_outputs(const DepInputs &inputs, bool in_manual_scope) {
-    if (in_manual_scope) {
-        return 0;
-    }
-    int32_t needed = 0;
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
-        TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
-            if (!inputs.tensors[i].ref().manual_dep) {
-                needed++;
-            }
+            if (!tensor->manual_dep) tensor_map.insert(*tensor, task_id);
         }
     }
-    return needed;
 }
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index b7cc58794..f01e93fb7 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -9,1142 +9,5 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - Orchestrator Implementation
- *
- * Implements orchestrator state management, scope handling, and task submission.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_orchestrator.h"
-
-#include <assert.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aicpu/dep_gen_collector_aicpu.h"
-#include "common/dep_gen.h"
-#include "common/unified_log.h"
-#include "pto_dep_compute.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-#include "pto_tensormap.h"
-#include "pto_types.h"
-#include "tensor.h"
-
-#if PTO2_PROFILING
-#include "aicpu/scope_stats_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-#endif
-
-// Verify the captured Tensor blob size in DepGenRecord matches the runtime
-// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
-// including runtime/tensor.h, so this check lives at the orch callsite.
-static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)");
-// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime
-// imposes no hard cap on explicit dep count. If a submit exceeds this cap,
-// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is
-// unaffected, only the captured replay record is truncated.
-
-// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in
-// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay)
-// link these no-op stubs so the runtime translation unit is self-contained.
-// Visibility is hidden so the HOST .so doesn't export them into the global
-// dynamic symbol table where they'd shadow the AICPU .so's strong symbols
-// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }
-__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(
-    uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, int, const int32_t[3]
-) {}
-
-// Scope_stats enable gate, queried via the same predicate idiom as
-// is_dep_gen_enabled above. The AICPU collector links the strong definition;
-// host builds fall back to this weak `false`. Gating here still skips the
-// cross-agent occupancy reads that feed the sample when scope_stats is disabled.
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
-
-// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each
-// wrap. Strong definition lives in the AICPU collector; host builds fall back to
-// this weak no-op so the runtime translation unit stays self-contained.
-extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
-
-// =============================================================================
-// Orchestrator Profiling (compile-time toggle)
-// =============================================================================
-#if PTO2_ORCH_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-// Weak fallback for builds that don't link device_time.cpp (e.g. host).
-// The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
-//
-// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from
-// exporting this weak fallback into the global dynamic symbol table via
-// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry
-// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's
-// weak definition first (already in global table) and uses it — returning 0.
-// With hidden visibility, the HOST .so does not export this symbol globally,
-// so the AICPU .so's PLT resolves to its own strong definition from
-// device_time.cpp.
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
-// The strong symbol from the AICPU build wins when profiling is available.
-// Also hidden to prevent HOST .so from polluting the global symbol table.
-__attribute__((weak, visibility("hidden"))) void
-l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
-// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
-static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
-static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
-static uint64_t g_orch_args_cycle = 0;       // param copy
-static uint64_t g_orch_lookup_cycle = 0;     // tensormap lookup + dep building
-static uint64_t g_orch_insert_cycle = 0;     // tensormap insert
-static uint64_t g_orch_fanin_cycle = 0;      // fanin list + early-return check
-static uint64_t g_orch_scope_end_cycle = 0;  // scope_end overhead
-static int64_t g_orch_submit_count = 0;
-static uint32_t g_orch_submit_idx = 0;
-uint64_t g_orch_alloc_wait_cycle = 0;
-uint64_t g_orch_fanin_wait_cycle = 0;
-uint64_t g_orch_alloc_atomic_count = 0;
-uint64_t g_orch_args_atomic_count = 0;
-uint64_t g_orch_scope_end_atomic_count = 0;
-// Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what
-// the flag is for) and feeds the per-sub-step `g_orch_*_cycle` cumulatives
-// printed in the cold-path log.
-//
-// Per-submit ORCH_SUBMIT record is the only swim-lane emit on the orch
-// path — one record per submit_task() / alloc_tensors() call spanning
-// the entire [start, end] window. Per-sub-step phase records were dropped
-// in favour of the cumulatives + per-submit envelope; the dispatcher
-// already inserts one record at the end of each submit path via
-// CYCLE_COUNT_ORCH_SUBMIT_RECORD.
-#define CYCLE_COUNT_START()                                                        \
-    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
-    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                                       \
-    uint64_t _submit_start_ts = _t0
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
-    do {                                                                                          \
-        if (_prof_active) {                                                                       \
-            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
-        }                                                                                         \
-    } while (0)
-#elif PTO2_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
-// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
-static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START()                                                        \
-    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
-    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
-    uint64_t _submit_start_ts = _t0
-#define CYCLE_COUNT_LAP(acc) \
-    do {                     \
-    } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
-    do {                                                                                          \
-        if (_prof_active) {                                                                       \
-            _t1 = get_sys_cnt_aicpu();                                                            \
-            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
-        }                                                                                         \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)
-#endif
-
-static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) {
-    always_assert(orch != nullptr);
-    orch->fatal = true;
-    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) {
-        return PTO2_ERROR_NONE;
-    }
-
-    int32_t expected = PTO2_ERROR_NONE;
-    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
-    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
-        return error_code;
-    }
-    return expected;
-}
-
-static void
-orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
-    int32_t latched_code = orch_mark_fatal(orch, error_code);
-
-#if PTO2_PROFILING
-    // Flush the current scope's peaks BEFORE the FATAL log line, so the
-    // diagnostic context (which pool/window filled up) appears right next to
-    // the failure reason. on_fatal is latched, so duplicate fatals from
-    // different layers don't print multiple stats lines.
-    scope_stats_on_fatal();
-#endif
-
-    if (fmt == nullptr || fmt[0] == '\0') {
-        if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
-            unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
-        } else {
-            unified_log_error(func, "FATAL(code=%d)", error_code);
-        }
-        return;
-    }
-
-    char message[1024];
-    vsnprintf(message, sizeof(message), fmt, args);
-    if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
-        unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message);
-        return;
-    }
-    unified_log_error(func, "FATAL(code=%d): %s", error_code, message);
-}
-
-void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) {
-    auto *orch = this;
-    va_list args;
-    va_start(args, fmt);
-    orch_report_fatal_v(orch, error_code, func, fmt, args);
-    va_end(args);
-}
-
-static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) {
-    uint32_t next = orch->fanin_seen_current_epoch + 1;
-    if (next == 0) {
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            memset(
-                orch->fanin_seen_epoch[r], 0,
-                static_cast<size_t>(orch->sm_header->rings[r].task_window_size) * sizeof(uint32_t)
-            );
-        }
-        next = 1;
-    }
-    orch->fanin_seen_current_epoch = next;
-    return next;
-}
-
-struct PTO2FaninBuilder {
-    PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) :
-        count(0),
-        spill_start(0),
-        orch(orch),
-        seen_epoch(seen_epoch),
-        spill_pool(spill_pool) {}
-    int32_t count{0};
-    int32_t spill_start{0};
-    PTO2OrchestratorState *orch{nullptr};
-    uint32_t seen_epoch{0};
-    PTO2FaninPool &spill_pool;
-    PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP];
-
-    template <typename Fn>
-    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const {
-        return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast<Fn &&>(fn));
-    }
-
-    bool mark_seen(uint8_t prod_ring, int32_t prod_slot) {
-        if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) {
-            return false;
-        }
-        uint32_t *seen = orch->fanin_seen_epoch[prod_ring];
-        uint32_t slot = static_cast<uint32_t>(prod_slot);
-        if (seen[slot] == seen_epoch) {
-            return true;
-        }
-        seen[slot] = seen_epoch;
-        return false;
-    }
-};
-
-static bool append_fanin_or_fail(
-    PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state,
-    PTO2TaskId producer_task_id, PTO2FaninBuilder *fanin_builder, uint8_t ring_id
-) {
-    // Decide-and-claim under the producer's fanout_lock. Two conditions make this
-    // resolved slot a non-dependency, and both must be checked together with the
-    // fanout_count++ so the producer cannot slip from live to consumed/reused in
-    // between:
-    //   (1) Generation mismatch — the producer was CONSUMED, its slot
-    //       reset_for_reuse'd and rebound to a newer task. The cached
-    //       owner_task_id still resolves to this slot, but it no longer holds our
-    //       producer; ++'ing it would corrupt an unrelated task.
-    //   (2) Already CONSUMED in place — finished, output ready, no real edge.
-    // In either case, adding it to the fanin and bumping fanout_count would leave
-    // a stale ++/release pair (wire_task drops the fanout edge but keeps the fanin
-    // slot, so on_task_release still release_producer()'s it) that desyncs the
-    // slot's refcount (rc != fc) and wedges in-order reclaim. Claiming a live
-    // producer under the lock pins it: fanout_count now counts us, so it cannot
-    // reach CONSUMED (rc == fc) until we release it in on_task_release, keeping the
-    // slot's generation stable until then. check_and_handle_consumed flips
-    // COMPLETED->CONSUMED under the same lock, so the check and the ++ are atomic
-    // against the consume. fanout_count is lock-protected per the
-    // PTO2TaskSlotState contract.
-    //
-    // Dedup (mark_seen) happens HERE, gated on a live producer — NOT before the
-    // gone check. mark_seen keys only on (ring, slot); a stale owner that resolves
-    // to a reused slot must not record it as seen, or a later dependency on the
-    // live generation in the same submission would hit mark_seen and be skipped
-    // without claiming it (dropped edge). Marking only when !gone keeps the dedup
-    // keyed to the live producer, and doing it before the ++ still suppresses a
-    // double-count for a producer named twice in one submission.
-    prod_state->lock_fanout();
-    bool gone = prod_state->task == nullptr || prod_state->task->task_id.local() != producer_task_id.local() ||
-                prod_state->task_state.load(std::memory_order_acquire) == PTO2_TASK_CONSUMED;
-    bool claim = !gone && !fanin_builder->mark_seen(prod_ring, prod_slot);
-    if (claim) {
-        // Low bits hold the consumer count; bit31 is the scope ref. The consumer
-        // count must never carry into bit31 (would corrupt the scope-release
-        // flag) — true for any sane fanout (<< 2^31).
-        assert(
-            (prod_state->fanout_count & ~PTO2_FANOUT_SCOPE_BIT) < (PTO2_FANOUT_SCOPE_BIT - 1) &&
-            "fanout consumer count overflow into scope bit"
-        );
-        prod_state->fanout_count++;
-    }
-    prod_state->unlock_fanout();
-#if PTO2_ORCH_PROFILING
-    // lock + unlock always; one fanout_count store when we actually claim.
-    g_orch_args_atomic_count += claim ? 3 : 2;
-#endif
-    // gone (stale/consumed) or an already-seen duplicate live producer: no new
-    // fanin edge either way.
-    if (!claim) {
-        return true;
-    }
-
-    if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) {
-        fanin_builder->inline_slots[fanin_builder->count++] = prod_state;
-        return true;
-    }
-
-    PTO2FaninPool &fanin_pool = fanin_builder->spill_pool;
-    if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) {
-        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
-        return false;
-    }
-    int32_t spill_idx = fanin_pool.top;
-    PTO2FaninSpillEntry *entry = fanin_pool.alloc();
-    if (entry == nullptr) {
-        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
-        return false;
-    }
-    if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) {
-        fanin_builder->spill_start = spill_idx;
-    }
-    entry->slot_state = prod_state;
-    fanin_builder->count++;
-    return true;
-}
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
-
-struct PTO2PreparedTask {
-    PTO2TaskId task_id = PTO2TaskId::invalid();
-    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
-    PTO2TaskDescriptor *task = nullptr;
-    PTO2TaskPayload *payload = nullptr;
-    PTO2TaskSlotState *slot_state = nullptr;
-};
-
-static PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args) {
-    PTO2OutputLayout layout;
-    for (int32_t i = 0; i < args.tensor_count(); i++) {
-        if (args.tag(i) != TensorArgType::OUTPUT) {
-            continue;
-        }
-        layout.offsets[i] = layout.total_output_size;
-        layout.buffer_sizes[i] =
-            PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
-        layout.total_output_size += layout.buffer_sizes[i];
-    }
-    return layout;
-}
-
-static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) {
-    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
-
-    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
-    if (scope_task_count < allocator.window_size() - 1) {
-        return true;
-    }
-
-    int32_t active_count = allocator.active_count();
-
-    LOG_ERROR("========================================");
-    LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id);
-    LOG_ERROR("========================================");
-    LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size());
-    LOG_ERROR("  scope_depth:        %d", orch->scope_stack_top + 1);
-    LOG_ERROR("  ring_id:            %d", ring_id);
-    LOG_ERROR("  scope_task_count:   %d", scope_task_count);
-    LOG_ERROR("  active_tasks:       %d / %d", active_count, allocator.window_size());
-    LOG_ERROR("Root Cause:");
-    LOG_ERROR("  Tasks within a scope hold a fanout_count reference that is only");
-    LOG_ERROR("  released at scope_end. When scope task count >= window_size,");
-    LOG_ERROR("  no slots can be reclaimed -> deadlock.");
-    LOG_ERROR("Solution:");
-    LOG_ERROR("  1. Reduce tasks per scope (use batching/unroll)");
-    LOG_ERROR("  2. Increase task window (current: %d)", allocator.window_size());
-    LOG_ERROR("     Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
-    LOG_ERROR("     Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2>");
-    LOG_ERROR("  3. Split work across multiple scopes");
-    LOG_ERROR("========================================");
-    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
-    return false;
-}
-
-static bool prepare_task(
-    PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask,
-    PTO2PreparedTask *out
-) {
-    uint8_t ring_id = orch->current_ring_id();
-    auto &allocator = orch->rings[ring_id].task_allocator;
-
-    if (!check_scope_can_accept_task(orch, allocator, ring_id)) {
-        return false;
-    }
-
-    out->alloc_result = allocator.alloc(total_output_size);
-    if (out->alloc_result.failed()) {
-        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
-        return false;
-    }
-
-    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
-    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
-    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
-    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
-
-    // Reset the fanout/fanin bookkeeping for this reuse. The allocator only
-    // returns a slot whose previous occupant is CONSUMED and quiescent (alloc
-    // spins until last_task_alive passes it; in-order reclaim + acquire load),
-    // and the slot is not published to any scheduler thread until the
-    // wiring.queue.push at the end of submit_task_common — so this reset is
-    // race-free. Doing it here (not relying on the scheduler's eager
-    // reset-after-CONSUMED, which only covers the contiguously-reclaimed tail)
-    // makes every reused slot self-clean, which lets the per-boot SM init skip
-    // its O(window) per-slot loop. bind_ring is slot-invariant but cheap to
-    // re-assert on the already-dirtied cache line.
-    out->slot_state->bind_ring(ring_id);
-    out->slot_state->reset_for_reuse();
-    out->slot_state->fanin_count = 0;
-
-    out->payload->prefetch(args.tensor_count(), args.scalar_count());
-
-    // Re-bind payload/task pointers each submit. Value is per-slot constant
-    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
-    // here lets RingSchedState::init() skip the O(window_size) bind loop.
-    // Both writes hit the same 64B slot_state cache line we're about to
-    // dirty below, so the extra cost is two stores on an already-hot line.
-    // Must precede the scheduler wiring.queue.push at the end of
-    // submit_task_common — that push is the first read of slot_state->task /
-    // slot_state->payload by another thread.
-    out->slot_state->bind_buffers(out->payload, out->task);
-
-    // prepare_task does NO payload writes: all payload content (tensors/scalars +
-    // early-dispatch spec fields) is initialized in PTO2TaskPayload::init, the
-    // single payload-init point, which runs before the scheduler wiring push.
-
-    // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
-    //   fanout_lock=0, fanout_count=PTO2_FANOUT_SCOPE_BIT, fanout_head=nullptr,
-    //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
-    // Fields immutable after RingSchedState::init():
-    //   ring_id
-    // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
-    // observers); set to PENDING here when orchestrator actually reuses the slot.
-    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
-    int16_t block_num = args.launch_spec.block_num();
-    out->slot_state->total_required_subtasks =
-        static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
-    out->slot_state->logical_block_num = block_num;
-    out->slot_state->active_mask = active_mask;
-    // fanin_count is set by scheduler during wiring
-    scope_tasks_push(orch, out->slot_state);
-
-    return true;
-}
-
-// =============================================================================
-// Scope Management
-// =============================================================================
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
-    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
-        // scope_tasks lives in the per-Worker arena (single backing allocation),
-        // so realloc is not legal. Capacity is the total in-flight slot budget
-        // (sum of the per-ring task windows; see reserve_layout) — hitting it means
-        // every ring is saturated, so no further push could succeed regardless of
-        // buffer growth.
-        orch->report_fatal(
-            PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__,
-            "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity
-        );
-        return;
-    }
-    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
-}
-
-void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
-    auto *orch = this;
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
-    if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
-        return;
-    }
-
-    bool already_in_manual_scope = orch->in_manual_scope();
-    ++orch->scope_stack_top;
-    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
-    if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
-        orch->manual_begin_depth = orch->scope_stack_top;
-    }
-#if PTO2_PROFILING
-    // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the
-    // collector call: when disabled we pay nothing. Sample the current ring's
-    // task/heap start-end and tensormap usage at the scope boundary.
-    if (is_scope_stats_enabled()) {
-        uint8_t ring_id = orch->current_ring_id();
-        auto &alloc = orch->rings[ring_id].task_allocator;
-        int32_t dep_pool_tail = 0;
-        int32_t dep_pool_top = 0;
-        if (orch->scheduler) {
-            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
-        }
-        scope_stats_begin(
-            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
-            dep_pool_top, orch->tensor_map.current_used()
-        );
-    }
-#endif
-}
-
-void PTO2OrchestratorState::end_scope() {
-    auto *orch = this;
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
-
-    // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks
-    // via scheduler->on_scope_end, so the end record reflects the scope's
-    // occupancy at close, not the residual after teardown.
-#if PTO2_PROFILING
-    // Gate via is_scope_stats_enabled() (see begin_scope). One collector call
-    // emits the end-boundary record and tears down bookkeeping.
-    if (is_scope_stats_enabled()) {
-        uint8_t ring_id = orch->current_ring_id();
-        auto &alloc = orch->rings[ring_id].task_allocator;
-        int32_t dep_pool_tail = 0;
-        int32_t dep_pool_top = 0;
-        if (orch->scheduler) {
-            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
-        }
-        scope_stats_end(
-            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
-            dep_pool_top, orch->tensor_map.current_used()
-        );
-    }
-#endif
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se0 = get_sys_cnt_aicpu();
-#endif
-
-    bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
-    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
-    int32_t count = orch->scope_tasks_size - begin;
-    if (ending_manual_scope) {
-        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-    }
-
-    if (orch->scheduler && count > 0) {
-        orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
-    }
-
-    // Rewind the task buffer — these entries are no longer needed
-    orch->scope_tasks_size = begin;
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se1 = get_sys_cnt_aicpu();
-    g_orch_scope_end_cycle += (_se1 - _se0);
-#endif
-}
-
-// =============================================================================
-// Task Submission
-// =============================================================================
-
-// Ensure the tensormap entry pool has room for `needed` inserts before STEP 4
-// registers this task's outputs. The pool is watermark-reclaimed like the
-// task/heap/fanin pools — retired tasks' entries free once last_task_alive
-// advances — so an exhausted pool is back-pressure, not a hard error. Reclaim
-// across all rings (entries from every ring share one pool); if still short,
-// spin until reclaim actually frees entries, with the same 500 ms wall-clock
-// backstop as the task allocator and fanin spill pool. A pool that stays full
-// (no entry freed) is a genuine deadlock: latch PTO2_ERROR_TENSORMAP_OVERFLOW
-// and bail. Returns false on deadlock or on a fatal already latched by another
-// party. Cold path — the fast path returns immediately when the pool has room.
-static bool ensure_tensormap_capacity(PTO2OrchestratorState *orch, int32_t needed) {
-    PTO2TensorMap &tm = orch->tensor_map;
-    if (tm.free_entries() >= needed) {
-        return true;
-    }
-
-    int32_t alive[PTO2_MAX_RING_DEPTH];
-    auto read_alive = [&]() {
-        for (int32_t r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            // Relaxed: a self-correcting poll re-read every reclaim tick, so a stale
-            // watermark only defers reclaim one tick and never over-frees.
-            alive[r] = orch->sm_header->rings[r].fc.last_task_alive.load(std::memory_order_relaxed);
-        }
-    };
-
-    read_alive();
-    int64_t cur_alive_sum = tm.reclaim_retired_all(alive);  // kept for the deadlock diagnostic
-    int32_t prev_free = tm.free_entries();
-    if (prev_free >= needed) {
-        return true;
-    }
-
-    int spin_count = 0;
-    uint64_t block_cycle0 = 0;  // wall-clock anchor for the deadlock backstop
-    bool block_timing = false;  // false until the first no-reclaim-progress tick
-    while (tm.free_entries() < needed) {
-        spin_count++;
-
-        // Reclaim (and the all-ring watermark reads it needs) is the costly part of
-        // this spin and the only path that frees entries; gate it to a periodic tick.
-        // Cold path, but the spin itself is tight.
-        if ((spin_count & 31) == 0) {
-            read_alive();
-            cur_alive_sum = tm.reclaim_retired_all(alive);
-            int32_t cur_free = tm.free_entries();
-            if (cur_free >= needed) {
-                return true;
-            }
-            // Progress is entries actually freed, NOT watermark movement: a ring can
-            // retire zero-output tasks (count_registrable_outputs == 0), advancing
-            // last_task_alive without freeing any entry. Gating the backstop on
-            // free_entries() keeps a wedged pool from dodging the timeout while some
-            // unrelated ring keeps draining.
-            if (cur_free > prev_free) {
-                spin_count = 0;
-                prev_free = cur_free;
-                block_timing = false;
-            }
-        }
-
-        if ((spin_count & 1023) == 0) {
-            // A fatal latched elsewhere breaks this otherwise-unbounded spin.
-            if (orch->sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
-                return false;
-            }
-            // Absolute-time backstop, matching the task allocator: stable across
-            // chips/contention, unlike a fixed spin count. get_sys_cnt_aicpu()
-            // is an MMIO read, so sample it only once per 1024 spins.
-            uint64_t now = get_sys_cnt_aicpu();
-            if (!block_timing) {
-                block_cycle0 = now;
-                block_timing = true;
-            } else if (now - block_cycle0 >= PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES) {
-                LOG_ERROR("========================================");
-                LOG_ERROR("FATAL: TensorMap Entry Pool Deadlock Detected!");
-                LOG_ERROR("========================================");
-                LOG_ERROR("TensorMap entry pool freed no entries for ~500 ms while a task waits.");
-                LOG_ERROR("  - Pool used:   %d / %d", tm.current_used(), tm.pool_capacity());
-                LOG_ERROR("  - Needed:      %d entries", needed);
-                LOG_ERROR("  - last_task_alive (sum across rings): %" PRId64, cur_alive_sum);
-                LOG_ERROR("Diagnosis:");
-                LOG_ERROR("  No retiring task is freeing tensormap entries (last_task_alive may");
-                LOG_ERROR("  still move on rings with no registered outputs). Check TaskRing");
-                LOG_ERROR("  diagnostics for the stalled producer.");
-                LOG_ERROR("Solution:");
-                LOG_ERROR("  Increase PTO2_TENSORMAP_POOL_SIZE (current: %d).", tm.pool_capacity());
-                LOG_ERROR("========================================");
-                orch_mark_fatal(orch, PTO2_ERROR_TENSORMAP_OVERFLOW);
-                return false;
-            }
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
-
-// Shared body for submit_task / submit_dummy_task. Caller has already validated
-// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot
-// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin
-// computation (explicit_deps + auto), output registration, slot init, and pushes
-// to the scheduler wiring queue.
-static TaskOutputTensors submit_task_common(
-    PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id,
-    int32_t aiv0_kernel_id, int32_t aiv1_kernel_id
-) {
-    CYCLE_COUNT_START();
-    TaskOutputTensors result;
-    PTO2OutputLayout layout = calculate_output_layout(args);
-    PTO2PreparedTask prepared;
-    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) {
-        return result;
-    }
-    uint8_t ring_id = prepared.task_id.ring();
-    PTO2SchedulerState *sched = orch->scheduler;
-    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
-    PTO2TaskId task_id = prepared.task_id;
-    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
-    PTO2TaskDescriptor &task = *prepared.task;
-    PTO2TaskPayload &payload = *prepared.payload;
-    result.set_task_id(task_id);
-
-    // dep_gen capture point: snapshot the orch submit_task inputs while the
-    // tensormap is still in its pre-lookup state for this task. Replay reads
-    // these records offline to reconstruct the complete dep graph — the sole
-    // source of truth for fanout now that the swimlane hot path no longer
-    // records it.
-#if PTO2_PROFILING
-    if (is_dep_gen_enabled()) {
-        const void *tensor_ptrs[MAX_TENSOR_ARGS];
-        // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record
-        // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow
-        // each tag here rather than letting the AICPU writer reinterpret a
-        // 4×-wider array as bytes — that path silently lost two of every three
-        // tags on little-endian and synthesized phantom self-edges in replay.
-        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
-        // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at
-        // MAX_TENSOR_ARGS: defensive against any future builder bypass /
-        // shared-memory bit-flip that could otherwise overrun the two
-        // MAX_TENSOR_ARGS-sized stack buffers above.
-        const int tc_raw = args.tensor_count();
-        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
-        for (int i = 0; i < tc; i++) {
-            // OUTPUT slots carry create_info (not yet a Tensor); skip them —
-            // they have no producer to look up and replay's per-tensor loop
-            // also skips OUTPUT.
-            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref();
-            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
-        }
-        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
-        dep_gen_aicpu_record_submit(
-            task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8,
-            static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()),
-            args.launch_spec.block_num(), kernel_ids_capture
-        );
-    }
-#endif
-
-    PTO2FaninBuilder fanin_builder(orch, orch->rings[ring_id].fanin_pool, next_fanin_seen_epoch(orch));
-
-    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
-
-#if PTO2_PROFILING
-    if (layout.total_output_size > 0) {
-        orch->buffers_allocated++;
-        orch->bytes_allocated += layout.total_output_size;
-    }
-#endif
-
-    // === STEP 2: Sync TensorMap validity and optional cleanup ===
-    // Read current last_task_alive from shared memory for this ring
-    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
-
-    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
-
-    CYCLE_COUNT_LAP(g_orch_sync_cycle);
-
-    for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
-        PTO2TaskId dep_task_id = args.explicit_dep(i);
-        if (!dep_task_id.is_valid()) {
-            orch->report_fatal(
-                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids"
-            );
-            return result;
-        }
-        uint8_t dep_ring_id = dep_task_id.ring();
-        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_ring_id];
-        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
-        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (dep_local_task_id < dep_last_task_alive) {
-            continue;
-        }
-        int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id);
-        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot);
-        if (!append_fanin_or_fail(
-                orch, dep_ring_id, dep_slot, producer_slot_state, dep_task_id, &fanin_builder, ring_id
-            )) {
-            return result;
-        }
-    }
-
-    // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) ===
-    DepInputs dep_inputs{
-        args.tensor_count(),       args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()),
-        args.explicit_deps_data(),
-    };
-
-    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
-        uint8_t prod_ring = producer_task_id.ring();
-        PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->rings[prod_ring];
-        int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast<int32_t>(producer_task_id.local()));
-        PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot);
-        return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, producer_task_id, &fanin_builder, ring_id);
-    };
-
-    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) {
-        return result;
-    }
-
-    CYCLE_COUNT_LAP(g_orch_lookup_cycle);
-
-    // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
-    // Reserve pool capacity for this task's inserts before registering. The pool
-    // is shared across rings and reclaimed as last_task_alive advances; an
-    // exhausted pool back-pressures here (and detects a wedged watermark) rather
-    // than tripping new_entry()'s hard assert mid-registration.
-    int32_t tensormap_needed = count_registrable_outputs(dep_inputs, orch->in_manual_scope());
-    if (tensormap_needed > 0 && !ensure_tensormap_capacity(orch, tensormap_needed)) {
-        return result;
-    }
-    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
-
-    CYCLE_COUNT_LAP(g_orch_insert_cycle);
-
-    // === STEP 5: Batch-write to GM (single cache line burst) ===
-    // Deferred from allocation phase to avoid scattered GM writes that get
-    // evicted by TensorMap lookup/insert cache pressure.
-    __builtin_prefetch(&task, 1, 1);
-    task.task_id = task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
-    task.packed_buffer_base = prepared.alloc_result.packed_base;
-    task.packed_buffer_end = prepared.alloc_result.packed_end;
-
-    // fanout_count was already incremented per live producer inside
-    // append_fanin_or_fail, atomically with the consumed/generation check under
-    // the producer's fanout_lock. Doing it there (rather than a separate pass
-    // here) is what prevents a producer from transitioning to CONSUMED between
-    // the dependency decision and the claim.
-    int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP);
-    // Store fanin metadata in payload for scheduler to iterate
-    payload.fanin_actual_count = fanin_builder.count;
-    payload.fanin_spill_start = fanin_builder.spill_start;
-    payload.fanin_spill_pool = &fanin_builder.spill_pool;
-    for (int i = 0; i < inline_count; i++) {
-        payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i];
-    }
-
-    payload.init(args, result, prepared.alloc_result, layout);
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        if (args.scalar_count() > 0) {
-            set_dump_args_task_scalar_dtypes(
-                task_id.raw, static_cast<uint32_t>(args.scalar_count()), args.scalar_dtypes()
-            );
-        }
-        // Selective vs full dump is latched at dump_args_init from DumpDataHeader
-        // (host-decided before any dispatch), so it is race-free regardless of
-        // submission order. Here we only record each marked task's arg mask and
-        // metadata flags, which selective collection consults.
-        if (args.dump_arg_mask() != 0) {
-            set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask());
-        }
-    }
-#endif
-
-    CYCLE_COUNT_LAP(g_orch_args_cycle);
-
-    // === STEP 6: push to wiring queue ===
-    // Deferred wiring: orchestrator only stores dependency metadata and increments
-    // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished)
-    // is handled asynchronously by scheduler thread 0 via the wiring queue.
-    // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness
-    if (!sched->wiring.queue.push(&cur_slot_state)) {
-        // producer_blocked is the wiring deadlock detector's "orchestrator is
-        // stuck in push" observable: set ONLY while we actually spin (queue
-        // full), cleared on exit, so the just-filled-then-scope_end case (push
-        // succeeded, no spin) never trips a false deadlock. Also poll the shared
-        // orch_error_code so a fatal latched by any party (e.g. that detector)
-        // breaks this otherwise-unbounded spin and unwinds orchestration.
-        sched->wiring.producer_blocked.store(1, std::memory_order_release);
-        while (!sched->wiring.queue.push(&cur_slot_state)) {
-            if (orch->sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
-                orch->fatal = true;
-                sched->wiring.producer_blocked.store(0, std::memory_order_release);
-                return result;
-            }
-            SPIN_WAIT_HINT();
-        }
-        sched->wiring.producer_blocked.store(0, std::memory_order_release);
-    }
-
-    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
-    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
-
-#if PTO2_PROFILING
-    orch->tasks_submitted++;
-#if PTO2_ORCH_PROFILING
-    g_orch_submit_count++;
-#endif
-    g_orch_submit_idx++;
-#endif
-    return result;
-}
-
-TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
-    auto *orch = this;
-
-    // Orchestration API should short-circuit after fatal, but keep this entry
-    // robust as a no-op in case a caller reaches it directly.
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
-    if (args.has_error) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Invalid Arg Detected!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
-        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
-        LOG_ERROR("This is a bug in the orchestration code.");
-        LOG_ERROR("========================================");
-        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
-        return TaskOutputTensors{};
-    }
-    always_assert(orch->scheduler != nullptr);
-    // === Validate submit inputs ===
-    ActiveMask active_mask = mixed_kernels.to_active_mask();
-    always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
-
-    int16_t block_num = args.launch_spec.block_num();
-    always_assert(block_num >= 1 && "block_num must be >= 1");
-
-    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
-    // it to the aiv0 slot.  This guarantees the dispatch path can always use
-    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
-    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
-    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
-    MixedKernels normalized = mixed_kernels;
-    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
-    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
-    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
-    if (!has_aic && has_aiv1 && !has_aiv0) {
-        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
-        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
-        active_mask = normalized.to_active_mask();
-    }
-
-    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
-    if (block_num > 1 && args.launch_spec.require_sync_start()) {
-        // Deadlock check: block_num >= total available slots of the required type.
-        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
-        // For AIV:     limit is total_aiv_count.
-        PTO2ResourceShape shape = active_mask.to_shape();
-        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
-        if (limit > 0 && block_num > limit) {
-            report_fatal(
-                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
-                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
-            );
-            return TaskOutputTensors{};
-        }
-        active_mask.set_sync_start();
-    }
-
-    return submit_task_common(
-        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
-    );
-}
-
-// Submit a dependency-only task: full dependency graph participation
-// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
-// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
-// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
-// shape as submit_task; scalars are permitted but never consumed.
-TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const L0TaskArgs &args) {
-    auto *orch = this;
-
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    if (args.has_error) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
-        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
-        LOG_ERROR("========================================");
-        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
-        return TaskOutputTensors{};
-    }
-    always_assert(orch->scheduler != nullptr);
-
-    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
-}
-
-TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const L0TaskArgs &args) {
-    auto *orch = this;
-    // Orchestration API should short-circuit after fatal, but keep this entry
-    // robust as a no-op in case a caller reaches it directly.
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    if (args.tensor_count() <= 0) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
-        return TaskOutputTensors{};
-    }
-    if (args.scalar_count() != 0) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
-        return TaskOutputTensors{};
-    }
-    for (int32_t i = 0; i < args.tensor_count(); i++) {
-        if (args.tag(i) != TensorArgType::OUTPUT) {
-            report_fatal(
-                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
-            );
-            return TaskOutputTensors{};
-        }
-    }
-
-    CYCLE_COUNT_START();
-
-    if (args.has_error) {
-        report_fatal(
-            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
-        return TaskOutputTensors{};
-    }
-
-    PTO2OutputLayout layout = calculate_output_layout(args);
-    PTO2PreparedTask prepared;
-    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
-        return TaskOutputTensors{};
-    }
-
-    PTO2TaskDescriptor &task = *prepared.task;
-    PTO2TaskPayload &payload = *prepared.payload;
-
-    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
-
-#if PTO2_PROFILING
-    if (layout.total_output_size > 0) {
-        orch->buffers_allocated++;
-        orch->bytes_allocated += layout.total_output_size;
-    }
-#endif
-
-    task.task_id = prepared.task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
-    task.packed_buffer_base = prepared.alloc_result.packed_base;
-    task.packed_buffer_end = prepared.alloc_result.packed_end;
-
-    TaskOutputTensors outputs;
-    outputs.set_task_id(prepared.task_id);
-    payload.init(args, outputs, prepared.alloc_result, layout);
-    payload.fanin_actual_count = 0;
-    payload.fanin_spill_start = 0;
-    payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
-    CYCLE_COUNT_LAP(g_orch_args_cycle);
-
-    if (prepared.slot_state != nullptr) {
-        // Hidden alloc tasks complete inline in the orchestrator before any
-        // consumer can exist, so they have no fanout to notify and no worker
-        // subtasks to retire. Running the full on_task_complete path
-        // would only pay unnecessary fanout_lock / traversal overhead here.
-        // The generic slot initialization done in prepare_task() is still
-        // required so scope_end can release the producer-side reference and
-        // drive the slot to CONSUMED, but worker dispatch fields are never
-        // observed for hidden alloc tasks.
-        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-    }
-    orch->inline_completed_tasks++;
-
-    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
-    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
-
-#if PTO2_PROFILING
-    orch->tasks_submitted++;
-#if PTO2_ORCH_PROFILING
-    g_orch_submit_count++;
-#endif
-    g_orch_submit_idx++;
-#endif
-
-    return outputs;
-}
-
-// =============================================================================
-// Flow Control
-// =============================================================================
-
-void PTO2OrchestratorState::mark_done() {
-    auto *orch = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        int32_t total_tasks = orch->rings[r].task_allocator.active_count();
-        if (total_tasks > 0) {
-            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
-        }
-        auto &fanin_pool = orch->rings[r].fanin_pool;
-        if (fanin_pool.top > 1) {
-            LOG_INFO_V0(
-                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
-                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
-            );
-        }
-    }
-    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
-    orch->scope_tasks_size = 0;
-    orch->scope_stack_top = -1;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
-    g_orch_submit_idx = 0;
-#endif
-}
-
-#if PTO2_ORCH_PROFILING
-PTO2OrchProfilingData orchestrator_get_profiling() {
-    PTO2OrchProfilingData d;
-    d.sync_cycle = g_orch_sync_cycle;
-    d.alloc_cycle = g_orch_alloc_cycle;
-    d.args_cycle = g_orch_args_cycle;
-    d.lookup_cycle = g_orch_lookup_cycle;
-    d.insert_cycle = g_orch_insert_cycle;
-    d.fanin_cycle = g_orch_fanin_cycle;
-    d.scope_end_cycle = g_orch_scope_end_cycle;
-    d.submit_count = g_orch_submit_count;
-    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
-    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
-    d.alloc_atomic_count = g_orch_alloc_atomic_count;
-    d.args_atomic_count = g_orch_args_atomic_count;
-    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
-
-    // Reset
-    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
-    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
-    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
-    g_orch_submit_count = 0;
-    g_orch_submit_idx = 0;
-    g_orch_alloc_wait_cycle = 0;
-    g_orch_fanin_wait_cycle = 0;
-    g_orch_alloc_atomic_count = 0;
-    g_orch_args_atomic_count = 0;
-    g_orch_scope_end_atomic_count = 0;
-    return d;
-}
-#endif
+// Polling redesign: pto_orchestrator logic is now inlined in pto_orchestrator.h. This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 5ceb9af85..aa8602443 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -8,22 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Orchestrator Interface
- *
- * The Orchestrator is responsible for:
- * 1. Executing the orchestration function (Turing-complete control flow)
- * 2. Allocating intermediate buffers from the heap
- * 3. Submitting tasks via async InCore function calls
- * 4. Building the dependency graph using TensorMap
- * 5. Managing buffer scopes for lifecycle control
- *
- * The Orchestrator can run on either:
- * - Host CPU (lower latency for complex control, easier debugging)
- * - Device AI_CPU (lower latency for task submission)
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #ifndef PTO_ORCHESTRATOR_H
 #define PTO_ORCHESTRATOR_H
@@ -38,32 +22,72 @@
 #include "pto_tensormap.h"
 #include "pto_types.h"
 
-/**
- * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds
- * arena offsets for every sub-region the orchestrator owns (per-ring fanin
- * pools, scope arrays, plus the nested PTO2TensorMap layout).
- */
-struct PTO2OrchestratorLayout {
-    size_t off_fanin_pool[PTO2_MAX_RING_DEPTH];
-    size_t off_fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+#include <stdarg.h>
+#include <stdio.h>
+#include <limits>
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/dep_gen.h"
+#include "pto_dep_compute.h"
+#include "tensor.h"
+
+struct PTO2OrchestratorState;
+
+// Full definitions of helper aggregate types that the inline methods on
+// PTO2OrchestratorState (and the helpers below) construct by value.
+struct PTO2PreparedTask
+{
+    PTO2TaskId task_id = PTO2TaskId::invalid();  // stays the invalid id until a producer fills it in
+    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
+    PTO2TaskDescriptor *task = nullptr;
+    PTO2TaskPayload *payload = nullptr;
+    PTO2TaskSlotState *slot_state = nullptr;
+};
+
+struct PTO2FaninBuilder
+{
+    int32_t count{0};  // contains() scans slots[0..count)
+    PTO2TaskSlotState *slots[PTO2_MAX_FANIN];  // no initializer: entries at index >= count are indeterminate
+    int32_t local_ids[PTO2_MAX_FANIN];  // presumably parallel to slots (one per fanin entry) — TODO confirm
+    uint8_t ring_ids[PTO2_MAX_FANIN];  // presumably parallel to slots (one per fanin entry) — TODO confirm
+
+    bool contains(PTO2TaskSlotState *prod_state) const  // linear scan by pointer identity
+    {
+        for (int32_t i = 0; i < count; i++)
+            if (slots[i] == prod_state) return true;
+        return false;
+    }
+};
+
+// Forward declarations of helpers defined below — needed because the inline
+// methods on PTO2OrchestratorState reference them.
+inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code);
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *fmt, va_list args);
+inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
+inline bool prepare_task(PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out);
+inline PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args);
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder);
+inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator);
+inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count);
+inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id);
+
+struct PTO2OrchestratorLayout
+{
     size_t off_scope_tasks;
     size_t off_scope_begins;
     PTO2TensorMapLayout tensor_map;
-    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pool_capacity;
     int32_t scope_tasks_cap;
     uint64_t scope_stack_capacity;
 };
 
-// =============================================================================
-// Orchestrator State
-// =============================================================================
+struct PTO2OrchestratorState
+{
+    // L2 swimlane profiling level — read by upstream aicpu_executor when
+    // bridging orchestrator init into the scheduler context. The polling
+    // design doesn't gate behavior on this directly, but the field must
+    // exist for the upstream code path to compile.
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
 
-/**
- * Orchestrator state structure (private to Orchestrator)
- *
- * Contains all state needed for task graph construction and buffer management.
- */
-struct PTO2OrchestratorState {
     // === SHARED MEMORY ACCESS ===
     PTO2SharedMemoryHeader *sm_header;
 
@@ -75,10 +99,6 @@ struct PTO2OrchestratorState {
     // === TENSOR MAP (Private) ===
     PTO2TensorMap tensor_map;  // Producer lookup
 
-    // === SCOPE STACK (Private) ===
-    // Single contiguous buffer of task IDs, partitioned by scope level.
-    // scope_begins[i] is the index into scope_tasks where scope i starts.
-    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
     PTO2TaskSlotState **scope_tasks;  // Flat buffer of taskSlotState (all scopes concatenated)
     int32_t scope_tasks_size;         // Number of task IDs currently in the buffer
     int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
@@ -87,127 +107,538 @@ struct PTO2OrchestratorState {
     uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
     int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
 
-    // === SCHEDULER REFERENCE ===
-    // Note: In simulated mode, orchestrator and scheduler share address space
-    // In real mode, they communicate via shared memory only
     PTO2SchedulerState *scheduler;  // For simulated mode only
 
     // Total core counts set once at executor init; used for submit-time deadlock detection.
     int32_t total_cluster_count{0};  // AIC cores = MIX clusters
     int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
-#if PTO2_PROFILING
-    // L2 swimlane_level copied from get_l2_swimlane_level().
-    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
-#endif
 
     // === GM HEAP (for output buffers) ===
     void *gm_heap_base;     // Base address of GM heap
     uint64_t gm_heap_size;  // Total size of GM heap (all rings)
 
-    // === FATAL ERROR ===
-    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
-    // Cross-thread notification uses shared memory orch_error_code (atomic)
     bool fatal;
 
-    // Hidden alloc tasks complete synchronously inside the orchestrator and
-    // therefore bypass the executor's normal worker-completion counter path.
-    // The executor adds this count into its completed_tasks_ progress counter
-    // after orchestration finishes so shutdown/profiling totals remain closed.
     int64_t inline_completed_tasks{0};
 
     // === STATISTICS ===
-#if PTO2_PROFILING
-    int64_t tasks_submitted;
-    int64_t buffers_allocated;
-    int64_t bytes_allocated;
-#endif
-
-    /**
-     * Get current ring index from scope depth.
-     * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
-     */
-    uint8_t current_ring_id() const {
+
+    uint8_t current_ring_id() const
+    {
         int32_t depth = scope_stack_top;
         if (depth < 0) depth = 0;
         return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1;
     }
 
-    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
-
-    // === Cold-path API (defined in pto_orchestrator.cpp) ===
-
-    // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
-    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
-    // the nested tensor_map layout. Returned layout is consumed by
-    // init_from_layout.
-    static PTO2OrchestratorLayout reserve_layout(
-        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
-    );
-    static PTO2OrchestratorLayout reserve_layout(
-        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-        const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
-    );
-
-    // Phase 3a: write everything *except* arena-internal pointer fields.
-    // sm_dev_base is the SM device address (only stored, never dereferenced);
-    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
-    // on a host arena that holds the prebuilt image.
-    bool init_data_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
-        uint64_t task_window_size
-    );
-    bool init_data_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
-        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-    );
-    bool reset_for_reuse(
-        const PTO2OrchestratorLayout &layout, void *sm_dev_base, void *gm_heap,
-        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-    );
-
-    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
-    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
-    // free_entry_list,task_entry_heads}, scheduler reference).
-    // Idempotent — host runs once on the image, AICPU runs once after attach.
-    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+    bool in_manual_scope() const
+    {
+        return scope_stack_top >= manual_begin_depth;  // manual_begin_depth defaults to PTO2_MAX_SCOPE_DEPTH
+    }
+
+    // === Cold-path API ===
+
+    // Reserves the orchestrator-owned arena regions (scope_tasks, scope_begins, nested tensor-map layout) and returns their offsets.
+    static PTO2OrchestratorLayout reserve_layout(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity)
+    {
+        PTO2OrchestratorLayout layout{};
+        layout.dep_pool_capacity = dep_pool_capacity;  // recorded only; no arena region is reserved for it here
+        // scope_tasks holds every task in the open scope across all rings, so its cap
+        // is the real in-flight budget = sum of the (runtime) per-ring windows.
+        // Accumulate in int64; each window is validated <= INT32_MAX individually but
+        // their sum can exceed it. See upstream #1192.
+        int64_t scope_tasks_cap = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            always_assert(task_window_sizes[r] > 0);
+            scope_tasks_cap += task_window_sizes[r];
+        }
+        always_assert(scope_tasks_cap <= std::numeric_limits<int32_t>::max());
+        layout.scope_tasks_cap = static_cast<int32_t>(scope_tasks_cap);
+        layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+        layout.off_scope_tasks = arena.reserve(static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *));
+        layout.off_scope_begins = arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
+        layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+        return layout;
+    }
+
+    // Value-resets this state, then fills its data fields: SM header address, GM heap bounds, one task allocator
+    // per ring, tensor-map data and an empty scope stack. Returns false only when the tensor map init fails.
+    bool init_data_from_layout(const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, uint64_t task_window_size)
+    {
+        auto *orch = this;
+        *orch = PTO2OrchestratorState{};
+        orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+        orch->gm_heap_base = gm_heap;
+        orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;  // total across rings; each ring gets heap_size bytes
+        orch->fatal = false;
+        // Mirror the SM API's per-ring window-size shape so a future per-ring
+        // SM layout cannot silently disagree with the addresses we compute here.
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+
+        auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
+            auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+            auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+            auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+
+            orch->rings[r].task_allocator.init(task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, heap_size, orch_err);
+        }
+
+        if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) return false;
+
+        orch->scope_tasks_size = 0;
+        orch->scope_tasks_capacity = layout.scope_tasks_cap;
+        orch->scope_stack_top = -1;
+        orch->scope_stack_capacity = layout.scope_stack_capacity;
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+        return true;
+    }
+
+    // Resolves the offsets recorded in `layout` to addresses inside `arena` and stores the scheduler reference.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg)
+    {
+        tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+        scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+        scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+        scheduler = scheduler_arg;
+    }
 
     // Forget pointers; arena owns the backing buffers.
-    void destroy();
-    void set_scheduler(PTO2SchedulerState *scheduler);
-    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
-    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
-    void end_scope();
-    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args);
-    TaskOutputTensors submit_dummy_task(const L0TaskArgs &args);
-    TaskOutputTensors alloc_tensors(const L0TaskArgs &args);
-    void mark_done();
-};
+    void destroy()
+    {
+        tensor_map.destroy();
+        // The scope arrays are only nulled here, never freed.
+        scope_tasks = nullptr;
+        scope_begins = nullptr;
+    }
+    void set_scheduler(PTO2SchedulerState *scheduler)
+    {
+        this->scheduler = scheduler;  // the parameter shadows the member, hence the explicit this->
+    }
+    // Printf-style entry point: forwards error_code, fmt and the variadic arguments to orch_report_fatal_v.
+    void report_fatal(int32_t error_code, [[maybe_unused]] const char *func, const char *fmt, ...)
+    {
+        va_list varargs;
+        va_start(varargs, fmt);
+        orch_report_fatal_v(this, error_code, fmt, varargs);
+        va_end(varargs);
+    }
+    // Opens a new scope level; no-op once fatal. `mode` defaults to AUTO so argument-less call sites keep compiling.
+    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO)
+    {
+        if (fatal) return;
+        always_assert(scope_stack_top < static_cast<int32_t>(scope_stack_capacity - 1) && "Scope stack overflow");
+        if (mode == PTO2ScopeMode::AUTO && in_manual_scope())
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
+            return;
+        }
+        // Sample manual-ness before the push: only the outermost MANUAL scope records manual_begin_depth.
+        bool already_in_manual_scope = in_manual_scope();
+        ++scope_stack_top;
+        scope_begins[scope_stack_top] = scope_tasks_size;
+        if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) manual_begin_depth = scope_stack_top;
+    }
+    // Closes the innermost scope and truncates scope_tasks back to where that scope began; no-op once fatal.
+    void end_scope()
+    {
+        if (fatal) return;
+        // Checked in release builds too: an unbalanced end_scope would otherwise read scope_begins[-1].
+        always_assert(scope_stack_top >= 0 && "Scope stack underflow");
+        bool ending_manual_scope = scope_stack_top == manual_begin_depth;
+        int32_t begin = scope_begins[scope_stack_top--];
+        if (ending_manual_scope) manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+        // Watermark-based reclamation: scope-end has no work to do — consumers
+        // no longer need to notify producers.
+        scope_tasks_size = begin;
+    }
+    // Submits a kernel task: validates args, normalizes the kernel slots, then hands off to submit_task_common.
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args)
+    {
+        auto *orch = this;
+        // Orchestration API should short-circuit after fatal, but keep this entry
+        // robust as a no-op in case a caller reaches it directly.
+        if (orch->fatal) return TaskOutputTensors{};
+
+        // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+        if (args.has_error)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "(unknown)");
+            return TaskOutputTensors{};
+        }
+        always_assert(orch->scheduler != nullptr);
+        // === Validate submit inputs ===
+        ActiveMask active_mask = mixed_kernels.to_active_mask();
+        always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
+
+        int16_t block_num = args.launch_spec.block_num();
+        always_assert(block_num >= 1 && "block_num must be >= 1");
+        // Normalize single-AIV tasks: a lone AIV1 kernel moves to the AIV0 slot; any task with AIC or AIV0 set is left as is.
+        MixedKernels normalized = mixed_kernels;
+        bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+        bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+        bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+        if (!has_aic && has_aiv1 && !has_aiv0)
+        {
+            normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+            normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+            active_mask = normalized.to_active_mask();
+        }
+
+        // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+        if (block_num > 1 && args.launch_spec.require_sync_start())
+        {
+            PTO2ResourceShape shape = active_mask.to_shape();
+            int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+            if (limit > 0 && block_num > limit)
+            {
+                report_fatal(PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__, "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit);
+                return TaskOutputTensors{};
+            }
+            active_mask.set_sync_start();
+        }
+
+        return submit_task_common(orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id);
+    }
+    // Submits a task with an empty ActiveMask and no kernel ids through submit_task_common; no-op once fatal.
+    TaskOutputTensors submit_dummy_task(const L0TaskArgs &args)
+    {
+        auto *orch = this;
+        if (orch->fatal) return TaskOutputTensors{};
+
+        if (args.has_error)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "(unknown)");
+            return TaskOutputTensors{};
+        }
+        always_assert(orch->scheduler != nullptr);
+
+        return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+    }
+    TaskOutputTensors alloc_tensors(const L0TaskArgs &args)  // Allocates output buffers for an output-only args set as a task that is completed inline; every failure path returns an empty TaskOutputTensors.
+    {
+        auto *orch = this;
+        // Orchestration API should short-circuit after fatal, but keep this entry
+        // robust as a no-op in case a caller reaches it directly.
+        if (orch->fatal) return TaskOutputTensors{};
 
-// =============================================================================
-// Orchestrator Profiling Data
-// =============================================================================
-
-#if PTO2_ORCH_PROFILING
-struct PTO2OrchProfilingData {
-    uint64_t sync_cycle;
-    uint64_t alloc_cycle;  // Combined task slot + heap allocation
-    uint64_t args_cycle;
-    uint64_t lookup_cycle;
-    uint64_t insert_cycle;
-    uint64_t fanin_cycle;
-    uint64_t scope_end_cycle;
-    int64_t submit_count;
-    // Wait time tracking for blocking phases
-    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
-    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
-    // Atomic operation counts per phase
-    uint64_t alloc_atomic_count;
-    uint64_t args_atomic_count;
-    uint64_t scope_end_atomic_count;
+        if (args.tensor_count() <= 0)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+            return TaskOutputTensors{};
+        }
+        if (args.scalar_count() != 0)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+            return TaskOutputTensors{};
+        }
+        for (int32_t i = 0; i < args.tensor_count(); i++)  // Every tensor argument must be tagged OUTPUT.
+        {
+            if (args.tag(i) != TensorArgType::OUTPUT)
+            {
+                report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+                return TaskOutputTensors{};
+            }
+        }
+
+        if (args.has_error)  // Checked after the shape checks above; reports args.error_msg when one is set.
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
+            return TaskOutputTensors{};
+        }
+
+        PTO2OutputLayout layout = calculate_output_layout(args);
+        PTO2PreparedTask prepared;
+        if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) return TaskOutputTensors{};
+
+        PTO2TaskDescriptor &task = *prepared.task;
+        PTO2TaskPayload &payload = *prepared.payload;
+
+        task.task_id = prepared.task_id;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;  // No kernel is attached to any of the three subtask slots.
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+        task.packed_buffer_base = prepared.alloc_result.packed_base;
+        task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+        TaskOutputTensors outputs;
+        outputs.set_task_id(prepared.task_id);
+        payload.init(args, outputs, prepared.alloc_result, layout);
+        payload.fanin_count = 0;  // No producers are recorded for this task.
+
+        if (prepared.slot_state != nullptr)
+        {
+            // (m) Inline completion uses completion_flags only.
+            uint8_t ring_id = prepared.task_id.ring();
+            auto &ring = orch->sm_header->rings[ring_id];
+            const int32_t my_id = static_cast<int32_t>(prepared.task_id.local());
+            const int32_t mask = ring.task_window_mask;
+            ring.completion_flags[prepared.alloc_result.slot].store(1, std::memory_order_release);  // Mark this slot as completed.
+            // Inline-completed slots never reach on_mixed_task_complete, so
+            // CAS-advance the per-ring completed_watermark here. Without this,
+            // wait_for_tensor_ready(wait_for_consumers=true) on an alloc'd slot
+            // (e.g. set_tensor_data on its output) hangs because the watermark
+            // gate target (slot's own local_id) is never reached if no real
+            // task with local_id > my_id completes.
+            int32_t w = ring.completed_watermark.load(std::memory_order_acquire);
+            while (w < my_id)
+            {
+                int32_t next = w + 1;
+                if (ring.completion_flags[next & mask].load(std::memory_order_acquire) == 0) break;  // Stop at the first task whose completion flag is still clear.
+                if (ring.completed_watermark.compare_exchange_weak(w, next, std::memory_order_acq_rel, std::memory_order_acquire))  // On failure w is reloaded with the current watermark and the loop re-evaluates.
+                {
+                    w = next;
+                }
+            }
+        }
+        orch->inline_completed_tasks++;  // Counted once per successful alloc_tensors call.
+
+        return outputs;
+    }
+    void mark_done()  // Publishes orchestrator_done in shared memory and resets the orchestrator's scope bookkeeping.
+    {
+        auto *orch = this;
+        orch->sm_header->orchestrator_done.store(1, std::memory_order_release);  // Release store of the done flag.
+        orch->scope_tasks_size = 0;
+        orch->scope_stack_top = -1;  // -1 means no open scope; check_scope_can_accept_task asserts scope_stack_top >= 0.
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    }
 };
 
-PTO2OrchProfilingData orchestrator_get_profiling();
-#endif
+// -----------------------------------------------------------------------------
+// Helpers
+// -----------------------------------------------------------------------------
+
+inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code)  // Sets orch->fatal and latches error_code into shared memory only if no error is stored yet; returns the code held after the attempt.
+{
+    always_assert(orch != nullptr);
+    orch->fatal = true;
+    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) return PTO2_ERROR_NONE;  // Nothing to latch, or no shared-memory header to latch into.
+
+    int32_t expected = PTO2_ERROR_NONE;
+    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
+    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) return error_code;  // First error wins.
+    return expected;  // A failed CAS left the previously latched code in expected.
+}
+
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *, va_list)  // va_list variant of fatal reporting; the format string and arguments are currently ignored.
+{
+    // fmt + args are accepted for future logging-sink wiring but are not yet
+    // routed anywhere — the error_code is latched in shared memory via
+    // orch_mark_fatal and that's what callers actually observe.
+    orch_mark_fatal(orch, error_code);
+}
+
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder)
+{  // Records prod_state as a fanin producer unless it is already present; returns false after marking the orchestrator fatal when the builder is full.
+    if (fanin_builder->contains(prod_state)) return true;  // Duplicate producers are accepted without adding a second entry.
+    if (fanin_builder->count >= PTO2_MAX_FANIN)
+    {
+        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
+        return false;
+    }
+    int32_t idx = fanin_builder->count++;
+    fanin_builder->slots[idx] = prod_state;  // The three arrays are filled at the same index.
+    fanin_builder->local_ids[idx] = prod_local_id;
+    fanin_builder->ring_ids[idx] = prod_state->ring_id;
+    return true;
+}
+
+inline PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args)  // Assigns each OUTPUT tensor an offset and an aligned size, accumulating the total in total_output_size.
+{
+    PTO2OutputLayout layout;  // NOTE(review): relies on PTO2OutputLayout default-initialising total_output_size - confirm.
+    for (int32_t i = 0; i < args.tensor_count(); i++)
+    {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;  // Non-OUTPUT entries keep whatever offsets/buffer_sizes the default layout holds.
+        layout.offsets[i] = layout.total_output_size;
+        layout.buffer_sizes[i] = PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+        layout.total_output_size += layout.buffer_sizes[i];
+    }
+    return layout;
+}
+
+inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator)  // Returns true while the innermost scope holds fewer than window_size() - 1 tasks; otherwise marks the orchestrator fatal.
+{
+    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
+
+    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];  // Tasks recorded since the innermost scope began.
+    if (scope_task_count < allocator.window_size() - 1) return true;
+
+    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
+    return false;
+}
+
+inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count)  // Issues write-intent prefetches (rw=1, locality=3) for the parts of the payload about to be filled.
+{
+    for (int32_t i = 0; i < tensor_count; i++)
+    {
+        __builtin_prefetch(&payload->tensors[i], 1, 3);  // Each tensor entry is prefetched at its start and 64 bytes in.
+        __builtin_prefetch(reinterpret_cast<char *>(&payload->tensors[i]) + 64, 1, 3);
+    }
+    for (int32_t i = 0; i < scalar_count; i += 8) __builtin_prefetch(&payload->scalars[i], 1, 3);  // One prefetch per eight scalars.
+    __builtin_prefetch(payload, 1, 3);  // First 192 bytes of the payload, in 64-byte steps.
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 64, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 128, 1, 3);
+}
+
+inline bool prepare_task(PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out)
+{  // Allocates a task slot plus output space on the current ring, fills *out and initialises the slot state; returns false if any step left the orchestrator fatal.
+    uint8_t ring_id = orch->current_ring_id();
+    auto &allocator = orch->rings[ring_id].task_allocator;
+
+    if (!check_scope_can_accept_task(orch, allocator)) return false;
+
+    out->alloc_result = allocator.alloc(total_output_size);
+    if (out->alloc_result.failed())
+    {
+        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
+        return false;
+    }
+
+    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
+    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
+    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
+    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
+
+    prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
+
+    out->slot_state->bind_buffers(out->payload, out->task);
+
+    // Clear the polling-fast completion byte for the newly-allocated slot.
+    // The previous incarnation's completer set this byte to 1; we publish 0
+    // before this task can be added as a fanin to any consumer (single-
+    // orchestrator-thread guarantee) and before the wiring-queue push
+    // (release-acquire) makes the slot visible to thread 0.
+    orch->sm_header->rings[ring_id].completion_flags[out->alloc_result.slot].store(0, std::memory_order_relaxed);
+    // Seed last_consumer_local_id to self — with no consumers, the slot is
+    // safe to reclaim as soon as the watermark reaches this task itself.
+    out->slot_state->last_consumer_local_id = out->alloc_result.task_id;
+    int16_t block_num = args.launch_spec.block_num();
+    out->slot_state->total_required_subtasks = static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));  // block_num subtasks per set bit in the core mask.
+    out->slot_state->logical_block_num = block_num;
+    out->slot_state->active_mask = active_mask;
+    scope_tasks_push(orch, out->slot_state);
+    // scope_tasks_push returns void and reports overflow through the fatal state; surface it so callers stop before publishing the task.
+    return !orch->fatal;
+}
+
+inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state)  // Appends task_slot_state to orch->scope_tasks; when the buffer is full it reports a fatal error and drops the entry.
+{
+    if (orch->scope_tasks_size >= orch->scope_tasks_capacity)
+    {
+        orch->report_fatal(PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__, "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity);
+        return;  // No return value: the caller is not told directly that the push was skipped.
+    }
+    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
+}
+
+inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id)
+{  // Shared submit path: prepares the slot, gathers fanin producers, fills descriptor and payload, then queues the slot for wiring. Failure paths return result as built so far.
+    TaskOutputTensors result;
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) return result;  // result carries no task id on this path.
+    uint8_t ring_id = prepared.task_id.ring();
+    PTO2SchedulerState *sched = orch->scheduler;
+    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
+    PTO2TaskId task_id = prepared.task_id;
+    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+    result.set_task_id(task_id);
+
+    if (is_dep_gen_enabled())  // Optional capture of this submit through dep_gen_aicpu_record_submit.
+    {
+        const void *tensor_ptrs[MAX_TENSOR_ARGS];
+        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
+        const int tc_raw = args.tensor_count();
+        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;  // Clamp to the capacity of the two local capture arrays.
+        for (int i = 0; i < tc; i++)
+        {
+            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref();  // OUTPUT args are recorded as nullptr.
+            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
+        }
+        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
+        dep_gen_aicpu_record_submit(task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8, static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()), args.launch_spec.block_num(), kernel_ids_capture);
+    }
+
+    PTO2FaninBuilder fanin_builder;
+
+    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
+    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
+
+    for (uint32_t i = 0; i < args.explicit_dep_count(); i++)  // Explicit dependencies are added to the fanin set first.
+    {
+        PTO2TaskId dep_task_id = args.explicit_dep(i);
+        if (!dep_task_id.is_valid())
+        {
+            orch->report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids");
+            return result;
+        }
+        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_task_id.ring()];
+        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
+        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (dep_local_task_id < dep_last_task_alive) continue;  // Skip deps whose local id is below the ring's last_task_alive.
+        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_task_id(dep_local_task_id);
+        if (!append_fanin_or_fail(orch, producer_slot_state, dep_local_task_id, &fanin_builder)) return result;
+    }
+
+    DepInputs dep_inputs{
+        args.tensor_count(), args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()), args.explicit_deps_data(),
+    };
+
+    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {  // Callback handed to compute_task_fanin; appends the given producer to fanin_builder.
+        int32_t prod_local = static_cast<int32_t>(producer_task_id.local());
+        PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(prod_local);
+        return append_fanin_or_fail(orch, prod_state, prod_local, &fanin_builder);
+    };
+
+    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) return result;
+
+    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
+
+    __builtin_prefetch(&task, 1, 1);
+    task.task_id = task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    // Push this consumer's local_id into each producer's last_consumer high-
+    // water-mark, replacing the per-completion fanout_refcount notification.
+    // Reclamation gates on the per-ring completed_watermark reaching this
+    // value. Only update for same-ring fanin: cross-ring consumers live in a
+    // different local_id space, so their id is meaningless to the producer's
+    // ring's watermark. Cross-ring producer slots reclaim on scope_end /
+    // ring wrap instead — acceptable since cross-ring fanin (e.g.
+    // alloc_tensors output) is sparse.
+    const uint8_t self_ring = task_id.ring();
+    const int32_t self_local = static_cast<int32_t>(task_id.local());
+    for (int32_t i = 0; i < fanin_builder.count; i++)
+    {
+        PTO2TaskSlotState *prod = fanin_builder.slots[i];
+        if (prod->ring_id != self_ring) continue;
+        if (self_local > prod->last_consumer_local_id) prod->last_consumer_local_id = self_local;  // Only ever raises the mark.
+    }
+
+    payload.fanin_count = fanin_builder.count;  // Copy the collected producers into the payload.
+    for (int32_t i = 0; i < fanin_builder.count; i++)
+    {
+        payload.fanin_local_ids[i] = fanin_builder.local_ids[i];
+        payload.fanin_ring_ids[i] = fanin_builder.ring_ids[i];
+    }
+
+    payload.init(args, result, prepared.alloc_result, layout);
+
+    while (!sched->wiring.queue.push(&cur_slot_state)) SPIN_WAIT_HINT();  // Spin until the wiring queue accepts the slot.
+
+    return result;
+}
 
 #endif  // PTO_ORCHESTRATOR_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
index c2d7e7660..095c60d38 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
@@ -8,178 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Ring Buffer Implementation
- *
- * Implements DepListPool ring buffer for zero-overhead dependency management.
- * TaskAllocator methods are defined inline in pto_ring_buffer.h.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_ring_buffer.h"
-#include <inttypes.h>
-#include <string.h>
-#include "common/unified_log.h"
-#include "scheduler/pto_scheduler.h"
-
-static void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code) {
-    if (error_code_ptr == nullptr) {
-        return;
-    }
-    int32_t expected = PTO2_ERROR_NONE;
-    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
-}
-
-// =============================================================================
-// Fanin Spill Pool Implementation
-// =============================================================================
-void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
-    if (sm_last_task_alive <= reclaim_task_cursor) return;
-
-    int32_t scan_end = sm_last_task_alive;
-    for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) {
-        PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id);
-        if (payload.fanin_spill_pool != this) {
-            continue;
-        }
-
-        int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
-        int32_t spill_edge_count = payload.fanin_actual_count - inline_count;
-        if (spill_edge_count > 0) {
-            advance_tail(payload.fanin_spill_start + spill_edge_count);
-        }
-    }
-    reclaim_task_cursor = scan_end;
-}
-
-bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
-    if (available() >= needed) return true;
-
-    int spin_count = 0;
-    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-    uint64_t block_cycle0 = 0;  // wall-clock anchor for the deadlock backstop
-    bool block_timing = false;  // false until the first no-reclaim-progress spin
-    while (available() < needed) {
-        reclaim(ring, prev_last_alive);
-        if (available() >= needed) return true;
-
-        spin_count++;
-
-        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (cur_last_alive > prev_last_alive) {
-            spin_count = 0;
-            prev_last_alive = cur_last_alive;
-            block_timing = false;
-        } else if ((spin_count & 1023) == 0) {
-            // A fatal latched elsewhere breaks this otherwise-unbounded spin; the
-            // caller maps the failed ensure_space to orch_mark_fatal. Cold path.
-            if (error_code_ptr != nullptr && error_code_ptr->load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
-                return false;
-            }
-            // Absolute-time backstop, matching the task allocator: stable across
-            // chips/contention, unlike a fixed spin count. get_sys_cnt_aicpu()
-            // is an MMIO read, so sample it only once per 1024 spins.
-            uint64_t now = get_sys_cnt_aicpu();
-            if (!block_timing) {
-                block_cycle0 = now;
-                block_timing = true;
-            } else if (now - block_cycle0 >= PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES) {
-                int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
-                LOG_ERROR("========================================");
-                LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
-                LOG_ERROR("========================================");
-                LOG_ERROR("Fanin spill pool cannot reclaim space after ~500 ms (no progress).");
-                LOG_ERROR(
-                    "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
-                    (capacity > 0) ? (100.0 * used() / capacity) : 0.0
-                );
-                LOG_ERROR("  - Pool top:      %d (linear)", top);
-                LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-                LOG_ERROR("  - High water:    %d", high_water);
-                LOG_ERROR("  - Needed:        %d entries", needed);
-                LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
-                LOG_ERROR("  - current_task:    %d", current);
-                LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
-                LOG_ERROR("Diagnosis:");
-                LOG_ERROR("  last_task_alive is not advancing, so fanin spill pool tail");
-                LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
-                LOG_ERROR("Solution:");
-                LOG_ERROR(
-                    "  Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2
-                );
-                LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-                LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
-                LOG_ERROR("========================================");
-                latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
-                return false;
-            }
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
-
-// =============================================================================
-// Dependency List Pool Implementation
-// =============================================================================
-void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
-    if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
-        int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
-        if (mark > 0) {
-            advance_tail(mark);
-        }
-        last_reclaimed = sm_last_task_alive;
-    }
-}
-
-bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
-    if (available() >= needed) return true;
-
-    int spin_count = 0;
-    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-    while (available() < needed) {
-        reclaim(ring, prev_last_alive);
-        if (available() >= needed) return true;
-
-        spin_count++;
-
-        // Progress detection: reset spin counter if last_task_alive advances
-        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (cur_last_alive > prev_last_alive) {
-            spin_count = 0;
-            prev_last_alive = cur_last_alive;
-        }
 
-        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
-            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
-            LOG_ERROR(
-                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
-                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
-            );
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("  - Needed:        %d entries", needed);
-            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
-            LOG_ERROR("  - current_task:    %d", current);
-            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
-            LOG_ERROR("Diagnosis:");
-            LOG_ERROR("  last_task_alive is not advancing, so dep pool tail");
-            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
-            LOG_ERROR("========================================");
-            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
-            return false;
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
+// Polling redesign: pto_ring_buffer logic is now inlined in pto_ring_buffer.h. This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 97f318d40..3faef6b4c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -8,28 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Ring Buffer Data Structures
- *
- * Implements ring buffer designs for zero-overhead memory management:
- *
- * 1. TaskAllocator - Unified task slot + output buffer allocation
- *    - Combines task ring (slot allocation) and heap ring (output buffer allocation)
- *    - Single spin-wait loop with unified back-pressure and deadlock detection
- *    - O(1) bump allocation for both task slots and heap buffers
- *
- * 2. FaninPool - Fanin spill entry allocation
- *    - Ring buffer for spilled fanin entries
- *    - O(1) append allocation
- *    - Implicit reclamation with task ring
- *
- * 3. DepListPool - Dependency list entry allocation
- *    - Ring buffer for linked list entries
- *    - O(1) prepend operation
- *    - Implicit reclamation with task ring
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #ifndef PTO_RING_BUFFER_H
 #define PTO_RING_BUFFER_H
@@ -40,67 +18,28 @@
 
 #include "pto_runtime2_types.h"
 #include "pto_shared_memory.h"
-#include "aicpu/device_time.h"       // get_sys_cnt_aicpu (deadlock wall-clock backstop)
-#include "common/platform_config.h"  // PLATFORM_PROF_SYS_CNT_FREQ (deadlock wall-clock)
-#include "common/unified_log.h"
-
-#if PTO2_PROFILING
-// Heap-ring wrap reporting — the allocator is the only place each individual
-// wrap is observable, so it notifies the scope_stats collector here. Gated:
-// pays nothing (no include, no call) when profiling is compiled out.
-#include "aicpu/scope_stats_collector_aicpu.h"
-#endif
 
 // Block notification interval (in spin counts)
 #define PTO2_BLOCK_NOTIFY_INTERVAL 10000
-// Heap/task deadlock is detected structurally (head task COMPLETED + all
-// consumers released + scope still open -> only scope_end can free it, which a
-// blocked orchestrator can never reach). This wall-clock value is only a
-// backstop for the residual case the structural test can't prove locally; it is
-// an ABSOLUTE TIME (not a spin count), so it is stable across chips/contention.
-#define PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES (PLATFORM_PROF_SYS_CNT_FREQ / 2)  // 500 ms
+// Alloc spin limit - after this, report deadlock and exit
+#define PTO2_ALLOC_SPIN_LIMIT 100000
 
 // Dep pool spin limit - if exceeded, dep pool capacity too small for workload
 #define PTO2_DEP_POOL_SPIN_LIMIT 100000
 
-// =============================================================================
-// Task Allocator (unified task slot + heap buffer allocation)
-// =============================================================================
+inline void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code)  // Stores error_code into *error_code_ptr only if it still holds PTO2_ERROR_NONE.
+{
+    if (error_code_ptr == nullptr) return;  // A null target is ignored.
+    int32_t expected = PTO2_ERROR_NONE;
+    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);  // Result is ignored: an already-latched error is kept.
+}
 
-/**
- * Unified task slot + heap buffer allocator.
- *
- * Since task and heap are always allocated together and the orchestrator is
- * single-threaded, both pointers (task index, heap top) are tracked locally
- * and published to shared memory via plain store — no fetch_add or CAS needed.
- *
- * The alloc() method checks both resources BEFORE committing to either,
- * eliminating the need for rollback on partial failure.
- */
-class PTO2TaskAllocator {
+class PTO2TaskAllocator
+{
 public:
-    /**
-     * Initialize the allocator with task ring and heap ring resources.
-     *
-     * All pointer arguments are device addresses (live in SM / GM heap); this
-     * function only stores them, no dereferences, so it is safe to invoke
-     * from host code that constructs a prebuilt arena image.
-     *
-     * Production callers leave `initial_local_task_id` at 0: the SM ring
-     * flow-control counters that current_index_ptr / last_alive_ptr point at
-     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
-     * reset), so we keep local_task_id_ aligned with that without reading the
-     * SM. Tests that drive SM state directly may pass a non-zero seed to
-     * exercise corner cases like task IDs near INT32_MAX.
-     */
-    void init(
-        PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
-        PTO2TaskSlotState *slot_states = nullptr, int32_t initial_local_task_id = 0, uint8_t ring_id = 0
-    ) {
+    void init(PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr, std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr, int32_t initial_local_task_id = 0)
+    {
         descriptors_ = descriptors;
-        slot_states_ = slot_states;
-        ring_id_ = ring_id;
         window_size_ = window_size;
         window_mask_ = window_size - 1;
         current_index_ptr_ = current_index_ptr;
@@ -114,127 +53,84 @@ class PTO2TaskAllocator {
         last_alive_seen_ = 0;
     }
 
-    /**
-     * Allocate a task slot and its associated output buffer in one call.
-     *
-     * Both task index and heap top are maintained as local counters and
-     * published to shared memory only on success. Since the orchestrator is
-     * single-threaded, no CAS or fetch_add is needed — just check-then-commit.
-     *
-     * @param output_size  Total packed output size in bytes (0 = no heap needed)
-     * @return Allocation result; check failed() for errors
-     */
-    PTO2TaskAllocResult alloc(int32_t output_size) {
-        uint64_t aligned_size =
-            output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
+    PTO2TaskAllocResult alloc(int32_t output_size)
+    {
+        uint64_t aligned_size = output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
 
         int spin_count = 0;
         int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire);
         int32_t last_alive = prev_last_alive;
         update_heap_tail(last_alive);
         bool blocked_on_heap = false;
-        uint64_t block_cycle0 = 0;  // wall-clock anchor for the deadlock backstop
-        bool block_timing = false;  // false until the first no-reclaim-progress spin
-#if PTO2_ORCH_PROFILING
-        uint64_t wait_start = 0;
-        bool waiting = false;
-#endif
 
-        while (true) {
+        while (true)
+        {
             // Check both resources; commit only if both available
-            if (local_task_id_ - last_alive + 1 < window_size_) {
+            if (local_task_id_ - last_alive + 1 < window_size_)
+            {
                 void *heap_ptr = try_bump_heap(aligned_size);
-                if (heap_ptr) {
+                if (heap_ptr)
+                {
                     int32_t task_id = commit_task();
-#if PTO2_ORCH_PROFILING
-                    record_wait(spin_count, wait_start, waiting);
-#endif
                     return {task_id, task_id & window_mask_, heap_ptr, static_cast<char *>(heap_ptr) + aligned_size};
                 }
                 blocked_on_heap = true;
-            } else {
+            }
+            else
+            {
                 blocked_on_heap = false;
             }
 
             // Spin: wait for scheduler to advance last_task_alive
             spin_count++;
-#if PTO2_ORCH_PROFILING
-            if (!waiting) {
-                wait_start = get_sys_cnt_aicpu();
-                waiting = true;
-            }
-#endif
             last_alive = last_alive_ptr_->load(std::memory_order_acquire);
             update_heap_tail(last_alive);
-            if (last_alive > prev_last_alive) {
-                // Reclaim advanced -> productive backpressure, not a deadlock.
+            if (last_alive > prev_last_alive)
+            {
                 spin_count = 0;
                 prev_last_alive = last_alive;
-                block_timing = false;
-            } else if ((spin_count & 1023) == 0) {
-                // A fatal latched elsewhere (e.g. the scheduler-side wiring
-                // deadlock detector) breaks this otherwise-unbounded spin; the
-                // caller maps the failed alloc to orch_mark_fatal. Polled on the
-                // cold path only -- error_code_ptr_ is orch_error_code.
-                if (error_code_ptr_ != nullptr && error_code_ptr_->load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
-                    return {-1, -1, nullptr, nullptr};
-                }
-                // Reclaim watermark is stuck. Run the deadlock checks only once
-                // per 1024 spins: get_sys_cnt_aicpu() is an MMIO read and
-                // head_blocked_on_scope_end() walks the head slot, neither of
-                // which needs to fire on every hot spin (1024 spins is far below
-                // the wall-clock timeout, so detection latency is unaffected).
-                // (1) Structural, immediate: if the head task is COMPLETED with
-                // every consumer released but its scope still open, only
-                // scope_end can free it and a blocked orchestrator can never
-                // call it -> provable deadlock now.
-                if (head_blocked_on_scope_end(last_alive)) {
-                    report_deadlock(output_size, blocked_on_heap, /*scope_gated=*/true);
-                    return {-1, -1, nullptr, nullptr};
-                }
-                // (2) Wall-clock backstop for the residual case the local head
-                // test can't prove (e.g. a closed sibling whose consumer is
-                // deferred). Absolute time, not a spin count.
-                uint64_t now = get_sys_cnt_aicpu();
-                if (!block_timing) {
-                    block_cycle0 = now;
-                    block_timing = true;
-                } else if (now - block_cycle0 >= PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES) {
-                    report_deadlock(output_size, blocked_on_heap, /*scope_gated=*/false);
+            }
+            else
+            {
+                // Periodic BLOCKED logging was removed from this path; the spin-limit
+                // check below is the only remaining backstop for a stalled reclaim.
+                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT)
+                {
+                    report_deadlock(blocked_on_heap);
                     return {-1, -1, nullptr, nullptr};
                 }
-                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) {
-                    LOG_WARN(
-                        "[TaskAllocator ring=%u] BLOCKED: tasks=%d/%d, heap_used=%" PRIu64 "/%" PRIu64
-                        ", heap_available=%" PRIu64 ", heap_cursor=%" PRIu64 ", on=%s, spins=%d",
-                        static_cast<unsigned>(ring_id_), local_task_id_ - last_alive, window_size_, heap_used_bytes(),
-                        heap_size_, heap_available(), heap_top_, blocked_on_heap ? "heap" : "task", spin_count
-                    );
-                }
             }
             SPIN_WAIT_HINT();
         }
     }
 
-    // =========================================================================
-    // State queries
-    // =========================================================================
-
-    int32_t active_count() const {
+    int32_t active_count() const
+    {
         int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
         return local_task_id_ - last_alive;
     }
 
     // Task ring start/end: tail = oldest live task (last_task_alive), head =
     // next task id to allocate. head - tail == active_count().
-    int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); }
-    int32_t task_head() const { return local_task_id_; }
+    int32_t task_tail() const
+    {
+        return last_alive_ptr_->load(std::memory_order_acquire);
+    }
+    int32_t task_head() const
+    {
+        return local_task_id_;
+    }
 
-    int32_t window_size() const { return window_size_; }
+    int32_t window_size() const
+    {
+        return window_size_;
+    }
 
-    uint64_t heap_available() const {
+    uint64_t heap_available() const
+    {
         uint64_t tail = heap_tail_;
-        if (heap_top_ >= tail) {
+        if (heap_top_ >= tail)
+        {
             uint64_t at_end = heap_size_ - heap_top_;
             uint64_t at_begin = tail;
             return at_end > at_begin ? at_end : at_begin;
@@ -242,12 +138,22 @@ class PTO2TaskAllocator {
         return tail - heap_top_;
     }
 
-    uint64_t heap_top() const { return heap_top_; }
+    uint64_t heap_top() const
+    {
+        return heap_top_;
+    }
     // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is
     // the end (next allocation). heap_top - heap_tail == heap_used_bytes().
-    uint64_t heap_tail() const { return heap_tail_; }
-    uint64_t heap_capacity() const { return heap_size_; }
-    uint64_t heap_used_bytes() const {
+    uint64_t heap_tail() const
+    {
+        return heap_tail_;
+    }
+    uint64_t heap_capacity() const
+    {
+        return heap_size_;
+    }
+    uint64_t heap_used_bytes() const
+    {
         if (heap_size_ == 0) return 0;
         return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
     }
@@ -255,10 +161,6 @@ class PTO2TaskAllocator {
 private:
     // --- Task Ring ---
     PTO2TaskDescriptor *descriptors_ = nullptr;
-    // Parallel to descriptors_, indexed by task_id & window_mask_. Read-only here,
-    // used by the deadlock detector to inspect the head task's state + fanout.
-    PTO2TaskSlotState *slot_states_ = nullptr;
-    uint8_t ring_id_ = 0;
     int32_t window_size_ = 0;
     int32_t window_mask_ = 0;
     std::atomic<int32_t> *current_index_ptr_ = nullptr;
@@ -277,526 +179,73 @@ class PTO2TaskAllocator {
     // --- Shared ---
     std::atomic<int32_t> *error_code_ptr_ = nullptr;
 
-    // =========================================================================
-    // Internal helpers
-    // =========================================================================
-
-    /**
-     * Commit a task slot: bump local counter and publish to shared memory.
-     * Must only be called after space check has passed.
-     */
-    int32_t commit_task() {
+    int32_t commit_task()
+    {
         int32_t task_id = local_task_id_++;
         current_index_ptr_->store(local_task_id_, std::memory_order_release);
         return task_id;
     }
 
-    /**
-     * Derive heap_tail_ from the last consumed task's packed_buffer_end.
-     *
-     * Every task has a valid packed_buffer_end (equal to packed_buffer_base
-     * for zero-size allocations), so the last consumed task always determines
-     * the correct heap_tail — no backward scan needed.
-     */
-    void update_heap_tail(int32_t last_alive) {
+    void update_heap_tail(int32_t last_alive)
+    {
         if (last_alive <= last_alive_seen_) return;
         last_alive_seen_ = last_alive;
 
         PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_];
-        uint64_t old_tail = heap_tail_;
-        heap_tail_ =
-            static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
-#if PTO2_PROFILING
-        // Reclaim pointer moves forward monotonically in ring order; a decrease
-        // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at
-        // most one wrap per call). Report it so scope_stats can unroll.
-        if (is_scope_stats_enabled() && heap_tail_ < old_tail) {
-            scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM);
-        }
-#else
-        (void)old_tail;
-#endif
+        heap_tail_ = static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
     }
 
-    /**
-     * Bump the heap pointer for the given allocation size.
-     * Returns the allocated pointer, or nullptr if insufficient space.
-     * When alloc_size == 0, returns current position without advancing.
-     */
-    void *try_bump_heap(uint64_t alloc_size) {
+    void *try_bump_heap(uint64_t alloc_size)
+    {
         uint64_t top = heap_top_;
-        if (alloc_size == 0) {
-            return static_cast<char *>(heap_base_) + top;
-        }
+        if (alloc_size == 0) return static_cast<char *>(heap_base_) + top;
         uint64_t tail = heap_tail_;
         void *result;
 
-        if (top >= tail) {
+        if (top >= tail)
+        {
             uint64_t space_at_end = heap_size_ - top;
-            if (space_at_end >= alloc_size) {
+            if (space_at_end >= alloc_size)
+            {
                 result = static_cast<char *>(heap_base_) + top;
                 heap_top_ = top + alloc_size;
-            } else if (tail > alloc_size) {
-                LOG_DEBUG(
-                    "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail,
-                    alloc_size
-                );
+            }
+            else if (tail > alloc_size)
+            {
                 result = heap_base_;
                 heap_top_ = alloc_size;
-#if PTO2_PROFILING
-                // Allocation pointer just wrapped past heap_size_; report it so
-                // scope_stats can unroll the wrapping offset into a monotonic value.
-                // The collector attributes the wrap to the current scope's ring.
-                if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC);
-#endif
-            } else {
-                LOG_DEBUG(
-                    "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
-                    ", heap_size=%" PRIu64,
-                    top, tail, alloc_size, heap_size_
-                );
-                return nullptr;
             }
-        } else {
-            if (tail - top > alloc_size) {
-                result = static_cast<char *>(heap_base_) + top;
-                heap_top_ = top + alloc_size;
-            } else {
-                LOG_DEBUG(
-                    "try_bump_heap failed (top<tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
-                    ", free_gap=%" PRIu64,
-                    top, tail, alloc_size, tail - top
-                );
+            else
+            {
                 return nullptr;
             }
         }
-
-        return result;
-    }
-
-#if PTO2_ORCH_PROFILING
-    void record_wait(int spin_count, uint64_t wait_start, bool waiting) {
-        if (waiting) {
-            extern uint64_t g_orch_alloc_wait_cycle;
-            g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start);
+        else if (tail - top > alloc_size)
+        {
+            result = static_cast<char *>(heap_base_) + top;
+            heap_top_ = top + alloc_size;
         }
+        else
         {
-            extern uint64_t g_orch_alloc_atomic_count;
-            g_orch_alloc_atomic_count += spin_count + 1;
+            return nullptr;
         }
-    }
-#endif
 
-    /**
-     * Structural deadlock test on the reclaim head.
-     *
-     * The head (oldest un-CONSUMED task, at last_task_alive) gates all
-     * reclamation. If it is COMPLETED and every consumer reference is released
-     * (low bits of fanout_refcount == consumer count) but the scope reference
-     * (bit31) is still unset, the only release left is its scope_end. Because
-     * this is evaluated while the orchestrator is blocked in alloc(), scope_end
-     * can never be reached -> provable deadlock, no timeout required.
-     *
-     * The COMPLETED guard is mandatory: a zero-consumer task has
-     * refcount == 0 == (count & ~SCOPE_BIT) from birth, before it has run.
-     */
-    bool head_blocked_on_scope_end(int32_t head_task_id) const {
-        if (slot_states_ == nullptr) return false;
-        PTO2TaskSlotState &h = slot_states_[head_task_id & window_mask_];
-        if (h.task_state.load(std::memory_order_acquire) != PTO2_TASK_COMPLETED) return false;
-        uint32_t rc = h.fanout_refcount.load(std::memory_order_acquire);
-        return rc == (h.fanout_count & ~PTO2_FANOUT_SCOPE_BIT);
+        return result;
     }
 
-    /**
-     * Report deadlock with targeted diagnostics. scope_gated == true means the
-     * head-of-line structural test proved it (waiting only on scope_end);
-     * false means the wall-clock backstop fired.
-     */
-    void report_deadlock(int32_t requested_output_size, bool heap_blocked, bool scope_gated) {
-        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
-        int32_t active_tasks = local_task_id_ - last_alive;
-        uint64_t htail = heap_tail_;
-
-        LOG_ERROR("========================================");
-        if (heap_blocked) {
-            LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted! ring=%u", static_cast<unsigned>(ring_id_));
-        } else {
-            LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full! ring=%u", static_cast<unsigned>(ring_id_));
-        }
-        LOG_ERROR("========================================");
-        if (scope_gated) {
-            LOG_ERROR("Head task %d COMPLETED, all consumers released, scope still open ->", last_alive);
-            LOG_ERROR("only scope_end can free it and the orchestrator is blocked here.");
-            LOG_ERROR("Provable head-of-line deadlock.");
-        } else {
-            LOG_ERROR(
-                "No reclaim progress for ~500 ms (%" PRIu64 " cycles wall clock).",
-                (uint64_t)PTO2_ALLOC_DEADLOCK_TIMEOUT_CYCLES
-            );
-        }
-        LOG_ERROR(
-            "  Task ring %u: current=%d, last_alive=%d, active=%d/%d (%.1f%%)", static_cast<unsigned>(ring_id_),
-            local_task_id_, last_alive, active_tasks, window_size_, 100.0 * active_tasks / window_size_
-        );
-        LOG_ERROR(
-            "  Heap ring %u: top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", used=%" PRIu64 ", available=%" PRIu64,
-            static_cast<unsigned>(ring_id_), heap_top_, htail, heap_size_, heap_used_bytes(), heap_available()
-        );
-        if (heap_blocked) {
-            LOG_ERROR("  Requested:  %d bytes", requested_output_size);
-        }
-        // Head-task state dump: what the reclaim watermark is actually waiting on.
-        if (slot_states_ != nullptr) {
-            PTO2TaskSlotState &h = slot_states_[last_alive & window_mask_];
-            uint32_t fc = h.fanout_count;
-            uint32_t rc = h.fanout_refcount.load(std::memory_order_acquire);
-            LOG_ERROR(
-                "  Head task %d: state=%d, consumers=%u/%u, scope_released=%d", last_alive,
-                static_cast<int>(h.task_state.load(std::memory_order_acquire)), rc & ~PTO2_FANOUT_SCOPE_BIT,
-                fc & ~PTO2_FANOUT_SCOPE_BIT, (rc & PTO2_FANOUT_SCOPE_BIT) ? 1 : 0
-            );
-        }
-        LOG_ERROR("Solution:");
-        if (scope_gated) {
-            LOG_ERROR("  The open scope's own allocation exceeds this ring. Either:");
-            LOG_ERROR("  1. Split the scope / reduce per-scope allocation (reclaim sooner), or");
-            LOG_ERROR("  2. Size the ring >= the scope's peak live-set (heap*2 may not be enough).");
-        } else if (heap_blocked) {
-            LOG_ERROR(
-                "  Increase heap (current: %" PRIu64 "); env PTO2_RING_HEAP=<bytes> (e.g. %" PRIu64 ")", heap_size_,
-                heap_size_ * 2
-            );
-            LOG_ERROR(
-                "  If one increase completes, it was under-provisioned; otherwise debug the stuck head consumer."
-            );
-        } else {
-            LOG_ERROR(
-                "  Increase task window (current: %d); env PTO2_RING_TASK_WINDOW=<pow2> (e.g. %d)", window_size_,
-                active_tasks * 2
-            );
-            LOG_ERROR(
-                "  If one increase completes, it was under-provisioned; otherwise debug the stuck head consumer."
-            );
-        }
-        LOG_ERROR("========================================");
-        if (error_code_ptr_) {
+    void report_deadlock(bool heap_blocked)
+    {
+        if (error_code_ptr_)
+        {
             int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
             error_code_ptr_->store(code, std::memory_order_release);
         }
     }
 };
 
-// =============================================================================
-// Fanin Spill Pool
-// =============================================================================
-
-/**
- * Fanin spill pool structure
- *
- * True ring buffer for allocating spilled fanin entries.
- * Entries are reclaimed when their consumer tasks become CONSUMED.
- *
- * Linear counters (top, tail) grow monotonically; the physical index
- * is obtained via modulo: base[linear_index % capacity].
- */
-struct PTO2FaninPool {
-    PTO2FaninSpillEntry *base;       // Pool base address
-    int32_t capacity;                // Total number of entries
-    int32_t top;                     // Linear next-allocation counter (starts from 1)
-    int32_t tail;                    // Linear first-alive counter (entries before this are dead)
-    int32_t high_water;              // Peak concurrent usage (top - tail)
-    int32_t reclaim_task_cursor{0};  // Last task id scanned for reclaim on this pool
-
-    std::atomic<int32_t> *error_code_ptr = nullptr;
-
-    void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
-        base = in_base;
-        capacity = in_capacity;
-        top = 1;
-        tail = 1;
-        high_water = 0;
-        reclaim_task_cursor = 0;
-        base[0].slot_state = nullptr;
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    void reset_for_reuse(std::atomic<int32_t> *in_error_code_ptr) {
-        top = 1;
-        tail = 1;
-        high_water = 0;
-        reclaim_task_cursor = 0;
-        base[0].slot_state = nullptr;
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
-
-    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
-
-    PTO2FaninSpillEntry *alloc() {
-        int32_t used = top - tail;
-        if (used >= capacity) {
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Fanin Spill Pool Overflow!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity);
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
-            LOG_ERROR("========================================");
-            if (error_code_ptr) {
-                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
-            }
-            return nullptr;
-        }
-        int32_t idx = top % capacity;
-        top++;
-        used++;
-        if (used > high_water) high_water = used;
-        return &base[idx];
-    }
-
-    void advance_tail(int32_t new_tail) {
-        if (new_tail > tail) {
-            tail = new_tail;
-        }
-    }
-
-    int32_t used() const { return top - tail; }
-
-    int32_t available() const { return capacity - used(); }
-};
-
-template <typename Fn>
-using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
-
-template <typename Fn>
-using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
-
-template <typename InlineSlots, typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_storage(
-    InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn
-) {
-    using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
-    static_assert(
-        std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>,
-        "fanin callback must return void or bool"
-    );
-
-    if constexpr (std::is_void_v<FaninCallbackResult>) {
-        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
-        for (int32_t i = 0; i < inline_count; i++) {
-            fn(inline_slot_states[i]);
-        }
-
-        int32_t spill_count = fanin_count - inline_count;
-        if (spill_count <= 0) {
-            return;
-        }
-
-        int32_t start_idx = spill_start % spill_pool.capacity;
-        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
-        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
-        for (int32_t i = 0; i < first_count; i++) {
-            fn(first[i].slot_state);
-        }
-
-        int32_t second_count = spill_count - first_count;
-        for (int32_t i = 0; i < second_count; i++) {
-            fn(spill_pool.base[i].slot_state);
-        }
-        return;
-    } else {
-        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
-        for (int32_t i = 0; i < inline_count; i++) {
-            if (!fn(inline_slot_states[i])) {
-                return false;
-            }
-        }
-
-        int32_t spill_count = fanin_count - inline_count;
-        if (spill_count <= 0) {
-            return true;
-        }
-
-        int32_t start_idx = spill_start % spill_pool.capacity;
-        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
-        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
-        for (int32_t i = 0; i < first_count; i++) {
-            if (!fn(first[i].slot_state)) {
-                return false;
-            }
-        }
-
-        int32_t second_count = spill_count - first_count;
-        for (int32_t i = 0; i < second_count; i++) {
-            if (!fn(spill_pool.base[i].slot_state)) {
-                return false;
-            }
-        }
-        return true;
-    }
-}
-
-template <typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) {
-    return for_each_fanin_storage(
-        payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start,
-        *payload.fanin_spill_pool, static_cast<Fn &&>(fn)
-    );
-}
-
-// =============================================================================
-// Dependency List Pool
-// =============================================================================
-
-/**
- * Dependency list pool structure
- *
- * True ring buffer for allocating linked list entries.
- * Entries are reclaimed when their producer tasks become CONSUMED,
- * as tracked by the orchestrator via dep_pool_mark per task.
- *
- * Linear counters (top, tail) grow monotonically; the physical index
- * is obtained via modulo: base[linear_index % capacity].
- */
-struct PTO2DepListPool {
-    PTO2DepListEntry *base;     // Pool base address
-    int32_t capacity;           // Total number of entries
-    int32_t top;                // Linear next-allocation counter (starts from 1)
-    int32_t tail;               // Linear first-alive counter (entries before this are dead)
-    int32_t high_water;         // Peak concurrent usage (top - tail)
-    int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation
-
-    // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
-    std::atomic<int32_t> *error_code_ptr = nullptr;
-
-    /**
-     *
-     * Initialize dependency list pool
-     * @param base      Pool base address from shared memory
-     * @param capacity  Total number of entries
-     */
-    void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
-        base = in_base;
-        capacity = in_capacity;
-        top = 1;   // Start from 1, 0 means NULL/empty
-        tail = 1;  // Match initial top (no reclaimable entries yet)
-        high_water = 0;
-        last_reclaimed = 0;
-
-        // Initialize entry 0 as NULL marker
-        base[0].slot_state = nullptr;
-        base[0].next = nullptr;
-
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    void reset_for_reuse(std::atomic<int32_t> *in_error_code_ptr) {
-        top = 1;
-        tail = 1;
-        high_water = 0;
-        last_reclaimed = 0;
-        base[0].slot_state = nullptr;
-        base[0].next = nullptr;
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    /**
-     * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
-     * Safe to call multiple times — only advances tail forward.
-     *
-     * @param ring             Ring header (for reading slot dep_pool_mark)
-     * @param sm_last_task_alive Current last_task_alive from shared memory
-     */
-    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
-
-    /**
-     * Ensure dep pool for a specific ring has at least `needed` entries available.
-     * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
-     */
-    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
-
-    /**
-     * Allocate a single entry from the pool (single-thread per pool instance)
-     *
-     * @return Pointer to allocated entry, or nullptr on fatal error
-     */
-    PTO2DepListEntry *alloc() {
-        int32_t used = top - tail;
-        if (used >= capacity) {
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Dependency Pool Overflow!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity);
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
-            LOG_ERROR("========================================");
-            if (error_code_ptr) {
-                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
-            }
-            return nullptr;
-        }
-        int32_t idx = top % capacity;
-        top++;
-        used++;
-        if (used > high_water) high_water = used;
-        return &base[idx];
-    }
-
-    /**
-     * Advance the tail pointer, reclaiming dead entries.
-     * Called by the orchestrator based on last_task_alive advancement.
-     */
-    void advance_tail(int32_t new_tail) {
-        if (new_tail > tail) {
-            tail = new_tail;
-        }
-    }
-
-    /**
-     * Prepend a task ID to a dependency list
-     *
-     * O(1) operation: allocates new entry and links to current head.
-     *
-     * @param current_head  Current list head offset (0 = empty list)
-     * @param task_slot     Task slot to prepend
-     * @return New head offset
-     */
-    PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) {
-        PTO2DepListEntry *new_entry = alloc();
-        if (!new_entry) return nullptr;
-        new_entry->slot_state = slot_state;
-        new_entry->next = cur;
-        return new_entry;
-    }
-
-    int32_t used() const { return top - tail; }
-
-    int32_t available() const { return capacity - used(); }
-};
-
-// =============================================================================
-// Ring Set (per-depth aggregate)
-// =============================================================================
-
-/**
- * Groups a TaskAllocator and DepPool into one per-depth unit.
- * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth.
- */
-struct PTO2RingSet {
+struct PTO2RingSet
+{
     PTO2TaskAllocator task_allocator;
-    PTO2FaninPool fanin_pool;
 };
 
 #endif  // PTO_RING_BUFFER_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index 83a44c957..16d6ffa9a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -9,305 +9,5 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - Main Implementation
- *
- * Implements the unified runtime API that combines orchestrator and scheduler.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_runtime2.h"
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <algorithm>
-
-#include "aicpu/device_time.h"
-#include "common/platform_config.h"  // PLATFORM_PROF_SYS_CNT_FREQ (data-wait deadline)
-#include "common/unified_log.h"
-#if PTO2_PROFILING
-#include "aicpu/scope_stats_collector_aicpu.h"
-#endif
-
-// Weak fallback for HOST .so builds (never called, but satisfies linker).
-// The AICPU build links the strong symbol from platform/.../device_time.cpp.
-// Hidden visibility prevents HOST .so from polluting global symbol table.
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-
-// Derived here, not in pto_runtime2_types.h: that header is included by orchestrations
-// that define PLATFORM_PROF_SYS_CNT_FREQ locally, so pulling the platform header into
-// it caused a redefinition conflict (#1189). Scaling MS by the counter frequency (like
-// SCHEDULER_TIMEOUT_CYCLES) keeps the data-wait wall-clock identical across arches.
-static constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES =
-    (PTO2_TENSOR_DATA_TIMEOUT_MS * PLATFORM_PROF_SYS_CNT_FREQ) / 1000;
-
-// =============================================================================
-// Orchestration Ops Table (function-pointer dispatch for orchestration .so)
-// =============================================================================
-
-static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
-    return rt->orchestrator.submit_task(mixed_kernels, args);
-}
-
-static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
-    return rt->orchestrator.alloc_tensors(args);
-}
-
-static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args) {
-    return rt->orchestrator.submit_dummy_task(args);
-}
-
-void rt_scope_begin(PTO2Runtime *rt) {
-    PTO2ScopeMode mode = rt->pending_scope_mode;
-    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
-    rt->orchestrator.begin_scope(mode);
-}
-
-void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); }
-
-void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); }
-
-static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
-
-void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    if (fmt == nullptr || fmt[0] == '\0') {
-        rt->orchestrator.report_fatal(error_code, func, nullptr);
-    } else {
-        char message[1024];
-        vsnprintf(message, sizeof(message), fmt, args);
-        rt->orchestrator.report_fatal(error_code, func, "%s", message);
-    }
-    va_end(args);
-}
-
-// Wait for all producers of this tensor to be safe for data access.
-// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers).
-// For reads: wait until each producer COMPLETED (done writing).
-// For writes: also wait until all consumers done reading
-//   (consumer low bits of fanout_refcount >= consumer count, excluding the
-//    bit31 scope reference).
-// Uses cycle-based timeout (checked every 1024 spins).
-// Returns false on timeout (sets orch.fatal).
-MAYBE_UNINITIALIZED_BEGIN
-static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) {
-    PTO2TaskId owner = tensor.owner_task_id;
-    PTO2OrchestratorState &orch = rt->orchestrator;
-
-    // Segmented wait: collect up to kSegmentCap producer slots, then flush by
-    // spinning on each. When the segment fills, we wait for the accumulated
-    // batch before continuing to gather more. Dedup is per-segment only; a
-    // producer that appears in two segments is waited on twice, which is
-    // idempotent (task_state is monotonic) and only adds one atomic load on
-    // the second encounter.
-    constexpr int kSegmentCap = 64;
-    const PTO2TaskSlotState *seg[kSegmentCap];
-    int seg_count = 0;
-    bool signaled = false;
-    bool failed = false;
-
-    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
-        uint8_t ring_id = slot.ring_id;
-        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
-        uint64_t t0 = get_sys_cnt_aicpu();
-        int32_t spin_count = 0;
-        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) {
-            SPIN_WAIT_HINT();
-            if ((++spin_count & 1023) == 0) {
-                // A fatal latched elsewhere (e.g. the scheduler-side wiring
-                // deadlock detector) breaks this wait; cold path only.
-                if (orch.sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
-                    failed = true;
-                    return;
-                }
-                if (get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
-                    orch.report_fatal(
-                        PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
-                        "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed",
-                        (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
-                    );
-                    failed = true;
-                    return;
-                }
-            }
-        }
-    };
-
-    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
-        uint8_t ring_id = slot.ring_id;
-        int32_t local_id = slot.task->task_id.local();
-        uint64_t t0 = get_sys_cnt_aicpu();
-        int32_t spin_count = 0;
-        while ((slot.fanout_refcount.load(std::memory_order_acquire) & ~PTO2_FANOUT_SCOPE_BIT) <
-               (slot.fanout_count & ~PTO2_FANOUT_SCOPE_BIT)) {
-            SPIN_WAIT_HINT();
-            if ((++spin_count & 1023) == 0) {
-                // A fatal latched elsewhere (e.g. the scheduler-side wiring
-                // deadlock detector) breaks this wait; cold path only.
-                if (orch.sm_header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE) {
-                    failed = true;
-                    return;
-                }
-                if (get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
-                    orch.report_fatal(
-                        PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
-                        "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done",
-                        (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
-                    );
-                    failed = true;
-                    return;
-                }
-            }
-        }
-    };
-
-    auto flush_segment = [&]() {
-        for (int i = 0; i < seg_count; i++) {
-            wait_one_producer(*seg[i]);
-            if (failed) return;
-            if (!wait_for_consumers) continue;
-            wait_one_consumers(*seg[i]);
-            if (failed) return;
-        }
-        seg_count = 0;
-    };
-
-    auto try_push = [&](const PTO2TaskSlotState &s) {
-        for (int j = 0; j < seg_count; j++) {
-            if (seg[j] == &s) return;  // per-segment dedup
-        }
-        if (seg_count == kSegmentCap) {
-            flush_segment();
-            if (failed) return;
-        }
-        seg[seg_count++] = &s;
-        if (!signaled) {
-            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
-            signaled = true;
-        }
-    };
-
-    auto do_wait = [&]() {
-        // Step A: creator retention — read owner directly from tensor metadata
-        if (owner.is_valid()) {
-            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
-            try_push(s);
-            if (failed) return;
-        }
-
-        // Step B: modifier writer lookup (OverlapMap), direct callback
-        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
-            PTO2TaskId pid = entry.producer_task_id;
-            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
-            try_push(s);
-            return !failed;
-        });
-        if (failed) return;
-        flush_segment();
-    };
-
-    do_wait();
-    if (signaled) {
-        orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
-    }
-    return !failed;
-}
-MAYBE_UNINITIALIZED_END
-
-uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
-    if (tensor.buffer.addr == 0) {
-        unified_log_error(
-            __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). "
-                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
-        );
-        return 0;
-    }
-
-    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) {
-        return 0;
-    }
-
-    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
-    uint64_t elem_size = get_element_size(tensor.dtype);
-    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
-    uint64_t result = 0;
-    memcpy(&result, ptr, elem_size);
-    return result;
-}
-
-void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) {
-    if (tensor.buffer.addr == 0) {
-        unified_log_error(
-            __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). "
-                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
-        );
-        return;
-    }
-
-    // Wait for producer + all consumers before writing (WAW + WAR safety)
-    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) {
-        return;
-    }
-
-    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
-    uint64_t elem_size = get_element_size(tensor.dtype);
-    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
-    memcpy(ptr, &value, elem_size);
-}
-
-// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
-// [ScopeStats] collector. The slot is always present in the struct to keep
-// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
-// .so's null-check skips it.
-#if PTO2_PROFILING
-static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
-#endif
-
-static const PTO2RuntimeOps s_runtime_ops = {
-    .submit_task = submit_task_impl,
-    .scope_begin = rt_scope_begin,
-    .scope_end = rt_scope_end,
-    .orchestration_done = rt_orchestration_done,
-    .is_fatal = is_fatal_impl,
-    .report_fatal = rt_report_fatal,
-    .log_error = unified_log_error,
-    .log_warn = unified_log_warn,
-    .log_debug = unified_log_debug,
-    .log_info_v = unified_log_info_v,
-    .get_tensor_data = get_tensor_data,
-    .set_tensor_data = set_tensor_data,
-    .alloc_tensors = alloc_tensors_impl,
-    .submit_dummy_task = submit_dummy_task_impl,
-#if PTO2_PROFILING
-    .scope_set_site = scope_set_site_impl,
-#else
-    .scope_set_site = nullptr,
-#endif
-};
-
-// =============================================================================
-// Runtime Lifecycle (AICPU-only fixup)
-// =============================================================================
-//
-// Layout / init_data / wire / destroy live in
-// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
-// prebuilt arena image. The pieces below — wiring the ops table and the
-// SPMD core counts — depend on the device-side s_runtime_ops global and the
-// AICPU SchedulerContext respectively, so they remain in the AICPU build.
-
-void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
-    rt->ops = &s_runtime_ops;
-    rt->orchestrator.total_cluster_count = aic_count;
-    rt->orchestrator.total_aiv_count = aiv_count;
-}
-
-void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
-    if (rt) {
-        rt->mode = mode;
-    }
-}
+// Polling redesign: pto_runtime2 logic is now inlined in pto_runtime2.h. This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 64f4c6319..46b77398d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -8,29 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Main Interface
- *
- * This is the main header for the PTO Runtime2 system.
- * It provides a unified API for task graph construction and execution.
- *
- * Key Features:
- * - Ring buffer based memory management (zero allocation overhead)
- * - Lazy invalidation TensorMap for dependency discovery
- * - Scope-based buffer lifecycle management
- * - Per-task spinlocks for concurrent fanout updates
- * - Orchestrator-Scheduler decoupling via shared memory
- *
- * Usage:
- *   1. Create runtime: PTO2Runtime create methods
- *   2. Build task graph in orchestration function:
- *      - begin_scope() / end_scope()
- *      - submit_task()
- *   3. Mark orchestration complete: mark_done()
- *   4. Destroy runtime
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #pragma once
 
@@ -44,29 +21,33 @@
 #include "pto_orchestrator.h"
 #include "aicore_completion_mailbox.h"
 
-// =============================================================================
-// Runtime Context
-// =============================================================================
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include "aicpu/device_time.h"
+#include "common/platform_config.h"  // PLATFORM_PROF_SYS_CNT_FREQ (data-wait deadline)
+#include "common/unified_log.h"
 
-/**
- * Runtime execution mode
- */
-enum PTO2RuntimeMode {
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu();  // Weak, hidden declaration only. NOTE(review): the weak definition returning 0 was dropped from pto_runtime2.cpp; confirm every build that calls this links a definition.
+
+// Tensor-data wait timeout in system-counter cycles:
+// PTO2_TENSOR_DATA_TIMEOUT_MS * PLATFORM_PROF_SYS_CNT_FREQ / 1000. Derived here, not in
+// pto_runtime2_types.h: orchestrations that include that header define PLATFORM_PROF_SYS_CNT_FREQ
+// locally, which caused a redefinition conflict. Same derivation as upstream pto_runtime2.cpp.
+static constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES =
+    (PTO2_TENSOR_DATA_TIMEOUT_MS * PLATFORM_PROF_SYS_CNT_FREQ) / 1000;
+
+enum PTO2RuntimeMode
+{
     PTO2_MODE_EXECUTE = 0,    // Execute tasks on workers
     PTO2_MODE_SIMULATE = 1,   // Simulate task execution with cycle counting
     PTO2_MODE_GRAPH_ONLY = 2  // Build graph only, no execution
 };
 
-/**
- * Function-pointer ops table for runtime operations.
- *
- * The orchestration .so calls runtime functions through this table
- * (via pto_orchestration_api.h inline wrappers), so it has zero link
- * dependencies on runtime .cpp files.
- */
 typedef struct PTO2Runtime PTO2Runtime;  // forward declare for ops signatures
 
-struct PTO2RuntimeOps {
+struct PTO2RuntimeOps
+{
     TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args);
     void (*scope_begin)(PTO2Runtime *rt);
     void (*scope_end)(PTO2Runtime *rt);
@@ -75,23 +56,15 @@ struct PTO2RuntimeOps {
     void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
 
     // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
     void (*log_info_v)(const char *func, int v, const char *fmt, ...);
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
     uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
-    void (*set_tensor_data)(
-        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
-    );
+    void (*set_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args);
-    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
-    // collector. Always present in the struct to keep ops-table layout stable
-    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
     void (*scope_set_site)(const char *file, int line);
 };
 
@@ -100,7 +73,8 @@ struct PTO2RuntimeOps {
  * layout (the input to runtime_reserve_layout). Stable per (callable_id, ring
  * config); re-read at AICPU boot to reconstruct ring/heap/dep-pool capacities.
  */
-struct ArenaSizingKey {
+struct ArenaSizingKey
+{
     uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
     uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
     int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
@@ -114,7 +88,8 @@ struct ArenaSizingKey {
  * runtime_wire_arena_pointers (the AICPU re-wires arena-internal pointers
  * from these after rtMemcpy).
  */
-struct ArenaOffsets {
+struct ArenaOffsets
+{
     size_t off_sm_handle{0};
     PTO2OrchestratorLayout orch;
     PTO2SchedulerLayout sched;
@@ -129,22 +104,18 @@ struct ArenaOffsets {
 /**
  * Layout descriptor for the prebuilt runtime arena. Two named halves with
  * distinct lifetimes/semantics: `sizing` is the layout-defining input
- * (capacities + scheduler timeout), `offsets` is the computed sub-region
- * offsets + arena size. Produced once on the host by runtime_reserve_layout();
- * consumed by runtime_init_data_from_layout and runtime_wire_arena_pointers.
+ * (capacities), `offsets` is the computed sub-region offsets + arena size.
+ * Produced once on the host by runtime_reserve_layout(); consumed by
+ * runtime_init_data_from_layout and runtime_wire_arena_pointers.
  */
-struct PTO2RuntimeArenaLayout {
+struct PTO2RuntimeArenaLayout
+{
     ArenaSizingKey sizing;
     ArenaOffsets offsets;
 };
 
-/**
- * PTO Runtime2 context
- *
- * Contains all state for orchestration and scheduling.
- * In simulated mode, runs in single process with shared address space.
- */
-struct PTO2Runtime {
+struct PTO2Runtime
+{
     // Ops table (first field — used by orchestration .so via function pointers)
     const PTO2RuntimeOps *ops;
     PTO2ScopeMode pending_scope_mode;
@@ -166,145 +137,352 @@ struct PTO2Runtime {
     // Statistics
     int64_t total_cycles;
 
-    // Prebuilt-arena fast path metadata. Carries every offset
-    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
-    // all arena-internal pointer fields without re-running init_data. The
-    // device base of the runtime arena travels separately on the host-side
-    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
-    // *before* dereferencing this image. Populated on host by
-    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
-    // aicpu_executor.cpp.
     PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
-// =============================================================================
-// Runtime Lifecycle API
-// =============================================================================
-
-/**
- * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
- * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
- * arena. Pure arithmetic; does not touch device memory and may run on host.
- * Returns the layout descriptor; caller commits/attaches the arena before
- * Phase 2/3.
- */
-PTO2RuntimeArenaLayout runtime_reserve_layout(
-    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
-);
-PTO2RuntimeArenaLayout runtime_reserve_layout(
+// Canonical per-ring form (matches upstream a5 signature). Only dep_pool_capacities[0] is forwarded to the orchestrator/scheduler reserve_layout calls.
+inline PTO2RuntimeArenaLayout runtime_reserve_layout(
     DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
     const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
-);
-
-/**
- * Phase 2 — write the data half of the runtime arena: standalone fields,
- * memset'd arena regions, sub-structure initializers, and SM-side device
- * pointers. The arena must already be committed (or attached); writes go
- * into arena.base() + sub-region offsets.
- *
- * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
- * them (never dereference). Safe to run on a host arena that owns a host
- * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
- *
- * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
- * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
- * AICore-side count fields are left untouched and must be filled by the
- * AICPU at boot.
- */
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
-    void *gm_heap_dev_base, uint64_t heap_size
-);
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
-    void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-);
-
-/**
- * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
- * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
- * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
- * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
- * both host (writing host-mirror addresses) and AICPU (writing device
- * addresses) sides.
- */
-void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
-bool runtime_reset_for_reuse(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
-
-/**
- * AICPU-only Phase 4 — fill in the few fields the host could not know at
- * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
- * file-local global, host cannot resolve its device address) and the
- * orchestrator's core counts (depend on the executor's scheduler context).
- * Call once per boot after runtime_wire_arena_pointers.
- */
-void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
-
-/**
- * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
- * pooled across runs by DeviceRunner, so we never call arena.release()
- * here — the destructor only forgets sub-structure pointers (idempotent
- * cleanup).
- */
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
-
-/**
- * Set execution mode
- */
-void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode);
-
-// =============================================================================
-// Orchestration API (called by orchestration function)
-// =============================================================================
-
-/**
- * Begin a new scope
- *
- * All tasks submitted within this scope will have their lifetime
- * bounded by the scope. When scope_end() is called, the scope
- * releases its reference to all enclosed tasks.
- */
-void rt_scope_begin(PTO2Runtime *rt);
-
-/**
- * End current scope
- *
- * Releases scope reference for all tasks submitted since scope_begin().
- * Tasks whose refcount reaches zero will have their buffers released.
- */
-void rt_scope_end(PTO2Runtime *rt);
-
-/**
- * Mark orchestration as complete
- *
- * Signals that no more tasks will be submitted.
- */
-void rt_orchestration_done(PTO2Runtime *rt);
-
-/**
- * Enter fatal state explicitly from orchestration.
- */
-void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
-
-/**
- * Cross-layer data access: read a tensor value by waiting for its producer.
- */
-uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+)
+{
+    PTO2RuntimeArenaLayout layout{};
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.sizing.task_window_sizes[r] = task_window_sizes[r];
+        layout.sizing.heap_sizes[r] = heap_sizes[r];
+        layout.sizing.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes_i32[r] = static_cast<int32_t>(task_window_sizes[r]);
+
+    layout.offsets.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    layout.offsets.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities[0]);
+    layout.offsets.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities[0]);
+    layout.offsets.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.offsets.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+
+    layout.offsets.arena_size = arena.total_size();
+    return layout;
+}
+
+// Scalar convenience overload: replicates one task-window size and one dep-pool capacity
+// across all PTO2_MAX_RING_DEPTH rings and forwards to the per-ring overload defined above.
+inline PTO2RuntimeArenaLayout runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity)
+{
+    uint64_t windows[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pools[PTO2_MAX_RING_DEPTH];
+    // Heap sizes stay zero in the layout; the caller may set them separately via runtime_init_data_from_layout.
+    uint64_t heaps[PTO2_MAX_RING_DEPTH] = {};
+    for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ++ring) {
+        windows[ring] = task_window_size;
+        dep_pools[ring] = dep_pool_capacity;
+    }
+    return runtime_reserve_layout(arena, windows, heaps, dep_pools);
+}
+
+inline PTO2Runtime *runtime_init_data_from_layout(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t, void *gm_heap_dev_base, uint64_t heap_size)
+{  // Scalar-heap form: populates the runtime image inside the arena; the unnamed fifth parameter (the per-ring overload passes sm_size there) is ignored.
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.offsets.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+    // Zero the shared-memory handle wrapper region as well.
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.offsets.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // Standalone fields. rt->ops stays zeroed here and is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;  // heap_size is treated as one ring's share; the total covers every ring.
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+    // Sub-structure init: only ring 0's task window size is forwarded. Either failure returns nullptr.
+    if (!rt->orchestrator.init_data_from_layout(layout.offsets.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.sizing.task_window_sizes[0])) return nullptr;
+    if (!rt->scheduler.init_data_from_layout(layout.offsets.sched, arena, sm_dev_base)) return nullptr;
+    // Clear the AICore completion mailbox region.
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.offsets.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+// Per-ring overload (matches upstream a5 signature with sm_size + heap_sizes[]); forwards only heap_sizes[0] to the scalar overload.
+inline PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode,
+    void *sm_dev_base, uint64_t sm_size, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+)
+{
+    return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, sm_size, gm_heap_dev_base, heap_sizes[0]);
+}
+
+inline void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt)
+{  // Points rt at its arena regions using the offsets recorded in layout; rt must be non-null (it is dereferenced unchecked).
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.offsets.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.offsets.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.offsets.orch, arena, &rt->scheduler);  // The orchestrator also receives the scheduler's address.
+    rt->scheduler.wire_arena_pointers(layout.offsets.sched, arena);
+}
+
+inline void runtime_destroy(PTO2Runtime *rt)
+{
+    // Tears down scheduler and orchestrator state. The arena buffer is pooled across runs by DeviceRunner and is never freed here.
+    if (!rt) return;  // A null runtime is a no-op.
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;  // Only the pointers are cleared; the regions they referenced are left alone.
+    rt->sm_handle = nullptr;
+}
+
+// Upstream-compatible two-argument overload. The arena argument is unused: the
+// caller owns the arena's lifetime, so this simply forwards to runtime_destroy(rt).
+inline void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/)
+{
+    runtime_destroy(rt);
+}
+
+// Stub for the upstream arena-reuse path (#1234). The polling design has not
+// adopted arena caching / reset_for_reuse machinery; the AICPU reuse path in
+// aicpu_executor still references this symbol, so provide a no-op that reports
+// success for any non-null rt. Per the original author, the init_per_ring call
+// preceding this one in aicpu_executor already resets the SM header for the next run.
+inline bool runtime_reset_for_reuse(DeviceArena & /*arena*/, const PTO2RuntimeArenaLayout & /*layout*/, PTO2Runtime *rt)
+{
+    return rt != nullptr;  // No state is reset here; false only for a null runtime.
+}
+
+inline void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode)
+{
+    if (rt != nullptr) rt->mode = mode;  // Stores the execution mode; a null runtime is ignored.
+}
+
+inline void rt_scope_begin(PTO2Runtime *rt)
+{
+    const PTO2ScopeMode requested = rt->pending_scope_mode;  // Take the mode staged for this scope...
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;  // ...and reset the staged value to AUTO before opening it.
+    rt->orchestrator.begin_scope(requested);
+}
+
+inline void rt_scope_end(PTO2Runtime *rt)
+{
+    rt->orchestrator.end_scope();  // Thin forwarder to the orchestrator; rt must be non-null.
+}
+
+inline void rt_orchestration_done(PTO2Runtime *rt)
+{
+    rt->orchestrator.mark_done();  // Thin forwarder to the orchestrator; rt must be non-null.
+}
+
+inline void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...)
+{
+    va_list varargs;
+    va_start(varargs, fmt);
+    if (fmt != nullptr && fmt[0] != '\0')  // Non-empty format: render the message, then report it.
+    {
+        char text[1024];  // Fixed buffer; vsnprintf truncates anything longer.
+        vsnprintf(text, sizeof(text), fmt, varargs);
+        rt->orchestrator.report_fatal(error_code, func, "%s", text);
+    }
+    else  // Null or empty format: report the error code with no message.
+    {
+        rt->orchestrator.report_fatal(error_code, func, nullptr);
+    }
+    va_end(varargs);
+}
+
+// Logging op for the orchestration .so: LOG_INFO_V<n>(fmt, ...) calls are routed here.
+// The message is rendered into a fixed 1024-byte buffer (longer output is truncated) and
+// handed to unified_log_info_v, which is where the verbosity gate lives.
+inline void rt_log_info_v(const char *func, int v, const char *fmt, ...)
+{
+    char text[1024];
+    va_list varargs;
+    va_start(varargs, fmt);
+    vsnprintf(text, sizeof(text), fmt, varargs);
+    va_end(varargs);
+    unified_log_info_v(func, v, "%s", text);
+}
+// Spin-waits until every producer of tensor (its owner task plus each tensor_map hit) has completed and, if wait_for_consumers, until their consumers are done; returns false after a fatal timeout report tagged with caller.
+MAYBE_UNINITIALIZED_BEGIN
+inline bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller)
+{
+    PTO2TaskId owner = tensor.owner_task_id;
+    PTO2OrchestratorState &orch = rt->orchestrator;
+    // Producer slots are gathered into a fixed-size batch (seg) and waited on in bulk by flush_segment.
+    constexpr int kSegmentCap = 64;
+    const PTO2TaskSlotState *seg[kSegmentCap];
+    int seg_count = 0;
+    bool signaled = false;  // true once try_push has raised orch_needs_drain
+    bool failed = false;  // latched by the first timeout; every lambda bails out once it is set
+    // Spin until this slot's task is flagged complete in its ring; the timeout is only polled every 1024 spins.
+    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        auto &ring_hdr = orch.sm_header->rings[ring_id];
+        const int32_t mask = ring_hdr.task_window_mask;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        // (m) Use completion_flags as the single completion signal.
+        while (ring_hdr.completion_flags[local_id & mask].load(std::memory_order_acquire) == 0)
+        {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
+            {
+                orch.report_fatal(PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed", (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id);
+                failed = true;
+                return;
+            }
+        }
+    };
+    // Spin until the ring's completed_watermark reaches the slot's last consumer; same timeout policy as above.
+    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = slot.task->task_id.local();
+        // With watermark-based reclamation, "all consumers done" means the
+        // per-ring completed_watermark has reached this slot's recorded
+        // last_consumer_local_id.
+        PTO2SharedMemoryRingHeader &ring_hdr = rt->orchestrator.sm_header->rings[ring_id];
+        int32_t target = slot.last_consumer_local_id;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (ring_hdr.completed_watermark.load(std::memory_order_acquire) < target)
+        {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
+            {
+                orch.report_fatal(PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done", (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id);
+                failed = true;
+                return;
+            }
+        }
+    };
+    // Wait on every batched slot in order, then empty the batch (the batch is left untouched if a wait fails).
+    auto flush_segment = [&]() {
+        for (int i = 0; i < seg_count; i++)
+        {
+            wait_one_producer(*seg[i]);
+            if (failed) return;
+            if (!wait_for_consumers) continue;
+            wait_one_consumers(*seg[i]);
+            if (failed) return;
+        }
+        seg_count = 0;
+    };
+    // Add a slot to the batch unless it is already present, flushing first when the batch is full.
+    auto try_push = [&](const PTO2TaskSlotState &s) {
+        for (int j = 0; j < seg_count; j++)
+            if (seg[j] == &s) return;
+        if (seg_count == kSegmentCap)
+        {
+            flush_segment();
+            if (failed) return;
+        }
+        seg[seg_count++] = &s;
+        if (!signaled)  // first push only: raise orch_needs_drain (cleared again before this function returns)
+        {
+            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
+            signaled = true;
+        }
+    };
+    // Gather the owner's slot plus one slot per tensor_map hit, then wait on whatever is still batched.
+    auto do_wait = [&]() {
+        if (owner.is_valid())
+        {
+            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
+            try_push(s);
+            if (failed) return;
+        }
+        // NOTE(review): returning false from the callback presumably stops the lookup early — confirm against tensor_map.lookup.
+        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
+            PTO2TaskId pid = entry.producer_task_id;
+            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
+            try_push(s);
+            return !failed;
+        });
+        if (failed) return;
+        flush_segment();
+    };
+
+    do_wait();
+    if (signaled) orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
+    return !failed;
+}
+MAYBE_UNINITIALIZED_END
+
+inline uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[])
+{
+    // Reads the element at indices[0..ndims) once its producer has completed; returns 0 on every failure path.
+    if (tensor.buffer.addr == 0) return 0;  // no backing buffer
+    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) return 0;  // producer-only wait; a timeout was already reported as fatal
+    const uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    const uint64_t elem_size = get_element_size(tensor.dtype);
+    uint64_t result = 0;
+    if (elem_size > sizeof(result)) return 0;  // wider than the 64-bit return slot: the memcpy below would overrun result
+    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(&result, ptr, elem_size);  // bytes of result beyond elem_size stay zero
+    return result;
+}
+
+inline void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value)
+{
+    // Writes the first elem_size bytes of value's object representation to the element at indices[0..ndims); does nothing on failure.
+    if (tensor.buffer.addr == 0) return;  // no backing buffer
+    // Wait for producer + all consumers before writing (WAW + WAR safety)
+    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) return;
+    const uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    const uint64_t elem_size = get_element_size(tensor.dtype);
+    if (elem_size > sizeof(value)) return;  // wider than value: the memcpy below would read past it
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(ptr, &value, elem_size);
+}
+
+// Function-pointer ops table backing — moved from pto_runtime2.cpp so that
+// the inline runtime_finalize_after_wire below can refer to it.
+
+inline TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args)
+{
+    return rt->orchestrator.submit_task(mixed_kernels, args);  // Ops-table thunk (.submit_task); rt is dereferenced unchecked.
+}
+
+inline TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args)
+{
+    return rt->orchestrator.alloc_tensors(args);  // Ops-table thunk (.alloc_tensors); rt is dereferenced unchecked.
+}
+
+inline TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args)
+{
+    return rt->orchestrator.submit_dummy_task(args);  // Ops-table thunk (.submit_dummy_task); rt is dereferenced unchecked.
+}
+
+inline bool is_fatal_impl(PTO2Runtime *rt)
+{
+    return rt->orchestrator.fatal;  // Ops-table thunk (.is_fatal): reads the orchestrator's fatal flag; rt is dereferenced unchecked.
+}
+
+inline const PTO2RuntimeOps s_runtime_ops = {  // Function-pointer table that runtime_finalize_after_wire installs into rt->ops.
+    .submit_task = submit_task_impl,
+    .scope_begin = rt_scope_begin,
+    .scope_end = rt_scope_end,
+    .orchestration_done = rt_orchestration_done,
+    .is_fatal = is_fatal_impl,
+    .report_fatal = rt_report_fatal,
+    .log_info_v = rt_log_info_v,
+    .get_tensor_data = get_tensor_data,
+    .set_tensor_data = set_tensor_data,
+    .alloc_tensors = alloc_tensors_impl,
+    .submit_dummy_task = submit_dummy_task_impl,
+    .scope_set_site = nullptr,  // no handler is wired in this table — NOTE(review): confirm every caller null-checks it
+};
 
-/**
- * Cross-layer data access: write a value to a tensor at given indices.
- * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap.
- * See set_tensor_data in pto_orchestration_api.h for full documentation.
- */
-void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
+inline void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count)
+{
+    rt->ops = &s_runtime_ops;  // publish the function-pointer ops table on the runtime
+    rt->orchestrator.total_cluster_count = aic_count;  // NOTE(review): cluster count is taken from the AIC count — presumably one AIC per cluster; confirm
+    rt->orchestrator.total_aiv_count = aiv_count;
+}
 
-/**
- * Slim config struct exported by orchestration .so via aicpu_orchestration_config().
- * Shared definition with pto_orchestration_api.h (same layout, guarded).
- */
 #ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
 #define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
+struct PTO2OrchestrationConfig
+{
     int expected_arg_count;
 };
 #endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 01194134a..659e30b7a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -9,19 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - Core Type Definitions
- *
- * This header defines all fundamental types used by the PTO Runtime2 system:
- * - Configuration constants
- * - Worker types and task states
- * - Tensor regions and task parameters
- * - Task descriptors with fanin/fanout tracking
- * - Dependency list entries
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
 
@@ -40,11 +27,6 @@
 #include "pto_task_id.h"
 #include "pto_types.h"
 
-// Spin-wait hint for AICPU threads.  On real hardware the AICPU has dedicated
-// ARM A55 cores — no OS yield is needed, so the hint is a no-op.  In simulation
-// all threads share host CPU cores, so we yield to prevent starvation.
-// This header is also compiled into the Host .so (for struct definitions only),
-// where the hint is never called — the fallback no-op keeps Host builds clean.
 #if __has_include("spin_hint.h")
 #include "spin_hint.h"
 #else
@@ -65,8 +47,7 @@
 // Use pto2_task_slot(sched, task_id) for slot calculation.
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
 
-// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
-// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+// Multi-ring layout: scope_depth → ring index (capped at PTO2_MAX_RING_DEPTH - 1).
 #define PTO2_MAX_RING_DEPTH 4
 
 // Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH)
@@ -77,11 +58,6 @@
 
 // Scope management
 #define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
-// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot
-// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot
-// is in flight, no more tasks can ever be pushed regardless of buffer size.
-// scope_tasks_push fatals on overflow rather than growing the arena-owned
-// buffer (which would be UB on the arena's malloc'd backing).
 #define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH)
 
 // Ready queue
@@ -93,8 +69,11 @@
 // Wiring queue
 #define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size
 
-// Fanin storage
-#define PTO2_FANIN_INLINE_CAP 64
+// Fanin storage — absolute max number of unique fanin dependencies per task.
+// Matches the upstream/main PTO2_FANIN_INLINE_CAP so workloads that already
+// fit there (qwen3_14b_decode, scalar_data_test, fanin_lookup_perf) keep
+// fitting after the polling-design rewrite.
+#define PTO2_MAX_FANIN 64
 
 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
@@ -110,87 +89,38 @@
 // a redefinition conflict. See issue #1189.
 constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_MS = 15000;  // 15 s
 
-// =============================================================================
-// Task States
-// =============================================================================
-
-/**
- * Task state enumeration
- *
- * State transitions:
- *   PENDING -> COMPLETED -> CONSUMED
- *
- * The slot stays in PENDING from submit through "ready in queue" and "running
- * on a worker"; readiness and running-vs-idle are derived from fanin_refcount
- * and per-core running_slot_state respectively, not from task_state itself.
- *
- * Conditions:
- *   PENDING->COMPLETED:   all subtasks finish (set by scheduler) or task is a
- *                         hidden alloc completed inline by the orchestrator
- *   COMPLETED->CONSUMED:  fanout_refcount == fanout_count && state == COMPLETED
- */
-typedef enum {
-    PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
-    PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
-    PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
+typedef enum
+{
+    PTO2_TASK_PENDING = 0,   // Submitted; awaiting fanin, queued, or dispatched
+    PTO2_TASK_COMPLETED = 1  // Execution finished. The slot's heap chunk becomes
+                             // reclaimable once the per-ring completed_watermark
+                             // reaches this slot's last_consumer_local_id.
+} PTO2TaskState;
 
-/**
- * Result of a unified task allocation.
- */
-struct PTO2TaskAllocResult {
+struct PTO2TaskAllocResult
+{
     int32_t task_id;    // Absolute task ID (not wrapped)
     int32_t slot;       // task_id & (window_size - 1)
     void *packed_base;  // Heap allocation result (nullptr if failure)
     void *packed_end;   // packed_base + aligned output_size
 
-    bool failed() const { return task_id < 0; }
+    bool failed() const
+    {
+        return task_id < 0;
+    }
 };
 
-struct PTO2OutputLayout {
+struct PTO2OutputLayout
+{
     uint64_t offsets[MAX_TENSOR_ARGS] = {};
     uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {};
     int32_t total_output_size = 0;
 };
 
-// =============================================================================
-// Dependency List Entry
-// =============================================================================
-
-/**
- * Fanin spill entry
- * Stored in the dedicated fanin spill ring buffer.
- */
 struct PTO2TaskSlotState;  // Forward declaration
-struct PTO2FaninPool;      // Forward declaration
-struct PTO2FaninSpillEntry {
-    PTO2TaskSlotState *slot_state;
-};
-static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(uintptr_t));
-
-/**
- * Dependency list entry (singly-linked list node)
- * Stored in DepListPool ring buffer.
- */
-struct PTO2DepListEntry {
-    PTO2TaskSlotState *slot_state;  // Consumer slot state (direct pointer)
-    PTO2DepListEntry *next;         // next entry
-};
-
-// =============================================================================
-// Task Descriptor
-// =============================================================================
 
-/**
- * Task descriptor structure (shared memory)
- *
- * Stored in the TaskDescriptor ring buffer in shared memory.
- * Contains static identification and buffer pointers only.
- * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState.
- *
- * Fields set by Orchestrator at submission, read by Scheduler for dispatch.
- */
-struct PTO2TaskDescriptor {
+struct PTO2TaskDescriptor
+{
     // Mixed-task identification (encodes ring_id in upper 32 bits)
     PTO2TaskId task_id;  // raw: (ring_id << 32) | local_id
 
@@ -209,75 +139,32 @@ struct PTO2TaskDescriptor {
 /**
  * Task payload data (cold path - only accessed during orchestration and dispatch)
  *
- * Layout: metadata + inline fanin packed in the first 9 cache lines, followed
- * by bulk tensor and scalar data. Small fanins stay fully inline; larger
- * fanins spill into a per-ring ring buffer slice.
+ * Layout: metadata + flat fanin_local_ids[] / fanin_ring_ids[] up front
+ * (332B at PTO2_MAX_FANIN = 64), followed by bulk tensor and scalar data.
  */
-// Speculative early-dispatch claim states for PTO2TaskPayload::spec_state.
-enum PTO2SpecState : uint8_t {
-    PTO2_SPEC_NONE = 0,       // not pre-staged
-    PTO2_SPEC_STAGING = 1,    // Hook 1 claimed it; staging in progress
-    PTO2_SPEC_STAGED = 2,     // staged on a core, gated; staged_* fields valid
-    PTO2_SPEC_DISPATCHED = 3  // routed via the normal dispatch path (no pre-stage)
-};
-
-// A pre-staged consumer occupies one core per gated subtask block. WHICH cores
-// it occupies is recorded as a bitmask (staged_core_mask, 1 bit per global
-// core_id); the completion-path release iterates the set bits and rings each
-// core's doorbell from the scheduler's per-core doorbell table. Bounded by the
-// chip's core count (RUNTIME_MAX_WORKER = 72; no two-level pre-dispatch means
-// gated cores in flight <= core count), NOT by block_num — so a wide SPMD
-// consumer can pre-stage all its idle cores. 2 words = 128 bits >= 72.
-inline constexpr int PTO2_SPEC_CORE_MASK_WORDS = 2;
 
 struct PTO2TaskPayload {
-    // === Cache lines 0-8 (576B) — metadata + inline fanin ===
+    // === Metadata + fanin (wireless model): 332B at PTO2_MAX_FANIN = 64, padded up to the aligned tensors[] ===
     int32_t tensor_count{0};
     int32_t scalar_count{0};
-    int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
-    int32_t fanin_spill_start{0};   // Linear start index in fanin spill pool (0 = no spill)
-    PTO2FaninPool *fanin_spill_pool{nullptr};
-    PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
-    // Speculative early-dispatch metadata (AICPU-side only). Ordered by descending
-    // alignment (8B mask, 4B fanin, then 1B flags) so the block packs with no
-    // internal padding. Kept here after the fanin array (not moved up front): on
-    // cache line 8 it shares only with the rarely-touched fanin tail, whereas in
-    // line 0 the spec atomics (written during staging) would false-share with
-    // tensor_count/scalar_count (read by build_payload at dispatch). Fits in the 40B
-    // between the fanin array (offset 536) and the 64B-aligned tensors[] (offset
-    // 576), so sizeof and tensors[] are unchanged.
-    //
-    // Bitmask of global core_ids this consumer is pre-staged (gated) on. Set with
-    // atomic fetch_or by concurrent stagers; read by release. (Re)initialized in
-    // PTO2TaskPayload::init before the slot can be staged again.
-    std::atomic<uint64_t> staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]{};
-    // Early-dispatch CANDIDATE detection (event-driven, dual of fanin_refcount):
-    // seeded at wiring with producers already complete, then a flagged producer's
-    // DISPATCH bumps each consumer's dispatch_fanin. dispatch_fanin ==
-    // fanin_actual_count  <=>  every producer is flagged-and-dispatched or was
-    // pre-completed  =>  this task is an early-dispatch candidate (push early_dispatch_queue).
-    std::atomic<int32_t> dispatch_fanin{0};  // CONSUMER side: flagged-dispatched + pre-completed producers
-    bool allow_early_resolve{false};         // codegen hint copied from Arg in PTO2TaskPayload::init
-    // Lock-free claim state shared by the stagers (Hook 1, possibly several AICPU
-    // threads concurrently) and the completion-path release: 0=NONE, 1=STAGING,
-    // 3=DISPATCHED (2=STAGED is unused now). STAGING is the STABLE gated state —
-    // many threads stage blocks concurrently while it holds, each claiming a block
-    // via the atomic next_block_idx and OR-ing its cores into staged_core_mask.
-    // Release does STAGING->DISPATCHED then rings the mask; a thread that stages a
-    // block AFTER release flipped DISPATCHED rings that block's doorbell itself
-    // (self-ring), so no doorbell is ever missed.
-    std::atomic<uint8_t> spec_state{0};
-    std::atomic<uint8_t> dispatch_propagated{0};  // PRODUCER side: once-guard for fanout propagation
-    std::atomic<uint8_t> spec_chain_active{0};    // inherited early-dispatch flag (auto-chain past codegen flag)
-    uint8_t spec_chain_depth{0};                  // auto-chain depth; inherited = parent+1, capped
-    // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) ===
+    // wireless: flat fanin_local_ids[] populated at submit. The thread-0
+    // pending poll indexes a compact ring-level completion_flags byte array
+    // via these ids — avoids a pointer chase per fanin into a 128B-aligned
+    // slot_state.
+    int32_t fanin_count{0};
+    int32_t fanin_local_ids[PTO2_MAX_FANIN];
+    // Parallel array: producer's ring_id for each fanin edge. With multi-ring
+    // (PTO2_MAX_RING_DEPTH > 1), the consumer's pending poll must read the
+    // producer's ring's completion_flags — same-ring lookup is no longer a
+    // safe shortcut. Sized as bytes to stay cheap (64B for PTO2_MAX_FANIN=64).
+    uint8_t fanin_ring_ids[PTO2_MAX_FANIN];
+    // === Tensors (Tensor is alignas(64); array is naturally aligned) ===
     Tensor tensors[MAX_TENSOR_ARGS];
-    // === Cache lines 73-74 (128B) — scalars ===
+    // === Scalars ===
     uint64_t scalars[MAX_SCALAR_ARGS];
 
-    // Layout verification (size checks that don't need offsetof).
     static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines");
-    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)");
+    static_assert((MAX_SCALAR_ARGS * sizeof(uint64_t)) % 64 == 0, "scalar region must be whole cache lines: init() rounds its memcpy up to 64B");  // guards the PTO2_ALIGN_UP(..., 64) copy into scalars[]
 
     /**
      * Prefetch (for write) the regions init() is about to fill so the stores land
@@ -297,7 +184,6 @@ struct PTO2TaskPayload {
         __builtin_prefetch(this, 1, 3);
         __builtin_prefetch(reinterpret_cast<const char *>(this) + 64, 1, 3);
         __builtin_prefetch(reinterpret_cast<const char *>(this) + 128, 1, 3);
-        __builtin_prefetch(reinterpret_cast<const char *>(this) + 512, 1, 3);  // spec fields (cache line 8)
     }
 
     /**
@@ -310,15 +196,15 @@ struct PTO2TaskPayload {
      * @param args                Task arguments (tensors + scalars)
      * @param result  Materialized output tensors (from TensorCreateInfo path)
      */
-    void init(
-        const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout
-    ) {
+    void init(const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout) {
         tensor_count = args.tensor_count();
         scalar_count = args.scalar_count();
 
         // int32_t out_idx = 0;
-        for (int32_t i = 0; i < args.tensor_count(); i++) {
-            if (args.tag(i) != TensorArgType::OUTPUT) {
+        for (int32_t i = 0; i < args.tensor_count(); i++)
+        {
+            if (args.tag(i) != TensorArgType::OUTPUT)
+            {
                 tensors[i].copy(args.tensor(i).ref());
             } else {
                 init_tensor_from_create_info(
@@ -333,112 +219,42 @@ struct PTO2TaskPayload {
         // Round up to cache line boundary. Both arrays are 128B so no overrun.
         // Eliminates branches; extra bytes within the same CL have zero additional cost.
         memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64));
-
-        // Speculative early-dispatch metadata — the single init point for these
-        // fields. reset_for_reuse MUST NOT touch the payload (it runs on the
-        // scheduler's advance-ring path and would pull this cold cache line across
-        // structures); prepare_task only allocates/binds. prefetch() warms this
-        // line (offset 512) so these writes land in warm cache.
-        //
-        // spec_state / staged_core_mask / dispatch_fanin / spec_chain_* are all
-        // CONSUMER-side: a task with allow_early_resolve == false still has them
-        // touched when one of ITS producers is flagged (propagate_dispatch_fanin
-        // bumps dispatch_fanin and may CAS spec_state / set the auto-chain flag on
-        // any consumer, independent of the consumer's own hint). So they MUST be
-        // zeroed here unconditionally — no per-task allow_early_resolve gating.
-        allow_early_resolve = args.allow_early_resolve();
-        spec_state.store(PTO2_SPEC_NONE, std::memory_order_relaxed);
-        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
-            staged_core_mask[w].store(0, std::memory_order_relaxed);
-        dispatch_fanin.store(0, std::memory_order_relaxed);
-        dispatch_propagated.store(0, std::memory_order_relaxed);
-        spec_chain_active.store(0, std::memory_order_relaxed);
-        spec_chain_depth = 0;
     }
 };
 
 // PTO2TaskPayload layout verification (offsetof requires complete type).
-static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift");
-static_assert(
-    offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata"
-);
-static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)");
-static_assert(
-    offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor),
-    "scalars must immediately follow tensors"
-);
-static_assert(
-    sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t),
-    "PTO2TaskPayload size must stay on the baseline cache-line footprint"
-);
-
-/**
- * Per-task slot scheduling state (scheduler-private, NOT in shared memory)
- *
- * Consolidates all hot-path scheduling fields into a single cache-friendly
- * structure (32 bytes = half a cache line). Accessing any field of a task's
- * slot state brings all related fields into the same cache line.
- *
- * Concurrency notes:
- * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
- * - fanin_count set once at submission, read-only after (hot path for ready check)
- * - task_state, fanin_refcount, fanout_refcount updated atomically
- */
+static_assert(offsetof(PTO2TaskPayload, fanin_local_ids) == 12, "fanin array must follow metadata words");
+static_assert(offsetof(PTO2TaskPayload, scalars) == offsetof(PTO2TaskPayload, tensors) + MAX_TENSOR_ARGS * sizeof(Tensor), "scalars must immediately follow tensors");
+static_assert(sizeof(PTO2TaskPayload) == offsetof(PTO2TaskPayload, scalars) + MAX_SCALAR_ARGS * sizeof(uint64_t), "no trailing padding after scalars");
+
+struct alignas(64) PTO2TaskSlotState
+{
+    // Highest local task id among this slot's consumers. Set to this slot's
+    // own local_id in prepare_task; bumped via max() in submit_task_common for
+    // each consumer that has this slot as a fanin. The slot's heap chunk is
+    // safe to reclaim when the per-ring completed_watermark reaches at least
+    // this id (i.e. every task up to and including the last consumer has
+    // transitioned to COMPLETED). Single-writer (orchestrator) at submit time.
+    int32_t last_consumer_local_id;
 
-// fanout_count / fanout_refcount bit encoding (both uint32):
-//   bits [30:0] = consumer references (count: # consumers; refcount: # released)
-//   bit  [31]   = the owning scope's reference (PTO2_FANOUT_SCOPE_BIT)
-// fanout_count is seeded to PTO2_FANOUT_SCOPE_BIT and ++'d per consumer, so it
-// ends as (SCOPE_BIT | num_consumers). release adds 1 (consumer completion) or
-// SCOPE_BIT (scope_end). CONSUMED iff fanout_refcount == fanout_count (every
-// consumer released AND scope bit set). Keeping the scope ref in a distinct bit
-// (rather than folding scope + consumers into one count) lets a consumer reach
-// fanout_refcount == (fanout_count & ~PTO2_FANOUT_SCOPE_BIT) while the scope bit
-// is still unset -- i.e. "all consumers done but scope still open" stays
-// distinguishable from "fully consumed". The heap/task deadlock detector keys
-// off exactly that complement: that condition with state==COMPLETED means the
-// head can only be released by scope_end, which a blocked orchestrator can
-// never reach -> provable deadlock.
-static constexpr uint32_t PTO2_FANOUT_SCOPE_BIT = 0x80000000u;
-
-struct alignas(64) PTO2TaskSlotState {
-    // Fanout lock + list (accessed together under lock in on_task_complete)
-    std::atomic<int32_t> fanout_lock;  // Per-task spinlock (0=unlocked, 1=locked)
-    uint32_t fanout_count;             // SCOPE_BIT (owning scope) | number of consumers
-
-    PTO2DepListEntry *fanout_head;  // Pointer to first fanout entry (nullptr = empty)
-
-    // Task state (completion, consumed check, ready check)
-    std::atomic<PTO2TaskState> task_state;  // PENDING/COMPLETED/CONSUMED
-
-    // Fanin (accessed together in release_fanin_and_check_ready)
-    std::atomic<int32_t> fanin_refcount;  // Dynamic: counts completed producers
-    int32_t fanin_count;                  // Number of producer dependencies (set once by wiring)
-
-    // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
-    std::atomic<uint32_t> fanout_refcount;  // Dynamic: low bits = released consumers, bit31 = scope released
-
-    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
-    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
-    // but written here per-submit instead of in an O(window_size) init loop —
-    // these are the only "scale-dependent" pointers in this struct, so moving
-    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
+    // --- (e) Wake-list: lightweight last-fanin notification ---
+    // When a pending consumer's fanin scan finds exactly ONE unmet fanin,
+    // it registers itself on the producer's wake list (CAS push). On producer
+    // completion, the producer atomic-exchanges wake_list_head to the
+    // SENTINEL value and pushes every waiter to the ready queues. Consumers
+    // that observe SENTINEL during registration push themselves directly
+    // (producer already completed). Reset to nullptr on slot reuse.
+    std::atomic<PTO2TaskSlotState *> wake_list_head{nullptr};
+    PTO2TaskSlotState *next_in_wake_list{nullptr};
+
     // --- Set per-submit (depend on task inputs) ---
     ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
     uint8_t ring_id;         // Ring layer (immutable after init)
-    // Set by any subtask FIN that pushed deferred-completion CONDITIONs to
-    // the runtime mailbox; read by the last subtask FIN to decide whether
-    // the task needs MPSC-deferred completion or can complete inline on this
-    // thread. Carved out of the otherwise-padding byte between ring_id and
-    // dep_pool_mark to keep PTO2TaskSlotState at 64 bytes. The write is
-    // sequenced before on_subtask_complete's acq_rel fetch_add and the read
-    // after, so all earlier subtasks' writes are visible to the last subtask.
     std::atomic<bool> any_subtask_deferred{false};
     uint8_t _async_pad{0};
-    int32_t dep_pool_mark{0};  // Dep pool top after wiring (thread-0-only)
 
     std::atomic<int16_t> completed_subtasks{0};  // Each core completion increments by 1
     int16_t total_required_subtasks{0};          // = logical_block_num * popcount(active_mask)
@@ -449,99 +265,34 @@ struct alignas(64) PTO2TaskSlotState {
     // happens before release; normal dispatch of the remainder happens after).
     std::atomic<int16_t> next_block_idx{0};
 
-    /**
-     * Bind the slot-invariant ring id. Called once per slot during
-     * RingSchedState::init(); ring_id never changes across reuses.
-     */
-    void bind_ring(uint8_t rid) { ring_id = rid; }
+    void bind_ring(uint8_t rid)
+    {
+        ring_id = rid;
+    }
 
-    /**
-     * Re-bind the per-slot payload/task pointers. Called by
-     * orch::prepare_task on every submit. Value is constant for a given
-     * slot, but we pay the cheap re-write each submit (both fields land on
-     * the same 64B slot_state cache line that prepare_task is already
-     * dirtying) to avoid the init-time per-slot loop.
-     */
-    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t)
+    {
         payload = p;
         task = t;
     }
 
-    /**
-     * Reset dynamic scheduling fields for slot reuse.
-     * Called by advance_ring_pointers() after a slot transitions to CONSUMED
-     * and last_task_alive advances past it, but before sync_to_sm() publishes
-     * the new last_task_alive to the orchestrator.
-     *
-     * Skips payload, task, ring_id (immutable, bound once at init).
-     * Skips task_state: left as CONSUMED so that wait_for_tensor_ready()
-     * callers holding stale owner_task_id still observe a completed state.
-     * task_state is set to PENDING by the orchestrator when it reuses the slot.
-     */
-    void reset_for_reuse() {
-        fanout_lock.store(0, std::memory_order_relaxed);
-        fanout_count = PTO2_FANOUT_SCOPE_BIT;  // bit31 = owning-scope ref; consumers ++ into low bits
-        fanout_head = nullptr;
-        fanin_refcount.store(0, std::memory_order_relaxed);
-        fanout_refcount.store(0, std::memory_order_relaxed);
+    void reset_for_reuse()
+    {
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx.store(0, std::memory_order_relaxed);
         any_subtask_deferred.store(false, std::memory_order_relaxed);
-        // Note: payload spec fields (spec_state / staged_core_mask / dispatch_fanin /
-        // spec_chain_*) are NOT reset here — this method skips the payload by
-        // contract. They are (re)initialized in PTO2TaskPayload::init on every
-        // submit, before the slot becomes visible to the scheduler.
-    }
-
-    // === Per-task fanout spinlock ===
-    //
-    // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST
-    // be held whenever reading or writing fanout_head / fanout_count, because
-    // the orchestrator adds consumers concurrently with the scheduler
-    // traversing the list after task completion.
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-
-        for (;;) {
-            while (fanout_lock.load(std::memory_order_acquire) != 0) {
-                contended = true;
-                atomic_ops++;
-                SPIN_WAIT_HINT();
-            }
-            int32_t expected = 0;
-            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
-                atomic_ops++;
-                atomic_count += atomic_ops;
-                if (contended) {
-                    wait_cycle += (get_sys_cnt_aicpu() - t0);
-                }
-                return;
-            }
-            contended = true;
-            atomic_ops++;
-        }
+        // (e) Wake list: clear for the next incarnation. Previous incarnation
+        // left it at WAKE_LIST_SENTINEL (set by its on_mixed_task_complete).
+        wake_list_head.store(nullptr, std::memory_order_relaxed);
+        next_in_wake_list = nullptr;
+        // last_consumer_local_id is reset in prepare_task once the task_id is known.
     }
-#endif
-
-    void lock_fanout() {
-        for (;;) {
-            while (fanout_lock.load(std::memory_order_acquire) != 0) {
-                SPIN_WAIT_HINT();
-            }
-            int32_t expected = 0;
-            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
-                return;
-            }
-        }
-    }
-
-    void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); }
 };
 
-static_assert(sizeof(PTO2TaskSlotState) == 64);
+// (e) Sentinel marking a wake list as "owner already completed; no more
+// registrations accepted". Distinct from any real slot_state pointer.
+inline PTO2TaskSlotState *const WAKE_LIST_SENTINEL = reinterpret_cast<PTO2TaskSlotState *>(uintptr_t{1});
+
+static_assert(sizeof(PTO2TaskSlotState) <= 128, "slot state should fit in two cache lines");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index f1058675d..aa8539909 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -8,64 +8,24 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Shared Memory Layout
- *
- * Defines the shared memory structure for Orchestrator-Scheduler communication.
- *
- * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1):
- *   +---------------------------+
- *   | SharedMemoryHeader        |  (per-ring flow control + sync)
- *   +---------------------------+
- *   | Ring 0: TaskDescriptor[]  |
- *   | Ring 0: TaskPayload[]     |
- *   | Ring 0: TaskSlotState[]   |
- *   +---------------------------+
- *   | Ring 1: TaskDescriptor[]  |
- *   | Ring 1: TaskPayload[]     |
- *   | Ring 1: TaskSlotState[]   |
- *   +---------------------------+
- *   | ...                       |
- *   +---------------------------+
- *
- * Design principles:
- * - Only data needed for Orchestrator<->Scheduler communication is here
- * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory
- * - Flow control via atomic counters/flags (no locks needed for single-word R/W)
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #pragma once
 
 #include "utils/device_arena.h"
 #include "pto_runtime2_types.h"
 
-// =============================================================================
-// Shared Memory Header
-// =============================================================================
-
 struct PTO2SharedMemoryHandle;
 
-/**
- * Per-ring flow control state in shared memory.
- * Written/read by Orchestrator and Scheduler for synchronization.
- */
-struct alignas(64) PTO2RingFlowControl {
+struct alignas(64) PTO2RingFlowControl
+{
     // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
     alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
 
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
-    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
-    // local_task_id_ from initial_local_task_id (default 0 in production)
-    // *without* dereferencing current_task_index — it relies on this reset
-    // running on every AICPU boot so 0 stays in sync. If you ever change
-    // the initial fc value or the boot ordering, update the default in
-    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
-    // submit IDs will be off by the divergence.
-    void init() {
+    void init()
+    {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
     }
@@ -75,15 +35,16 @@ struct alignas(64) PTO2RingFlowControl {
 
 static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)");
 
-/**
- * Per-ring shared memory header section.
- *
- * Groups flow-control, layout info, and per-ring data pointers for a single ring.
- * Pointers are host-side only (set by setup_pointers, invalid on device).
- */
-struct alignas(64) PTO2SharedMemoryRingHeader {
+struct alignas(64) PTO2SharedMemoryRingHeader
+{
     PTO2RingFlowControl fc;
 
+    // Highest task_id such that every task with id in [0, completed_watermark]
+    // has reached COMPLETED. Maintained at task-completion time. Used to gate
+    // slot reclamation: a producer slot P is safe to retire when
+    // completed_watermark >= P.last_consumer_local_id.
+    alignas(64) std::atomic<int32_t> completed_watermark;
+
     // Layout metadata (set once at init)
     uint64_t task_window_size;
     int32_t task_window_mask;
@@ -95,31 +56,48 @@ struct alignas(64) PTO2SharedMemoryRingHeader {
     PTO2TaskPayload *task_payloads;
     PTO2TaskSlotState *slot_states;
 
-    int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; }
-
-    PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; }
+    // Compact contiguous array (one byte per slot) holding the polling-fast
+    // "task X completed?" flag. 0 = pending, 1 = completed. Indexed by
+    // local_id & task_window_mask. Writer: the task's completer at
+    // on_mixed_task_complete; Resetter: orchestrator in prepare_task for the
+    // newly-allocated slot. Reader: thread-0 fanin polling. Replaces a chain
+    // of 128B-aligned slot_state pointer derefs with byte reads into a single
+    // array — typically condenses 16 fanin checks into 1-2 cache lines.
+    std::atomic<uint8_t> *completion_flags;
+
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot)
+    {
+        return task_descriptors[slot];
+    }
 
-    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) {
-        return task_descriptors[get_slot_by_task_id(local_id)];
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id)
+    {
+        return task_descriptors[local_id & task_window_mask];
     }
 
-    PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; }
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot)
+    {
+        return task_payloads[slot];
+    }
 
-    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; }
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id)
+    {
+        return task_payloads[local_id & task_window_mask];
+    }
 
-    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; }
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot)
+    {
+        return slot_states[slot];
+    }
 
-    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) {
-        return slot_states[get_slot_by_task_id(local_id)];
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id)
+    {
+        return slot_states[local_id & task_window_mask];
     }
 };
 
-/**
- * Shared memory header structure
- *
- * Contains per-ring flow control and global layout information.
- */
-struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
+struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader
+{
     // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) ===
     PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
 
@@ -162,20 +140,10 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
     std::atomic<int32_t> sched_stall_core;         // S1: stuck core id (-1 if N/A)
 };
 
-static_assert(
-    (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096),
-    "PTO2SharedMemoryHeader should be reasonably sized"
-);
-
-// =============================================================================
-// Shared Memory Handle
-// =============================================================================
+static_assert((sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), "PTO2SharedMemoryHeader should be reasonably sized");
 
-/**
- * Handle for shared memory lifecycle management (create/destroy).
- * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly.
- */
-struct PTO2SharedMemoryHandle {
+struct PTO2SharedMemoryHandle
+{
     void *sm_base;     // Base address of shared memory
     uint64_t sm_size;  // Total size of shared memory
 
@@ -186,135 +154,236 @@ struct PTO2SharedMemoryHandle {
 
     // === Static helpers ===
 
-    static uint64_t calculate_size(uint64_t task_window_size);
-    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+    static uint64_t calculate_size(uint64_t task_window_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+        return calculate_size_per_ring(task_window_sizes);
+    }
+    // Total bytes needed for the header plus every ring's four segments, each
+    // one padded with PTO2_ALIGN_UP to PTO2_ALIGN_SIZE.
+    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        uint64_t size = 0;
+        // Header (padded to PTO2_ALIGN_SIZE)
+        size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+        // Per ring: task descriptors, payloads, slot states, completion flags
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
+        }
+        return size;
+    }
 
-    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
-    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
-    // arena is otherwise empty (the call performs the single commit). All
-    // memory is owned by the arena — caller must not call destroy().
-    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);
+    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena)
+    {
+        const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
+        const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+        const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
+        if (arena.commit() == nullptr) return nullptr;
+
+        auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
+        memset(handle, 0, sizeof(*handle));
+        void *buffer = arena.region_ptr(off_buffer);
+        memset(buffer, 0, static_cast<size_t>(buffer_size));
+        if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
+        return handle;
+    }
 
     // === Instance methods ===
 
-    // In-place init for caller-provided wrapper storage (e.g. a region carved
-    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
-    // init_header. Returns false when `sm_size` is too small for the requested
-    // `task_window_size`.
-    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);
+    bool init(void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size)
+    {
+        if (!sm_base_arg || sm_size_arg == 0) return false;
+        if (sm_size_arg < calculate_size(task_window_size)) return false;
+
+        sm_base = sm_base_arg;
+        sm_size = sm_size_arg;
+        is_owner = false;
+        setup_pointers(task_window_size);
+        init_header(task_window_size, heap_size);
+        return true;
+    }
+
+    // Per-ring init for caller-provided storage. Lays out and initializes each
+    // ring with its own task_window_sizes[r] / heap_sizes[r], the same sizes
+    // calculate_size_per_ring() checks sm_size_arg against. False on bad args.
     bool init_per_ring(
-        void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
         const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-    );
+    )
+    {
+        if (!sm_base_arg || sm_size_arg == 0) return false;
+        if (sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false;
+        // Pointer layout must use the same per-ring sizes as the size check and the header init.
+        sm_base = sm_base_arg;
+        sm_size = sm_size_arg;
+        is_owner = false;
+        setup_pointers_per_ring(task_window_sizes);
+        init_header_per_ring(task_window_sizes, heap_sizes);
+        return true;
+    }
+
+    void destroy()
+    {
+        // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
+        // calling destroy on them is a no-op so existing callers stay safe.
+        if (is_owner && sm_base)
+        {
+            free(sm_base);
+            free(this);
+        }
+    }
+    void print_layout()
+    {
+        if (!header) return;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {}
+    }
+    bool validate()
+    {
+        if (!sm_base) return false;
+        if (!header) return false;
 
-    void destroy();
-    void print_layout();
-    bool validate();
+        PTO2SharedMemoryHeader *h = header;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            if (!h->rings[r].fc.validate(this, r)) return false;
+
+        return true;
+    }
 
 private:
-    void init_header(uint64_t task_window_size, uint64_t heap_size);
-    void init_header_per_ring(
-        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-    );
-    void setup_pointers(uint64_t task_window_size);
-    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+    void init_header(uint64_t task_window_size, uint64_t heap_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            task_window_sizes[r] = task_window_size;
+            heap_sizes[r] = heap_size;
+        }
+        init_header_per_ring(task_window_sizes, heap_sizes);
+    }
+    // Initializes the header in place: per-ring flow control and watermark, per-ring layout
+    // metadata, global output/error fields, and every slot state. Ring pointers must already be set.
+    void init_header_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        // Per-ring flow control (start at 0)
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            header->rings[r].fc.init();
+            // -1 = "no task completed yet"; first task to complete (local_id 0)
+            // will advance the watermark to 0.
+            header->rings[r].completed_watermark.store(-1, std::memory_order_relaxed);
+        }
+        header->orchestrator_done.store(0, std::memory_order_relaxed);
+        // Per-ring layout info. The offset walk must cover all four per-ring segments, in the
+        // order setup_pointers_per_ring lays them out, or offsets for rings >= 1 come out short.
+        uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            header->rings[r].task_window_size = task_window_sizes[r];
+            header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
+            header->rings[r].heap_size = heap_sizes[r];
+            header->rings[r].task_descriptors_offset = offset;
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
+        }
+        header->total_size = sm_size;
+        header->graph_output_ptr.store(0, std::memory_order_relaxed);
+        header->graph_output_size.store(0, std::memory_order_relaxed);
+        // Error reporting
+        header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        header->sched_error_bitmap.store(0, std::memory_order_relaxed);
+        header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        header->sched_error_thread.store(-1, std::memory_order_relaxed);
+        // Bind each slot to its ring and reset its dynamic fields and active mask.
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto &ring = header->rings[r];
+            for (uint64_t i = 0; i < task_window_sizes[r]; i++)
+            {
+                ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+                ring.slot_states[i].reset_for_reuse();
+                ring.slot_states[i].active_mask = ActiveMask{};
+            }
+        }
+    }
+    void setup_pointers(uint64_t task_window_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+        setup_pointers_per_ring(task_window_sizes);
+    }
+    // Carves sm_base into the header followed by each ring's descriptors, payloads, slot
+    // states and completion flags, recording each segment start in header->rings[r].
+    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        char *ptr = static_cast<char *>(sm_base);
+        // Header
+        header = reinterpret_cast<PTO2SharedMemoryHeader *>(ptr);
+        ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+        // Per-ring task descriptors, payloads, slot states, and completion flags
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto &ring = header->rings[r];
+            ring.task_descriptors = reinterpret_cast<PTO2TaskDescriptor *>(ptr);
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+
+            ring.task_payloads = reinterpret_cast<PTO2TaskPayload *>(ptr);
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+
+            ring.slot_states = reinterpret_cast<PTO2TaskSlotState *>(ptr);
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+
+            ring.completion_flags = reinterpret_cast<std::atomic<uint8_t> *>(ptr);
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
+        }
+    }
 };
 
-// =============================================================================
-// SM Device Layout Helpers
-// =============================================================================
-//
-// When the host pre-builds a runtime-arena image, it needs the device-side
-// addresses of several SM sub-fields (ring flow-control counters,
-// task_descriptors arrays, orch_error_code) so it can wire them into the
-// orchestrator / scheduler init_data path without dereferencing the SM —
-// the SM lives in device memory and cannot be touched from host.
-//
-// These helpers compute those addresses by offset arithmetic on the SM
-// device base. Pure pointer math, no loads/stores; safe to call from host.
-// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
-// own setup_pointers), so values are guaranteed consistent across sides.
 namespace pto2_sm_layout {
 
-inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
-    );
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code));
 }
 
-inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
-        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
-        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
-    );
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader));
 }
 
-inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
-        offsetof(PTO2RingFlowControl, current_task_index)
-    );
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + offsetof(PTO2RingFlowControl, current_task_index));
 }
 
-inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
-        offsetof(PTO2RingFlowControl, last_task_alive)
-    );
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + offsetof(PTO2RingFlowControl, last_task_alive));
 }
 
-// Byte offsets (from the SM base) of one ring's three segments. The per-ring
-// layout is: header, then for each ring descriptors -> payloads -> slot_states,
-// every segment PTO2_ALIGN_UP-padded.
-struct PTO2RingSegmentOffsets {
-    uint64_t descriptors;
-    uint64_t payloads;
-    uint64_t slot_states;
-    uint64_t end;  // offset just past this ring's slot_states (next ring's start; total SM size for the last ring)
-};
-
-// Single source of truth for the per-ring SM layout. Returns offsets (not
-// pointers), so it serves BOTH the host-side pointer setup
-// (`setup_pointers_per_ring`, which adds `sm_base`) and the device-address
-// helpers below (which add `sm_dev_base`). Adding or reordering a per-ring
-// segment is a one-line edit here; every consumer follows automatically, so the
-// layout walk can never silently disagree across call sites.
-inline PTO2RingSegmentOffsets
-ring_segment_offsets(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id) noexcept {
+// Device address of ring `ring_id`'s task_descriptors array: pure offset arithmetic on sm_dev_base.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id) noexcept
+{
     assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
-    uint64_t off = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-    for (int r = 0; r < ring_id; r++) {
-        off += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        off += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        off += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    char *p = static_cast<char *>(sm_dev_base) + PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < ring_id; r++)
+    {
+        // Skip ring r's descriptors + payloads, then slot states + completion flags (same walk as setup_pointers_per_ring).
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE) + PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE) + PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
     }
-    PTO2RingSegmentOffsets o{};
-    o.descriptors = off;
-    off += PTO2_ALIGN_UP(task_window_sizes[ring_id] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-    o.payloads = off;
-    off += PTO2_ALIGN_UP(task_window_sizes[ring_id] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-    o.slot_states = off;
-    off += PTO2_ALIGN_UP(task_window_sizes[ring_id] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    o.end = off;
-    return o;
-}
-
-// Device address of ring `ring_id`'s task_descriptors array.
-inline PTO2TaskDescriptor *ring_task_descriptors_addr(
-    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
-) noexcept {
-    return reinterpret_cast<PTO2TaskDescriptor *>(
-        static_cast<char *>(sm_dev_base) + ring_segment_offsets(task_window_sizes, ring_id).descriptors
-    );
-}
-
-// Device address of ring `ring_id`'s slot_states array (used by the allocator's
-// deadlock detector to inspect the head task's state/fanout).
-inline PTO2TaskSlotState *
-ring_slot_states_addr(void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id) noexcept {
-    return reinterpret_cast<PTO2TaskSlotState *>(
-        static_cast<char *>(sm_dev_base) + ring_segment_offsets(task_window_sizes, ring_id).slot_states
-    );
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
 }
 
 }  // namespace pto2_sm_layout
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
index 21c77fce2..f70af0a23 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
@@ -9,36 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Submit Types - Shared submit-contract definitions
- *
- * Header-only definitions shared by orchestration-facing and runtime-facing
- * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h).
- */
-
 #pragma once
 
 #include <stdint.h>
 
 inline constexpr int32_t INVALID_KERNEL_ID = -1;
 
-/**
- * Subtask slot count: AIC, AIV0, AIV1
- */
 inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
 
-/**
- * Subtask slot indices
- */
-enum class PTO2SubtaskSlot : uint8_t {
+enum class PTO2SubtaskSlot : uint8_t
+{
     AIC = 0,
     AIV0 = 1,
     AIV1 = 2,
 };
 
-/**
- * Subtask mask bits (for ActiveMask)
- */
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);         // 0x1
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);        // 0x2
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);        // 0x4
@@ -57,36 +42,46 @@ inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3);  // 0x8: all
  * with an empty core_mask route to a dedicated DUMMY ready queue and are
  * completed inline by the scheduler dispatch loop, bypassing core allocation.
  */
-enum class PTO2ResourceShape : uint8_t {
+enum class PTO2ResourceShape : uint8_t
+{
     AIC = 0,    // Single AIC
     AIV = 1,    // Single AIV
     MIX = 2,    // Full cluster (dispatch uses active_mask)
     DUMMY = 3,  // Dependency-only (no AICore dispatch)
 };
 
-// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not
-// allocate a per-shape ready_queue entry / local buffer — it lives in a
-// dedicated queue inside PTO2SchedulerState.
 inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
 
-/**
- * Bitmask of active subtask slots + flags, sizeof == 1.
- */
-class ActiveMask {
+class ActiveMask
+{
 public:
     constexpr ActiveMask() = default;
     constexpr explicit ActiveMask(uint8_t raw) :
-        raw_(raw) {}
+        raw_(raw)
+    {}
 
-    uint8_t raw() const { return raw_; }
+    uint8_t raw() const
+    {
+        return raw_;
+    }
 
-    bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0; }
+    bool subtask_active(PTO2SubtaskSlot slot) const
+    {
+        return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0;
+    }
 
-    uint8_t core_mask() const { return raw_ & 0x07u; }
+    uint8_t core_mask() const
+    {
+        return raw_ & 0x07u;
+    }
 
-    bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; }
+    bool requires_sync_start() const
+    {
+        return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0;
+    }
 
-    PTO2ResourceShape to_shape() const {
+    PTO2ResourceShape to_shape() const
+    {
         uint8_t cmask = core_mask();
         if (cmask == 0) return PTO2ResourceShape::DUMMY;
         int bit_count = __builtin_popcount(cmask);
@@ -95,22 +90,44 @@ class ActiveMask {
         return PTO2ResourceShape::AIV;
     }
 
-    void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; }
+    void set_sync_start()  // Sets the PTO2_SUBTASK_FLAG_SYNC_START bit; every other bit of raw_ is left as is.
+    {
+        raw_ |= PTO2_SUBTASK_FLAG_SYNC_START;
+    }
 
-    bool operator==(ActiveMask other) const { return raw_ == other.raw_; }
-    bool operator!=(ActiveMask other) const { return raw_ != other.raw_; }
+    bool operator==(ActiveMask other) const  // Equal when the raw bytes match exactly, flag bits included.
+    {
+        return raw_ == other.raw_;
+    }
+    bool operator!=(ActiveMask other) const  // Negation of operator==: true when any bit of the raw bytes differs.
+    {
+        return raw_ != other.raw_;
+    }
 
-    ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); }
-    ActiveMask &operator|=(ActiveMask other) {
+    ActiveMask operator|(ActiveMask other) const  // New mask holding the bitwise OR of both raw bytes; neither operand is modified.
+    {
+        return ActiveMask(raw_ | other.raw_);
+    }
+    ActiveMask &operator|=(ActiveMask other)  // ORs other's raw byte into this mask in place.
+    {  // Returns *this so the call can be chained.
         raw_ |= other.raw_;
         return *this;
     }
 
-    ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); }
+    ActiveMask operator&(uint8_t mask) const  // ANDs raw_ with a plain byte and wraps the result in a new ActiveMask.
+    {
+        return ActiveMask(raw_ & mask);
+    }
 
-    bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; }
+    bool has_mask(uint8_t mask) const  // True when raw_ and mask share at least one set bit (any-of, not all-of).
+    {
+        return (raw_ & mask) != 0;
+    }
 
-    explicit operator bool() const { return raw_ != 0; }
+    explicit operator bool() const  // True when any bit of raw_ is set, flag bits included; explicit, so no implicit conversions.
+    {
+        return raw_ != 0;
+    }
 
 private:
     uint8_t raw_{0};
@@ -118,18 +135,14 @@ class ActiveMask {
 
 static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte");
 
-/**
- * Mixed-task submit contract.
- *
- * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive).
- * At least one slot must be valid.
- */
-struct MixedKernels {
+struct MixedKernels
+{
     int32_t aic_kernel_id{INVALID_KERNEL_ID};
     int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
     int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
 
-    ActiveMask to_active_mask() const {
+    ActiveMask to_active_mask() const
+    {
         uint8_t mask = 0;
         if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
         if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
@@ -138,22 +151,28 @@ struct MixedKernels {
     }
 };
 
-/**
- * SPMD launch parameters carried inside Arg.
- *
- * Controls how many logical blocks (SPMD dimension) a single task
- * is expanded into at dispatch time.  Each block receives a unique
- * block_idx in [0, block_num) via the per-dispatch LocalContext.
- */
-class PTO2LaunchSpec {
+class PTO2LaunchSpec
+{
 public:
     constexpr PTO2LaunchSpec() = default;
 
-    int16_t block_num() const { return block_num_; }
-    void set_block_num(int16_t n) { block_num_ = n; }
+    int16_t block_num() const  // Current block_num_ value; the member's default initializer is 1.
+    {
+        return block_num_;
+    }
+    void set_block_num(int16_t n)  // Stores n as given; no range check is applied here.
+    {
+        block_num_ = n;
+    }
 
-    bool require_sync_start() const { return require_sync_start_; }
-    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+    bool require_sync_start() const  // Returns the stored require_sync_start_ flag.
+    {
+        return require_sync_start_;
+    }
+    void set_require_sync_start(bool v)  // Overwrites the require_sync_start_ flag with v.
+    {
+        require_sync_start_ = v;
+    }
 
 private:
     int16_t block_num_{1};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 33673b29c..366f05666 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -9,37 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - TensorMap Interface
- *
- * TensorMap provides producer lookup for dependency discovery:
- * - Maps Tensor -> producer task ID
- * - Used by pto_submit_task() to find dependencies
- *
- * Key design features:
- * 1. Ring buffer pool for entries (no malloc/free)
- * 2. Lazy invalidation (entries become stale when producer retires)
- * 3. Per-task per-ring entry tracking for efficient cleanup
- * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions
- *
- * Hash table with chaining:
- * - buckets[] array of head offsets
- * - Entries linked via next_in_bucket
- * - Insert at head (newest first) for sorted chains
- *
- * CRITICAL: Hash only by base_ptr
- * ==============================
- * For overlap detection to work, ALL sub-regions of the same base tensor
- * MUST be in the SAME hash bucket. This allows lookup to compare all
- * potentially overlapping regions.
- *
- * Overlap detection: Two regions create a dependency if:
- *   1. Same base_ptr (raw tensor pointer)
- *   2. Byte ranges [offset, offset+size) intersect
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #pragma once
 
 #include "common.h"
@@ -72,7 +41,8 @@ struct Segment {
  *
  * All offsets are relative to the arena's base.
  */
-struct PTO2TensorMapLayout {
+struct PTO2TensorMapLayout
+{
     size_t off_buckets;
     size_t off_bucket_epochs;
     size_t off_entry_pool;
@@ -124,119 +94,86 @@ extern uint64_t g_insert_count;
  *
  * Entry size: 128B (2 cache lines), matches Tensor.
  */
-struct alignas(64) PTO2TensorMapEntry {
+struct alignas(64) PTO2TensorMapEntry
+{
     // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 ===
-    uint64_t buffer_addr;                // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
-    PTO2TensorMapEntry *next_in_bucket;  // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
-    PTO2TaskId producer_task_id;         // 8B [16,24):  mirrors Tensor::owner_task_id slot
-    uint64_t start_offset;               // 8B [24,32):  mirrors Tensor::start_offset (element offset)
-    int32_t version;                     // 4B [32,36):  mirrors Tensor::version
-    uint32_t ndims;                      // 4B [36,40):  mirrors Tensor::ndims
-    DataType dtype;                      // 1B [40,41):  mirrors Tensor::dtype
-    bool manual_dep;                     // 1B [41,42):  mirrors Tensor::manual_dep
-    bool is_contiguous;                  // 1B [42,43):  mirrors Tensor::is_contiguous
-    uint8_t __padding1__;                // 1B [43,44):  mirrors Tensor padding
-    uint32_t shapes[MAX_TENSOR_DIMS];    // 20B [44,64): mirrors Tensor::shapes
+    uint64_t buffer_addr;                      // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
+    PTO2TensorMapEntry *next_in_bucket;        // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
+    PTO2TaskId producer_task_id;               // 8B [16, 24):  mirrors Tensor::owner_task_id slot
+    uint64_t start_offset;                     // 8B [24, 32):  mirrors Tensor::start_offset (element offset)
+    int32_t version;                           // 4B [32, 36):  mirrors Tensor::version
+    uint32_t ndims;                            // 4B [36, 40):  mirrors Tensor::ndims
+    DataType dtype;                            // 1B [40, 41):  mirrors Tensor::dtype
+    bool manual_dep;                           // 1B [41, 42):  mirrors Tensor::manual_dep
+    bool is_contiguous;                        // 1B [42, 43):  mirrors Tensor::is_contiguous
+    uint8_t __padding1__;                      // 1B [43, 44):  mirrors Tensor padding
+    uint32_t shapes[MAX_TENSOR_DIMS];          // 20B [44, 64): mirrors Tensor::shapes
 
     // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data ===
-    PTO2TensorMapEntry *prev_in_bucket;  // 8B [64, 72)
-    PTO2TensorMapEntry *next_in_task;    // 8B [72, 80)
-    PTO2TensorMapEntry *prev_in_task;    // 8B [80, 88)
-    int32_t bucket_index;                // 4B [88, 92): -1 when unlinked
-    uint32_t __padding2__;               // 4B [92, 96)
-    uint64_t extent_elem_cache;          // 8B [96,104): non-contiguous extent (mirrors Tensor)
-    uint32_t strides[MAX_TENSOR_DIMS];   // 20B [104,124): element strides, mirrors Tensor::strides
-    uint8_t __padding3__[4];             // 4B [124,128)
-
-    /**
-     * Copy overlap-relevant fields from a Tensor into this entry.
-     *
-     * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)),
-     * producer_task_id, start_offset, version, ndims, dtype, manual_dep,
-     * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in
-     * the source and gets written into next_in_bucket; that's harmless
-     * because link_entry() overwrites next_in_bucket immediately after.
-     *
-     * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when
-     * the source is canonically contiguous (is_contiguous && start_offset==0),
-     * so the producer Tensor's cache line 2 stays cold during insert. Only
-     * non-contiguous producers pay one extra line 2 read.
-     */
-    void copy_from_tensor(const Tensor &tensor) {
+    PTO2TensorMapEntry *prev_in_bucket;         // 8B [64, 72)
+    PTO2TensorMapEntry *next_in_task;           // 8B [72, 80)
+    PTO2TensorMapEntry *prev_in_task;           // 8B [80, 88)
+    int32_t bucket_index;                       // 4B [88, 92): -1 when unlinked
+    uint32_t __padding2__;                      // 4B [92, 96)
+    uint64_t extent_elem_cache;                 // 8B [96, 104): non-contiguous extent (mirrors Tensor)
+    uint32_t strides[MAX_TENSOR_DIMS];          // 20B [104, 124): element strides, mirrors Tensor::strides
+    uint8_t __padding3__[4];                    // 4B [124, 128)
+
+    void copy_from_tensor(const Tensor &tensor)  // Fills this entry's overlap-relevant fields from a producer Tensor.
+    {  // The 64-byte memcpy also writes bytes [8,16) into next_in_bucket; insert() calls link_entry() right after, which reassigns it.
         memcpy(this, &tensor, 64);
-        if (tensor.is_contiguous && tensor.start_offset == 0) {
+        if (tensor.is_contiguous && tensor.start_offset == 0)  // Contiguous view at offset 0: derive line-2 data from shapes alone.
+        {
             uint64_t numel = 1;
-            for (uint32_t i = 0; i < tensor.ndims; i++)
-                numel *= tensor.shapes[i];
+            for (uint32_t i = 0; i < tensor.ndims; i++) numel *= tensor.shapes[i];  // extent = product of all shapes
             extent_elem_cache = numel;
             uint32_t s = 1;
-            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--) {
+            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--)  // row-major strides, innermost dimension first
+            {
                 strides[i] = s;
                 s *= tensor.shapes[i];
             }
-        } else {
+        }
+        else  // Any other view: take the extent and strides stored in the tensor itself.
+        {
             extent_elem_cache = tensor.extent_elem_cache;
-            for (uint32_t i = 0; i < tensor.ndims; i++) {
-                strides[i] = tensor.strides[i];
-            }
+            for (uint32_t i = 0; i < tensor.ndims; i++) strides[i] = tensor.strides[i];  // only the first ndims strides are copied
         }
     }
 
-    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) {
+    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr)  // Fills this entry from a create-info record plus the buffer address.
+    {  // The first 64 bytes are copied verbatim, then buffer_addr is overwritten with addr.
         memcpy(this, &tensor_create_info, 64);
         buffer_addr = addr;
         // Create-info outputs are always contiguous with start_offset = 0;
         // extent_elem = prod(shapes); stride is row-major.
         uint64_t numel = 1;
-        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) {
-            numel *= tensor_create_info.shapes[i];
-        }
+        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) numel *= tensor_create_info.shapes[i];  // extent = product of all shapes
         extent_elem_cache = numel;
         uint32_t s = 1;
-        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--) {
+        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--)  // innermost dimension gets stride 1
+        {
             strides[i] = s;
             s *= tensor_create_info.shapes[i];
         }
     }
 
-    /**
-     * Effective element extent of this entry.
-     * Contiguous-aligned views compute it from shapes alone (line 1 hit only);
-     * non-contiguous views read the cached value from line 2.
-     */
-    uint64_t effective_extent_elem() const {
-        if (is_contiguous) {
+    uint64_t effective_extent_elem() const  // Element extent of this entry, as used by check_overlap() for the range end.
+    {
+        if (is_contiguous)  // Contiguous entries recompute the extent from shapes and do not read extent_elem_cache.
+        {
             uint64_t n = 1;
-            for (uint32_t i = 0; i < ndims; i++)
-                n *= shapes[i];
+            for (uint32_t i = 0; i < ndims; i++) n *= shapes[i];  // product of the first ndims shapes
             return n;
         }
         return extent_elem_cache;
     }
 
-    /**
-     * Check overlap between input tensor and this entry (the producer output).
-     *
-     * Three-level cascade:
-     *   L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP.
-     *   L2 — O(ndims) hyper-rectangle precise check, eligible only when both
-     *        sides share the same canonical row-major axis layout (same
-     *        dtype/ndims/strides[], stride descends as integer multiples,
-     *        start_offset decomposes cleanly under the reference shape).
-     *        Yields NO_OVERLAP / COVERED / OTHER per-dim.
-     *   L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice
-     *        with step, etc): conservative OTHER. Exact enumeration via
-     *        contiguous-segment merge is scheduled for a follow-up.
-     *
-     * COVERED is returned when `input` completely contains `entry` per-dim
-     * — dep_compute uses this to retire the now-redundant entry.
-     */
-    OverlapStatus check_overlap(const Tensor &input) const {
+    OverlapStatus check_overlap(const Tensor &input) const
+    {
         debug_assert(input.buffer.addr == buffer_addr);
         debug_assert(input.version >= version);
-        if (input.version > version) {
-            return OverlapStatus::OTHER;
-        }
+        if (input.version > version) return OverlapStatus::OTHER;
 
         // -------- L1: byte-range intersection (O(1) fast reject) --------
         const uint64_t in_begin = input.start_offset;
@@ -245,27 +182,15 @@ struct alignas(64) PTO2TensorMapEntry {
         const uint64_t ent_end = start_offset + effective_extent_elem();
         Segment in_range_bytes{in_begin, in_end};
         Segment ent_range_bytes{ent_begin, ent_end};
-        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) {
-            return OverlapStatus::NO_OVERLAP;
-        }
+        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) return OverlapStatus::NO_OVERLAP;
 
         // -------- L2 prereqs: same axis layout? --------
-        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) {
-            return OverlapStatus::OTHER;
-        }
-        for (uint32_t i = 0; i < ndims; i++) {
+        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) return OverlapStatus::OTHER;
+        for (uint32_t i = 0; i < ndims; i++)
             if (input.strides[i] != strides[i]) return OverlapStatus::OTHER;
-        }
-        // strides[ndims-1] must be 1 and strides[i-1] must be an integer
-        // multiple of strides[i] for the row-major reference-shape derivation
-        // below to hold. This rejects slice-with-step (strides[d] != prev factor)
-        // and any view chain that scrambles the axis order. (strides is
-        // uint32_t with the > 0 invariant enforced at construction, so no
-        // sign check needed.)
         if (strides[ndims - 1] != 1) return OverlapStatus::OTHER;
-        for (uint32_t i = 1; i < ndims; i++) {
+        for (uint32_t i = 1; i < ndims; i++)
             if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER;
-        }
 
         // Derive reference shape A from stride. By construction stride is
         // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So
@@ -303,7 +228,8 @@ struct alignas(64) PTO2TensorMapEntry {
         uint32_t ent_offsets[MAX_TENSOR_DIMS] = {};
         uint64_t in_remain = input.start_offset;
         uint64_t ent_remain = start_offset;
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             const uint32_t s = strides[i];
             in_offsets[i] = static_cast<uint32_t>(in_remain / s);
             ent_offsets[i] = static_cast<uint32_t>(ent_remain / s);
@@ -314,22 +240,20 @@ struct alignas(64) PTO2TensorMapEntry {
 
         // Validate that each side fits within ref_shapes (defense in depth —
         // a well-formed view always satisfies this).
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             if (static_cast<uint64_t>(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
             if (static_cast<uint64_t>(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
         }
 
         // -------- L2 core: per-dim line-segment intersection --------
         bool input_contains_entry = true;
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             Segment in_seg{in_offsets[i], static_cast<uint64_t>(in_offsets[i]) + input.shapes[i]};
             Segment ent_seg{ent_offsets[i], static_cast<uint64_t>(ent_offsets[i]) + shapes[i]};
-            if (!in_seg.line_segment_intersection(ent_seg)) {
-                return OverlapStatus::NO_OVERLAP;
-            }
-            if (!in_seg.contains(ent_seg)) {
-                input_contains_entry = false;
-            }
+            if (!in_seg.line_segment_intersection(ent_seg)) return OverlapStatus::NO_OVERLAP;
+            if (!in_seg.contains(ent_seg)) input_contains_entry = false;
         }
         return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER;
     }
@@ -345,20 +269,10 @@ static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype));
 static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep));
 static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous));
 static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes));
-static_assert(
-    offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"
-);
+static_assert(offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)");
 
-// =============================================================================
-// TensorMap Lookup Chain Length Statistics (compile-time toggle)
-// =============================================================================
-
-/**
- * TensorMap structure
- *
- * Hash table with ring buffer entry pool and lazy invalidation.
- */
-struct PTO2TensorMap {
+struct PTO2TensorMap
+{
     // Hash table buckets (fixed size, power of 2)
     PTO2TensorMapEntry **buckets;  // Array of offsets into entry_pool (-1 = empty)
     uint32_t *bucket_epochs;
@@ -384,42 +298,25 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
+    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const  // Maps a ring-local task id to its slot index for that ring.
+    {  // Masks with (window size - 1); presumably each window size is a power of two — TODO confirm.
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
 
-    // Accessors read by scope_stats_collector. Declared unconditionally so the
-    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
-    // setter symbols must export for host dlsym; the probe call sites that use
-    // these accessors stay gated by PTO2_PROFILING).
-    int32_t current_used() const { return next_entry_idx - free_num; }
-    int32_t pool_capacity() const { return pool_size; }
-    int32_t free_entries() const { return pool_size - current_used(); }
-
-    // Reclaim retired entries across every ring, advancing each ring's cleanup
-    // cursor (last_cleanup[r]) to the supplied watermark. Returns the summed
-    // last_task_alive across rings — the monotone progress signal the
-    // orchestrator's exhaustion back-pressure loop watches to tell a transient
-    // shortage (some ring still retiring tasks) from a wedged pool (no ring
-    // advancing). Idempotent per watermark: a ring whose alive has not passed
-    // last_cleanup[r] is skipped, so it never double-frees.
-    int64_t reclaim_retired_all(const int32_t sm_last_task_alive[PTO2_MAX_RING_DEPTH]) {
-        int64_t alive_sum = 0;
-        for (int32_t r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            int32_t alive = sm_last_task_alive[r];
-            sync_validity(r, alive);
-            if (alive > last_cleanup[r]) {
-                cleanup_retired(r, last_cleanup[r], alive);
-                last_cleanup[r] = alive;
-            }
-            alive_sum += alive;
-        }
-        return alive_sum;
+    int32_t current_used() const  // next_entry_idx minus free_num, where free_num is the current length of free_entry_list.
+    {
+        return next_entry_idx - free_num;
+    }
+    int32_t pool_capacity() const  // Total number of entries in the pool (pool_size).
+    {
+        return pool_size;
     }
 
     // new_entry only allocates memory, does not assign attributes
-    PTO2TensorMapEntry *new_entry() {
-        if (free_num > 0) {
+    PTO2TensorMapEntry *new_entry()
+    {
+        if (free_num > 0)
+        {
             PTO2TensorMapEntry *res = free_entry_list[--free_num];
             debug_assert(res->bucket_index == -1);
             return res;
@@ -429,22 +326,24 @@ struct PTO2TensorMap {
         return res;
     }
 
-    void free_entry(PTO2TensorMapEntry &entry) {
+    void free_entry(PTO2TensorMapEntry &entry)
+    {
         always_assert(entry.bucket_index != -1);  // must still be in a bucket
 
         // Update predecessor's next pointer (O(1) via prev_in_bucket)
-        if (entry.prev_in_bucket == nullptr) {
+        if (entry.prev_in_bucket == nullptr)
+        {
             // Entry is the head of its bucket chain, update bucket head
             // Must compute hash BEFORE clearing tensor
             buckets[entry.bucket_index] = entry.next_in_bucket;
-        } else {
+        }
+        else
+        {
             entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket;
         }
 
         // Update successor's prev pointer
-        if (entry.next_in_bucket != nullptr) {
-            entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
-        }
+        if (entry.next_in_bucket != nullptr) entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
 
         free_entry_list[free_num++] = &entry;
         entry.bucket_index = -1;
@@ -454,171 +353,150 @@ struct PTO2TensorMap {
         entry.prev_in_task = nullptr;
     }
 
-    // =============================================================================
-    // TensorMap API
-    // =============================================================================
-
-    /**
-     * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring
-     * task_entry_heads) on the supplied arena. Records the resulting offsets in
-     * the returned layout descriptor. Must be called before the arena is
-     * committed.
-     */
-    static PTO2TensorMapLayout reserve_layout(
-        DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-    );
-
-    /**
-     * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS,
-     * PTO2_TENSORMAP_POOL_SIZE).
-     */
-    static PTO2TensorMapLayout
-    reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
-
-    /**
-     * Phase 3a: write everything *except* arena-internal pointer fields
-     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
-     * Uses arena.region_ptr to address the arena regions for data writes,
-     * but does not store those addresses in struct fields. Safe to call on
-     * a host arena that holds the prebuilt image.
-     */
-    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
-    void reset_for_reuse(const PTO2TensorMapLayout &layout);
-
-    /**
-     * Phase 3b: write the arena-internal pointer fields. Idempotent;
-     * called once on the host arena and once on the AICPU after attach.
-     */
-    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
-
-    /**
-     * Tear down state. Does not free memory — the arena owns the backing
-     * buffer. Pointers are set to nullptr so accidental reuse traps.
-     */
-    void destroy();
-
-    /**
-     * Update validity threshold from shared memory
-     * Called periodically to refresh the lazy invalidation threshold.
-     *
-     * @param last_task_alive  Current value from shared memory
-     */
-    void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; }
-
-    /**
-     * Lookup producer for a tensor region
-     *
-     * Searches the hash table for matching regions and invokes the callback
-     * for each overlapping valid entry.
-     * Stale entries from different rings are skipped (not truncated).
-     *
-     * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should
-     * return true to continue iteration, false to stop early. It is safe for
-     * the callback to call remove_entry() on the current entry: next_in_bucket
-     * is latched before invocation.
-     *
-     * @param tensor    Tensor to look up
-     * @param on_match  Callback invoked for each overlapping entry
-     */
+    static PTO2TensorMapLayout reserve_layout(DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {  // Reserves the bucket array, entry pool, free list and one task-head array per ring on `arena`; returns the sizes and offsets.
+        // num_buckets must be a positive power of two: hash() passes it to __builtin_ctz, which is undefined for 0, and 0 passes a bare (n & (n - 1)) test.
+        always_assert(new_num_buckets > 0 && (new_num_buckets & (new_num_buckets - 1)) == 0);
+        // NOTE(review): layout.off_bucket_epochs keeps its zero-initialized value here although lookup() reads bucket_epochs[] — confirm where that region is reserved.
+        PTO2TensorMapLayout layout{};
+        layout.num_buckets = new_num_buckets;
+        layout.pool_size = new_pool_size;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.task_window_sizes[r] = new_task_window_sizes[r];
+        // Each reservation below is sized as element count times element size and aligned to the element type.
+        layout.off_buckets = arena.reserve(static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        layout.off_entry_pool = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
+        layout.off_free_entry_list = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.off_task_entry_heads[r] = arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        return layout;
+    }
+
+    static PTO2TensorMapLayout reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {  // Same as reserve_layout() with PTO2_TENSORMAP_NUM_BUCKETS buckets and a PTO2_TENSORMAP_POOL_SIZE entry pool.
+        return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
+    }
+
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena)
+    {  // Writes initial contents into the arena regions named by `layout` and resets the scalar bookkeeping; always returns true.
+        num_buckets = layout.num_buckets;
+        pool_size = layout.pool_size;
+        // NOTE(review): current_epoch, bucket_epochs[] and task_entry_head_epochs[][] are not touched in this function, yet lookup()/cleanup_retired() read them — confirm they are initialized elsewhere.
+        // Address arena regions for data writes; do not store these in struct
+        // fields (wire_arena_pointers does that).
+        auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+        auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+
+        // buckets[]: empty == nullptr.
+        for (int32_t i = 0; i < num_buckets; i++) buckets_arena[i] = nullptr;
+        // Entry pool: zero every byte, then mark each entry as unlinked (bucket_index == -1, all chain pointers null).
+        memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+        for (int32_t i = 0; i < pool_size; i++)
+        {
+            entry_pool_arena[i].bucket_index = -1;  // new_entry() and free_entry() both assert on this marker
+            entry_pool_arena[i].next_in_bucket = nullptr;
+            entry_pool_arena[i].prev_in_bucket = nullptr;
+            entry_pool_arena[i].next_in_task = nullptr;
+            entry_pool_arena[i].prev_in_task = nullptr;
+            entry_pool_arena[i].producer_task_id = PTO2TaskId{};
+        }
+
+        // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+        // only after entries are freed back, so the body of the array stays as 0.
+        memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+        // Nothing handed out and nothing on the free list yet, so current_used() starts at 0.
+        next_entry_idx = 0;
+        free_num = 0;
+        // Per ring: clear every task-head slot, copy the window size, and reset the validity/cleanup cursors.
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+            for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) heads_arena[i] = nullptr;
+            task_window_sizes[r] = layout.task_window_sizes[r];
+            last_task_alives[r] = 0;
+            last_cleanup[r] = 0;
+        }
+
+        return true;
+    }
+
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena)  // Points the arena-backed pointer fields at their regions inside `arena`.
+    {  // NOTE(review): bucket_epochs and task_entry_head_epochs are not assigned here although lookup()/cleanup_retired() index them — confirm where they are wired.
+        buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+        free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+
+    void destroy()  // Drops the arena-backed pointers; no memory is released by this function.
+    {  // Only pointer fields are nulled; sizes and counters keep their values.
+        buckets = nullptr;
+        entry_pool = nullptr;
+        free_entry_list = nullptr;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = nullptr;
+    }
+
+    void sync_validity(int32_t ring_id, int32_t last_task_alive)  // Records the latest last_task_alive value for one ring.
+    {  // ring_id is used as an index without a bounds check.
+        this->last_task_alives[ring_id] = last_task_alive;
+    }
+
     template <typename Fn>
-    void lookup(const Tensor &tensor, Fn &&on_match) {
+    void lookup(const Tensor &tensor, Fn &&on_match)  // Calls on_match(entry, status) for each valid entry with the same buffer address that overlaps `tensor`.
+    {  // on_match returns true to keep iterating and false to stop the whole lookup.
         uint32_t bucket_index = hash(tensor.buffer.addr);
         if (bucket_epochs[bucket_index] != current_epoch) {
             return;
         }
         PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
 
-#if PTO2_TENSORMAP_PROFILING
-        g_lookup_count++;
-        int32_t chain_len = 0;
-#endif
-
-        while (cur_entry != nullptr) {
+        while (cur_entry != nullptr)  // Walk the bucket chain from its head.
+        {  // The successor is read before the callback runs, so iteration resumes from the saved pointer.
             PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
 
-#if PTO2_TENSORMAP_PROFILING
-            chain_len++;
-#endif
-            // Skip stale entries (no chain truncation — entries from different
-            // rings can be interleaved, so a stale entry from one ring does NOT
-            // imply subsequent entries from other rings are also stale)
-            if (!entry_valid(*cur_entry)) {
+            if (!entry_valid(*cur_entry))  // Invalid entries are skipped one at a time; the walk is never cut short.
+            {
                 cur_entry = next_entry;
                 continue;
             }
 
-            // Entry is valid - check if regions OVERLAP (not just exact match)
-            // Since we hash only by base_ptr, all entries in this bucket have
-            // potential to overlap. We must check actual byte-range overlap.
-            if (tensor.buffer.addr == cur_entry->buffer_addr) {
-#if PTO2_TENSORMAP_PROFILING
-                g_lookup_overlap_checks++;
-#endif
+            if (tensor.buffer.addr == cur_entry->buffer_addr)  // Different addresses can share a bucket; only same-address entries are compared.
+            {
                 auto overlap_status = cur_entry->check_overlap(tensor);
-                if (overlap_status != OverlapStatus::NO_OVERLAP) {
-#if PTO2_TENSORMAP_PROFILING
-                    g_lookup_overlap_hits++;
-#endif
-                    if (!on_match(*cur_entry, overlap_status)) {
-#if PTO2_TENSORMAP_PROFILING
-                        g_lookup_chain_total += chain_len;
-                        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
-#endif
-                        return;
-                    }
+                if (overlap_status != OverlapStatus::NO_OVERLAP)  // Every status other than NO_OVERLAP is reported.
+                {
+                    if (!on_match(*cur_entry, overlap_status)) return;  // callback asked to stop
                 }
             }
 
             // Move to next entry
             cur_entry = next_entry;
         }
-#if PTO2_TENSORMAP_PROFILING
-        g_lookup_chain_total += chain_len;
-        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
-#endif
     }
 
-    /**
-     * Insert a new entry (called when task produces output)
-     *
-     * Allocates from ring buffer pool, may overwrite stale entries.
-     * Inserts at head of hash bucket chain (maintains task_id ordering).
-     *
-     * @param tensor            Tensor produced
-     * @param producer_task_id  Task ID of producer
-     */
-    void insert(const Tensor &tensor, PTO2TaskId producer_task_id) {
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id)  // Records `tensor` as an output of producer_task_id.
+    {  // Three steps: obtain an entry, copy the tensor fields into it, then link it under the tensor's buffer address.
         PTO2TensorMapEntry *entry = new_entry();
         entry->copy_from_tensor(tensor);
         link_entry(entry, tensor.buffer.addr, producer_task_id);
     }
 
-    /**
-     * Cleanup stale entries for retired tasks
-     *
-     * Called periodically by Orchestrator when last_task_alive advances.
-     * Removes entries from bucket chains for tasks in [old, new) range.
-     *
-     * @param old_last_task_alive  Previous threshold
-     * @param new_last_task_alive  New threshold
-     */
-    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) {
+    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive)
+    {
         // Iterate through retired tasks on this ring and remove their entries
-        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) {
+        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++)
+        {
             int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
             if (task_entry_head_epochs[ring_id][task_slot] != current_epoch) {
                 continue;
             }
             PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot];
 
-            while (cur_entry != nullptr) {
+            while (cur_entry != nullptr)
+            {
                 PTO2TensorMapEntry *next_entry = cur_entry->next_in_task;  // Save before clearing
                 // Only remove if this entry belongs to the retiring task
                 // (slot may have been reused by a newer task)
-                debug_assert(
-                    cur_entry->producer_task_id ==
-                    PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id))
-                );
+                debug_assert(cur_entry->producer_task_id == PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id)));
                 free_entry(*cur_entry);
                 cur_entry = next_entry;
             }
@@ -628,30 +506,14 @@ struct PTO2TensorMap {
         }
     }
 
-    // =============================================================================
-    // Internal Helpers (exposed for testing)
-    // =============================================================================
-
-    /**
-     * Compute hash for tensor addr
-     *
-     * Multiplicative hash using the golden-ratio constant.  Multiplication
-     * mixes ALL input bits into the high bits of the product, so aligned
-     * addresses (low bits all-zero) still distribute evenly.  We extract
-     * the top log2(num_buckets) bits which carry the most entropy.
-     */
-    uint32_t hash(uint64_t key) {
+    uint32_t hash(uint64_t key)
+    {
         key *= 0x9E3779B97F4A7C15ULL;
         return static_cast<uint32_t>(key >> (64 - __builtin_ctz(num_buckets)));
     }
 
-    /**
-     * Link an initialized entry into bucket and task chains.
-     */
-    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
-#if PTO2_TENSORMAP_PROFILING
-        g_insert_count++;
-#endif
+    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id)
+    {
         uint32_t bucket_index = hash(addr);
         auto ring_id = producer_task_id.ring();
         auto local_id = producer_task_id.local();
@@ -666,9 +528,7 @@ struct PTO2TensorMap {
         }
         entry->bucket_index = bucket_index;
         entry->next_in_bucket = buckets[bucket_index];
-        if (entry->next_in_bucket != nullptr) {
-            entry->next_in_bucket->prev_in_bucket = entry;
-        }
+        if (entry->next_in_bucket != nullptr) entry->next_in_bucket->prev_in_bucket = entry;
         buckets[bucket_index] = entry;
         entry->prev_in_bucket = nullptr;
 
@@ -679,86 +539,68 @@ struct PTO2TensorMap {
         }
         entry->next_in_task = task_entry_heads[ring_id][task_slot];
         entry->prev_in_task = nullptr;
-        if (entry->next_in_task != nullptr) {
-            entry->next_in_task->prev_in_task = entry;
-        }
+        if (entry->next_in_task != nullptr) entry->next_in_task->prev_in_task = entry;
         task_entry_heads[ring_id][task_slot] = entry;
     }
 
-    /**
-     * Check if entry is valid (producer has not retired)
-     */
-    bool entry_valid(const PTO2TensorMapEntry &entry) const {
+    bool entry_valid(const PTO2TensorMapEntry &entry) const
+    {
         return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
     }
 
-    void remove_entry(PTO2TensorMapEntry &entry) {
+    void remove_entry(PTO2TensorMapEntry &entry)
+    {
         remove_from_task(entry);
         free_entry(entry);
     }
 
-    /**
-     * Remove entry from its task chain (O(1) with prev pointer)
-     * Called during pool wrap-around to unlink reused entries.
-     */
-    void remove_from_task(PTO2TensorMapEntry &entry) {
+    void remove_from_task(PTO2TensorMapEntry &entry)
+    {
         always_assert(entry.bucket_index != -1);  // must still be in a bucket
         // Update predecessor's next pointer (O(1) via prev_in_task)
-        if (entry.prev_in_task == nullptr) {
+        if (entry.prev_in_task == nullptr)
+        {
             // Entry is the head of its task chain, update task_entry_heads
             int32_t ring_id = entry.producer_task_id.ring();
             int32_t local_id = static_cast<int32_t>(entry.producer_task_id.local());
             int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
             task_entry_heads[ring_id][task_slot] = entry.next_in_task;
-        } else {
+        }
+        else
+        {
             entry.prev_in_task->next_in_task = entry.next_in_task;
         }
 
         // Update successor's prev pointer
-        if (entry.next_in_task != nullptr) {
-            entry.next_in_task->prev_in_task = entry.prev_in_task;
-        }
+        if (entry.next_in_task != nullptr) entry.next_in_task->prev_in_task = entry.prev_in_task;
 
         entry.next_in_task = nullptr;
         entry.prev_in_task = nullptr;
     }
 
-    // =============================================================================
-    // Debug Utilities
-    // =============================================================================
-
-    /**
-     * Print TensorMap statistics
-     */
-    void print_stats();
-
-    /**
-     * Get count of valid entries
-     */
-    int32_t valid_count();
-
-    // =============================================================================
-    // TensorMap Synchronization
-    // =============================================================================
-
-    /**
-     * Sync TensorMap validity threshold from shared memory
-     *
-     * Called periodically to refresh the lazy invalidation threshold.
-     * Also triggers cleanup if threshold has advanced significantly.
-     */
-    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive);
-};
+    int32_t valid_count()  // Debug helper: counts live entries by scanning the whole entry pool.
+    {
+        int32_t count = 0;  // entries still linked in a bucket whose producer passes entry_valid()
 
-#if PTO2_TENSORMAP_PROFILING
-struct PTO2TensorMapProfilingData {
-    uint64_t lookup_chain_total;
-    uint64_t lookup_count;
-    int32_t lookup_chain_max;
-    uint64_t overlap_checks;
-    uint64_t overlap_hits;
-    uint64_t insert_count;
-};
+        for (int32_t i = 0; i < pool_size; i++)
+            if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) count++;  // bucket_index == -1 is skipped
 
-PTO2TensorMapProfilingData pto2_tensormap_get_profiling();
-#endif
+        return count;
+    }
+
+    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive)
+    {
+        auto ring_id = task_id.ring();
+        auto local_id = task_id.local();
+        sync_validity(ring_id, sm_last_task_alive);
+
+        // Run cleanup when sm_last_task_alive is at least PTO2_TENSORMAP_CLEANUP_INTERVAL past last_cleanup, or when task_id maps
+        // to the same slot as last_cleanup. NOTE(review): in the overlap case the range may be empty (cleanup is a no-op) — confirm intent.
+        auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
+        if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap)
+        {
+            cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
+            last_cleanup[ring_id] = sm_last_task_alive;  // the next cleanup range starts at this threshold
+        }
+    }
+};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index 4b7484bc9..4a73bb5f0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -8,102 +8,8 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Scheduler Implementation
- *
- * Implements scheduler state management, ready queues, and task lifecycle.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_scheduler.h"
-#include <inttypes.h>
-#include <stdlib.h>
-#include "common/unified_log.h"
-
-#if PTO2_PROFILING
-// Weak fallbacks for host/UT builds that don't link the scope_stats collector.
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
-extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
-#endif
-
-// =============================================================================
-// Scheduler Profiling Counters
-// =============================================================================
-
-#if PTO2_SCHED_PROFILING
-#include "common/platform_config.h"
-
-uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {};
-
-PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
-    PTO2SchedProfilingData d;
-    d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0);
-    d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0);
-    d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0);
-    d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0);
-    d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0);
-    d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0);
-    d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0);
-    d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0);
-    d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0);
-    d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0);
-    d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0);
-    d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0);
-    d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0);
-    return d;
-}
-#endif
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2SchedulerState::print_stats() {
-    PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Scheduler Statistics ===");
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (sched->ring_sched_states[r].last_task_alive > 0) {
-            LOG_INFO_V0("Ring %d:", r);
-            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
-            auto &dp = sched->ring_sched_states[r].dep_pool;
-            if (dp.top > 0) {
-                LOG_INFO_V0(
-                    "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
-                    dp.high_water, dp.capacity
-                );
-            }
-        }
-    }
-#if PTO2_SCHED_PROFILING
-    LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
-    LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
-#endif
-    LOG_INFO_V0("============================");
-}
-
-void PTO2SchedulerState::print_queues() {
-    PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Ready Queues ===");
-
-    const char *shape_names[] = {"AIC", "AIV", "MIX"};
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
-    }
-    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
 
-    LOG_INFO_V0("====================");
-}
+// All scheduler logic now lives inline in scheduler/pto_scheduler.h as part of
+// the polling scheduler design. This translation unit is intentionally kept
+// empty to preserve the upstream/main file layout; the polling design does not
+// need a separate .cpp module.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index a37eb0d43..684fcdd07 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -9,106 +9,62 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - Scheduler Interface
- *
- * The Scheduler is responsible for:
- * 1. Maintaining per-resource-shape ready queues
- * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED)
- * 3. Managing fanin/fanout refcounts for dependency resolution
- * 4. Advancing last_task_alive for heap reclamation
- * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete)
- *
- * The Scheduler runs on Device AI_CPU and processes:
- * - Task state transitions based on fanin_refcount
- * - Buffer lifecycle based on fanout_refcount
- * - Ring pointer advancement for flow control
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #pragma once
 
 #include <atomic>
 
 #include "common/core_type.h"
 #include "utils/device_arena.h"
-#include "aicpu/platform_regs.h"  // get_reg_ptr / RegId for the speculative doorbell
 #include "pto_async_wait.h"
 #include "pto_ring_buffer.h"
 #include "pto_runtime2_types.h"
 #include "pto_shared_memory.h"
 
-#include "aicpu/device_time.h"  // get_sys_cnt_aicpu (weak; used by spec doorbell timing too)
-#if PTO2_SCHED_PROFILING
-#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1
-#define PTO2_SCHED_CYCLE_LAP(acc)   \
-    do {                            \
-        _st1 = get_sys_cnt_aicpu(); \
-        acc += (_st1 - _st0);       \
-        _st0 = _st1;                \
-    } while (0)
-#endif
+// Forward declaration so this header can compile under both AICPU and host builds. The actual definition is provided
+// by aicpu/device_time.cpp (AICPU) or a weak stub in pto_runtime2.h (host). Used only for sub-phase profiling.
+// NOTE(review): aicpu/device_time.h also declares this symbol; linkage and attributes here must agree with it — confirm.
+uint64_t get_sys_cnt_aicpu();
 
-// =============================================================================
-// Ready Queue (Lock-free bounded MPMC — Vyukov design)
-// =============================================================================
-
-/**
- * Per-slot entry: sequence counter for ABA safety + task payload
- */
-struct PTO2ReadyQueueSlot {
+struct PTO2ReadyQueueSlot
+{
     std::atomic<int64_t> sequence;
     PTO2TaskSlotState *slot_state;
 };
 
-/**
- * Thread-local ready buffer for local-first dispatch optimization.
- *
- * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1).
- * Initialized once before the scheduling loop; must be empty at
- * the start of each iteration (verified by always_assert).
- *
- * Phase 1 fills per-CoreType buffers via on_task_complete().
- * The dispatch stage drains them local-first via get_ready_tasks_batch,
- * with any remaining tasks pushed to the global ready queue.
- */
 // Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
 static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
 
-struct PTO2LocalReadyBuffer {
+struct PTO2LocalReadyBuffer
+{
     PTO2TaskSlotState **slot_states = nullptr;
     int count = 0;
     int capacity = 0;
 
-    void reset(PTO2TaskSlotState **buf, int cap) {
+    void reset(PTO2TaskSlotState **buf, int cap)
+    {
         slot_states = buf;
         count = 0;
         capacity = cap;
     }
 
-    bool try_push(PTO2TaskSlotState *s) {
-        if (slot_states && count < capacity) {
+    bool try_push(PTO2TaskSlotState *s)
+    {
+        if (slot_states && count < capacity)
+        {
             slot_states[count++] = s;
             return true;
         }
         return false;
     }
 
-    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
+    PTO2TaskSlotState *pop()
+    {
+        return (count > 0) ? slot_states[--count] : nullptr;
+    }
 };
 
-/**
- * Lock-free bounded MPMC queue (Dmitry Vyukov design)
- *
- * Key properties:
- * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
- * - Per-slot sequence counter prevents ABA problem
- * - Empty queue pop returns immediately (single atomic load, no lock)
- * - CAS contention is split: producers only touch enqueue_pos,
- *   consumers only touch dequeue_pos
- */
-struct alignas(64) PTO2ReadyQueue {
+struct alignas(64) PTO2ReadyQueue
+{
     PTO2ReadyQueueSlot *slots;
     uint64_t capacity;
     uint64_t mask;        // capacity - 1
@@ -120,7 +76,8 @@ struct alignas(64) PTO2ReadyQueue {
     std::atomic<uint64_t> dequeue_pos;
     char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
 
-    uint64_t size() {
+    uint64_t size()
+    {
         uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
         uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
         return (e >= d) ? (e - d) : 0;
@@ -128,21 +85,22 @@ struct alignas(64) PTO2ReadyQueue {
 
     void reset_for_reuse() {}
 
-    bool push(PTO2TaskSlotState *slot_state) {
+    bool push(PTO2TaskSlotState *slot_state)
+    {
         uint64_t pos;
         PTO2ReadyQueueSlot *slot;
-        while (true) {
+        while (true)
+        {
             pos = enqueue_pos.load(std::memory_order_relaxed);
             slot = &slots[pos & mask];
             int64_t seq = slot->sequence.load(std::memory_order_acquire);
             int64_t diff = seq - static_cast<int64_t>(pos);
-            if (diff == 0) {
-                if (enqueue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    break;
-                }
-            } else if (diff < 0) {
+            if (diff == 0)
+            {
+                if (enqueue_pos.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+            }
+            else if (diff < 0)
+            {
                 return false;  // Queue full
             }
         }
@@ -154,290 +112,142 @@ struct alignas(64) PTO2ReadyQueue {
 
     // Batch push: reserve count slots with a single CAS after confirming
     // every target slot is available under the usual Vyukov sequence check.
-    void push_batch(PTO2TaskSlotState **items, int count) {
+    void push_batch(PTO2TaskSlotState **items, int count)
+    {
         if (count == 0) return;
 
         uint64_t pos;
-        while (true) {
+        while (true)
+        {
             pos = enqueue_pos.load(std::memory_order_relaxed);
             bool ready = true;
-            for (int i = 0; i < count; i++) {
+            for (int i = 0; i < count; i++)
+            {
                 PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
                 int64_t seq = slot->sequence.load(std::memory_order_acquire);
                 int64_t diff = seq - static_cast<int64_t>(pos + i);
-                if (diff != 0) {
+                if (diff != 0)
+                {
                     ready = false;
                     break;
                 }
             }
-            if (!ready) {
-                continue;
-            }
-            if (enqueue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                break;
-            }
+            if (!ready) continue;
+            if (enqueue_pos.compare_exchange_weak(pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed)) break;
         }
 
-        for (int i = 0; i < count; i++) {
+        for (int i = 0; i < count; i++)
+        {
             PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
             slot->slot_state = items[i];
             slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
         }
     }
 
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos);
-            atomic_ops += 2;  // enqueue_pos.load + sequence.load
-            if (diff == 0) {
-                if (enqueue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    atomic_ops++;  // successful CAS
-                    break;
-                }
-                contended = true;
-                atomic_ops++;  // failed CAS
-            } else if (diff < 0) {
-                return false;  // Queue full
-            } else {
-                contended = true;  // diff > 0: slot not yet released, spin
-            }
-        }
-        atomic_ops++;  // final sequence.store
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-
-        slot->slot_state = slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
-        return true;
-    }
-#endif
-
-    PTO2TaskSlotState *pop() {
+    PTO2TaskSlotState *pop()
+    {
         // Fast-path: skip slot load when queue is clearly empty
         uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
         uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        if (d >= e) {
-            return nullptr;
-        }
+        if (d >= e) return nullptr;
 
         uint64_t pos;
         PTO2ReadyQueueSlot *slot;
-        while (true) {
+        while (true)
+        {
             pos = dequeue_pos.load(std::memory_order_relaxed);
             slot = &slots[pos & mask];
             int64_t seq = slot->sequence.load(std::memory_order_acquire);
             int64_t diff = seq - static_cast<int64_t>(pos + 1);
-            if (diff == 0) {
-                if (dequeue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    ))
-                    break;
-            } else if (diff < 0) {
-                return nullptr;  // Queue empty
+            if (diff == 0)
+            {
+                if (dequeue_pos.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed)) break;
             }
-        }
-
-        PTO2TaskSlotState *result = slot->slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
-        return result;
-    }
-
-#if PTO2_SCHED_PROFILING
-    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
-        // Fast-path: skip slot load when queue is clearly empty
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
-        if (d >= e) {
-            return nullptr;
-        }
-
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos + 1);
-            atomic_ops += 2;  // dequeue_pos.load + sequence.load
-            if (diff == 0) {
-                if (dequeue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    atomic_ops++;  // successful CAS
-                    break;
-                }
-                contended = true;
-                atomic_ops++;  // failed CAS
-            } else if (diff < 0) {
-                atomic_count += atomic_ops;
+            else if (diff < 0)
+            {
                 return nullptr;  // Queue empty
-            } else {
-                contended = true;
             }
         }
-        atomic_ops++;  // final sequence.store
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
 
         PTO2TaskSlotState *result = slot->slot_state;
         slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
         return result;
     }
-#endif
 
     // Batch pop: reserve a contiguous run of ready slots with a single CAS.
     // Returns actual number of items popped (may be less than max_count).
-    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+    int pop_batch(PTO2TaskSlotState **out, int max_count)
+    {
         uint64_t pos;
         int count;
-        while (true) {
+        while (true)
+        {
             pos = dequeue_pos.load(std::memory_order_relaxed);
             count = 0;
-            while (count < max_count) {
+            while (count < max_count)
+            {
                 PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
                 int64_t seq = slot->sequence.load(std::memory_order_acquire);
                 int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
-                if (diff == 0) {
+                if (diff == 0)
+                {
                     count++;
                     continue;
                 }
-                if (diff < 0) {
-                    break;
-                }
+                if (diff < 0) break;
                 count = -1;
                 break;
             }
             if (count == 0) return 0;
             if (count < 0) continue;
-            if (dequeue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                break;
-            }
+            if (dequeue_pos.compare_exchange_weak(pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed)) break;
         }
 
-        for (int i = 0; i < count; i++) {
+        for (int i = 0; i < count; i++)
+        {
             PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
             out[i] = slot->slot_state;
             slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
         }
         return count;
     }
+};
 
-#if PTO2_SCHED_PROFILING
-    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t pos;
-        int count;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            atomic_ops++;  // dequeue_pos.load
-            count = 0;
-            while (count < max_count) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
-                atomic_ops++;  // sequence.load
-                if (diff == 0) {
-                    count++;
-                    continue;
-                }
-                if (diff < 0) {
-                    break;
-                }
-                contended = true;
-                count = -1;
-                break;
-            }
-            if (count == 0) {
-                atomic_count += atomic_ops;
-                return 0;
-            }
-            if (count < 0) {
-                continue;
-            }
-            if (dequeue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                atomic_ops++;  // successful CAS
-                break;
-            }
-            contended = true;
-            atomic_ops++;  // failed CAS
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            out[i] = slot->slot_state;
-            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
-            atomic_ops++;  // sequence.store
-        }
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-        return count;
+inline size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity)
+{
+    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
+}
+inline bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity)
+{
+    // Initializes capacity/mask/positions and every slot's sequence. Slots are indexed with pos & mask, so a capacity
+    // that is zero or not a power of two is rejected (same check as PTO2SpscQueue::init_data_from_layout).
+    if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+    // The slot array is addressed here for writes only; queue->slots itself is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+    for (uint64_t i = 0; i < capacity; i++)
+    {
+        slots_arena[i].sequence.store(static_cast<int64_t>(i), std::memory_order_relaxed);  // sequence == index: push() may claim it
+        slots_arena[i].slot_state = nullptr;
+    }
-#endif
-};
 
-// Cold-path ready queue operations (defined in pto_scheduler.cpp). Declared
-// as non-member so PTO2ReadyQueue stays a POD-like struct with cache-line
-// alignment. Storage is owned by the caller-supplied arena.
-//   reserve_layout: declare the slots[] region on the arena (must precede commit)
-//   init_from_layout: bind slots pointer from arena.region_ptr(off) and
-//                     initialize sequence counters
-//   destroy: forget the slots pointer (arena owns the buffer)
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-// Writes everything *except* the arena-internal `slots` pointer field
-// (sequences/positions on the slot array, capacity, mask). Uses
-// arena.region_ptr(slots_off) only to address the slot array for writes;
-// does NOT store the pointer in `queue->slots`. Call
-// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
-bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+    return true;
+}
 // Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
-void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
-void ready_queue_destroy(PTO2ReadyQueue *queue);
+inline void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off)
+{
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+inline void ready_queue_destroy(PTO2ReadyQueue *queue)
+{
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
 
-// =============================================================================
-// SPSC Queue (Single-Producer Single-Consumer, wait-free)
-// =============================================================================
-//
-// Bounded ring buffer optimized for the wiring queue use case:
-//   - Producer: orchestrator thread (push)
-//   - Consumer: scheduler thread 0 (pop_batch)
-//
-// Design based on Rigtorp's cached-index technique: each side caches
-// the other's index locally, avoiding cross-core cache line bouncing
-// on the hot path. Only when the local cache says "full" or "empty"
-// does the thread issue an acquire load on the remote index.
-//
-// Memory layout: 5 cache-line-aligned fields ensure zero false sharing.
-
-struct alignas(64) PTO2SpscQueue {
+struct alignas(64) PTO2SpscQueue
+{
     // --- Producer cache lines (orchestrator thread) ---
     alignas(64) std::atomic<uint64_t> head_{0};
     alignas(64) uint64_t tail_cached_{0};
@@ -453,26 +263,18 @@ struct alignas(64) PTO2SpscQueue {
     // Padding to exactly 5 cache lines
     char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
 
-    // Reserve the backing buffer region on the supplied arena. Returns the
-    // region offset, to be passed to init_from_layout() after the arena is
-    // committed. Cache-line aligned: the buffer is shared between the
-    // orchestrator (push) and scheduler thread 0 (pop_batch), so its base
-    // must not false-share with neighboring regions.
-    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) {
-        return arena.reserve(capacity * sizeof(uintptr_t), PTO2_ALIGN_SIZE);
+    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity)
+    {
+        return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
     }
 
-    // Writes everything except the arena-internal `buffer_` pointer field
-    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
-    // image without storing a host address in buffer_; the AICPU wires
-    // buffer_ at boot via wire_arena_pointers().
-    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity)
+    {
         if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
         auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
         // calloc'd-equivalent: zero the slot pointers so spurious early pops
         // observe nullptr.
-        for (uint64_t i = 0; i < capacity; i++)
-            buf[i] = nullptr;
+        for (uint64_t i = 0; i < capacity; i++) buf[i] = nullptr;
         mask_ = capacity - 1;
         head_.store(0, std::memory_order_relaxed);
         tail_.store(0, std::memory_order_relaxed);
@@ -483,7 +285,8 @@ struct alignas(64) PTO2SpscQueue {
 
     // Wire the arena-internal pointer. Called by both host (with host arena)
     // and AICPU (with device arena attached to the prebuilt image).
-    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off)
+    {
         buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
     }
 
@@ -495,22 +298,19 @@ struct alignas(64) PTO2SpscQueue {
     }
 
     // Arena owns the buffer; here we only forget our pointer.
-    void destroy() { buffer_ = nullptr; }
-
-    // Push one item (producer only). Returns false if queue is full.
-    // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the
-    // effective usable capacity is capacity-1 (one slot is wasted as a
-    // sentinel to distinguish full from empty). uint64_t wrapping is safe
-    // since head and tail are monotonically increasing and subtraction
-    // wraps correctly.
-    bool push(PTO2TaskSlotState *item) {
+    void destroy()
+    {
+        buffer_ = nullptr;
+    }
+
+    bool push(PTO2TaskSlotState *item)
+    {
         uint64_t h = head_.load(std::memory_order_relaxed);
         uint64_t next_h = h + 1;
-        if (next_h - tail_cached_ > mask_) {
+        if (next_h - tail_cached_ > mask_)
+        {
             tail_cached_ = tail_.load(std::memory_order_acquire);
-            if (next_h - tail_cached_ > mask_) {
-                return false;
-            }
+            if (next_h - tail_cached_ > mask_) return false;
         }
         buffer_[h & mask_] = item;
         head_.store(next_h, std::memory_order_release);
@@ -518,139 +318,98 @@ struct alignas(64) PTO2SpscQueue {
     }
 
     // Pop up to max_count items (consumer only). Returns actual count.
-    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+    int pop_batch(PTO2TaskSlotState **out, int max_count)
+    {
         uint64_t t = tail_.load(std::memory_order_relaxed);
         uint64_t avail = head_cached_ - t;
-        if (avail < static_cast<uint64_t>(max_count)) {
+        if (avail < static_cast<uint64_t>(max_count))
+        {
             head_cached_ = head_.load(std::memory_order_acquire);
             avail = head_cached_ - t;
             if (avail == 0) return 0;
         }
         int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
-        for (int i = 0; i < count; i++) {
-            out[i] = buffer_[(t + i) & mask_];
-        }
+        for (int i = 0; i < count; i++) out[i] = buffer_[(t + i) & mask_];
         tail_.store(t + count, std::memory_order_release);
         return count;
     }
 
     // Approximate size (used for backoff decisions, not exact).
-    uint64_t size() const {
+    uint64_t size() const
+    {
         uint64_t h = head_.load(std::memory_order_acquire);
         uint64_t t = tail_.load(std::memory_order_acquire);
         return h - t;
     }
-
-    // Full ⟺ the producer's next push() would fail: size has reached the
-    // usable capacity (mask_ = capacity - 1, one slot reserved as sentinel).
-    // Used by the wiring-queue deadlock detector to prove the orchestrator is
-    // blocked in push().
-    bool full() const { return size() >= mask_; }
 };
 
 static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
 // =============================================================================
 
-/**
- * Statistics returned by mixed-task completion processing
- */
-struct CompletionStats {
+struct CompletionStats
+{
     int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
     int32_t tasks_enqueued;     // Number of consumers that became READY
     int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
     bool mixed_task_completed;  // True only when this callback completed a mixed task
 };
 
-/**
- * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). Holds
- * the arena offsets of every sub-region the scheduler needs plus the
- * capacities used at layout time (init_from_layout reuses them).
- */
-struct PTO2SchedulerLayout {
+struct PTO2SchedulerLayout
+{
     size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
     size_t off_dummy_ready_queue_slots;
-    size_t off_early_dispatch_queue_slots;
-    size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH];
-    size_t off_wiring_spsc_buffer;
+    size_t off_pending_spsc_buffer;
+    size_t off_pending_buffer;
     uint64_t ready_queue_capacity;
     uint64_t spsc_capacity;
-    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    uint64_t pending_capacity;
 };
 
-/**
- * Scheduler state structure
- *
- * Contains dynamic state updated during task execution.
- * Separated from shared memory for cache efficiency.
- * Hot-path methods are defined inline (implicitly inline as member functions).
- */
-struct PTO2SchedulerState {
+struct PTO2SchedulerState
+{
     // Shared memory access
     PTO2SharedMemoryHeader *sm_header;
 
     // Per-ring state
-    struct alignas(64) RingSchedState {
-        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
+    struct alignas(64) RingSchedState
+    {
         PTO2SharedMemoryRingHeader *ring;
         int32_t last_task_alive;
         std::atomic<int32_t> advance_lock;  // multi-thread CAS
 
-        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
-        alignas(64) PTO2DepListPool dep_pool;
-        // One-shot latch for the wiring-queue deadlock report (thread 0 only):
-        // the drain breaks on dep_pool exhaustion every call while wedged, so
-        // the tier-1 structural diagnostic is emitted once, not per call.
-        bool dep_deadlock_reported = false;
-#if PTO2_PROFILING
-        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
-        alignas(64) std::atomic<int32_t> dep_pool_snapshot_tail;
-        std::atomic<int32_t> dep_pool_snapshot_top;
-#endif
-
-        // Initialize arena-internal data + arena-external pointers; does NOT
-        // store dep_pool.base (that lives in the runtime arena and is wired
-        // by SchedulerState::wire_arena_pointers). The `ring` field stores
-        // the device address of the SM ring header — computed via offset
-        // arithmetic, no SM dereference.
-        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
-        void reset_for_reuse(void *sm_dev_base, int32_t ring_id, std::atomic<int32_t> *orch_err);
-        void destroy();
-
-        void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
-
-#if PTO2_PROFILING
-        void publish_dep_pool_snapshot() {
-            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
-            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id)
+        {
+            ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+            last_task_alive = 0;
+            advance_lock.store(0, std::memory_order_relaxed);
+            return true;
         }
 
-        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
-            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
-            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
-            if (tail > top) tail = top;
+        void destroy() { ring = nullptr; }
+
+        void sync_to_sm()
+        {
+            ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release);
         }
-#endif
 
-        void advance_ring_pointers() {
-            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
+        void advance_ring_pointers()
+        {
+            const int32_t watermark = ring->completed_watermark.load(std::memory_order_acquire);
             int32_t old_last_task_alive = last_task_alive;
 
-            while (last_task_alive < current_task_index) {
+            // Retire any slot at the tail whose last consumer is at or below
+            // the global completed watermark — i.e. every consumer of this
+            // producer has reached COMPLETED. Implies this slot itself is
+            // COMPLETED because the seed value of last_consumer_local_id is
+            // the slot's own local_id.
+            while (last_task_alive <= watermark)
+            {
                 PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
-                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) {
-                    break;
-                }
+                if (watermark < slot_state.last_consumer_local_id) break;
                 last_task_alive++;
             }
 
-            // Eager reset: prepare reclaimed slots for reuse while still hot in cache.
-            // Safe because last_task_alive has advanced past these slots but
-            // sync_to_sm has not yet published — the orchestrator cannot reuse
-            // them until the release store below.
-            // Skips payload, task, ring_id — immutable after RingSchedState::init().
-            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) {
-                ring->get_slot_state_by_task_id(id).reset_for_reuse();
-            }
+            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) ring->get_slot_state_by_task_id(id).reset_for_reuse();
 
             sync_to_sm();
         }
@@ -663,909 +422,439 @@ struct PTO2SchedulerState {
     // the dispatch loop and completed inline -- never goes to AICore.
     PTO2ReadyQueue dummy_ready_queue;
 
-    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
-    //
-    // Three cache-line regions by writer:
-    //   1. batch_*  / backoff — thread 0 exclusive (local batch buffer)
-    //   2. queue    — SPSC: orchestrator push, thread 0 pop
-    //   3. orch_needs_drain — orchestrator write, thread 0 read
-    struct alignas(64) WiringState {
-        static constexpr uint64_t BATCH_SIZE = 30;
+    // Thread 0 exclusive: circular FIFO of tasks awaiting fanin readiness.
+    // SPSC queue receives slot_states from the orchestrator; thread 0 drains
+    // them into the pending ring and polls fanin readiness. Storing the FIFO
+    // out of band (instead of intrusively in PTO2TaskSlotState) keeps the
+    // task struct free of scheduler-private state.
+    struct alignas(64) PendingState
+    {
         static constexpr int BACKOFF_LIMIT = 32;
-
-        // --- Thread 0 exclusive: local batch buffer + backoff ---
-        int batch_count = 0;
-        int batch_index = 0;
-        int backoff_counter = 0;
-        PTO2TaskSlotState *batch[BATCH_SIZE];
+        static constexpr int DRAIN_BATCH = 30;
+        static constexpr int POLL_MAX_PER_ITER = 128;
+
+        // --- Thread 0 exclusive ---
+        PTO2TaskSlotState **pending_buf{nullptr};  // capacity slots, arena-owned
+        uint32_t pending_cap{0};
+        uint32_t pending_mask{0};
+        uint32_t pending_head_idx{0};  // next pop
+        uint32_t pending_tail_idx{0};  // next push
+        int backoff_counter{0};
+        PTO2TaskSlotState *drain_buf[DRAIN_BATCH];
 
         // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
         PTO2SpscQueue queue;
 
         // --- Orchestrator write, thread 0 read ---
         alignas(64) std::atomic<bool> orch_needs_drain{false};
-        // Set to 1 only while the orchestrator is actually spinning in
-        // queue.push() (queue full), cleared on a successful push. The wiring
-        // deadlock detector reads this as the producer-blocked observable: it
-        // proves the orchestrator is stuck BEFORE its scope_end, as opposed to
-        // having just filled the queue with its last in-scope push and being
-        // about to call scope_end (which would release the head -> no deadlock).
-        std::atomic<int32_t> producer_blocked{0};
-    } wiring;
 
-    static_assert(
-        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
-    );
-    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
+        uint32_t pending_count() const { return pending_tail_idx - pending_head_idx; }
+        bool pending_empty() const { return pending_tail_idx == pending_head_idx; }
+    } wiring;
 
     alignas(64) AsyncWaitList async_wait_list;
 
-    // Statistics (cold path, isolated from hot-path fields)
-#if PTO2_SCHED_PROFILING
-    alignas(64) std::atomic<int64_t> tasks_completed;
-    std::atomic<int64_t> tasks_consumed;
-#endif
-    // =========================================================================
-    // Inline hot-path methods
-    // =========================================================================
-
-    /**
-     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
-     * Called by scheduler thread 0 each loop iteration. Sets fanin_count,
-     * acquires fanout_lock per producer, allocates dep_pool entries, and
-     * pushes ready tasks to the appropriate ready queue.
-     *
-     * @return Number of tasks wired this call.
-     */
-
-    int drain_wiring_queue(bool force_drain = false) {
-        int wired = 0;
-
-        // Refill local batch buffer when exhausted.
-        if (wiring.batch_index >= wiring.batch_count) {
-            // Backoff: defer pop when queue holds fewer than a full batch,
-            // unless force_drain, orch_needs_drain, or backoff limit reached.
-            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
-                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
-                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
-                    wiring.backoff_counter++;
-                    return 0;
-                }
-            }
-            wiring.backoff_counter = 0;
-            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
-            wiring.batch_index = 0;
-            if (wiring.batch_count == 0) return 0;
-        }
-
-        // Process tasks from local buffer in strict FIFO order.
-        while (wiring.batch_index < wiring.batch_count) {
-            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
-            int ring_id = ws->ring_id;
-            auto &rss = ring_sched_states[ring_id];
-            int32_t wfanin = ws->payload->fanin_actual_count;
-
-            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
-                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
-                if (rss.dep_pool.available() < wfanin) {
-#if PTO2_PROFILING
-                    if (is_scope_stats_enabled()) {
-                        rss.publish_dep_pool_snapshot();
-                    }
-#endif
-                    // dep_pool can't reclaim because the reclaim watermark is
-                    // wedged. This runs on the scheduler thread, so unlike
-                    // alloc()'s detector it cannot self-observe that the
-                    // orchestrator is blocked; wiring.producer_blocked is the
-                    // external certificate -- the orchestrator sets it ONLY while
-                    // it is actually spinning in queue.push() (cleared on a
-                    // successful push), so the "just filled the queue then called
-                    // scope_end" case (push succeeded -> flag stays 0) cannot trip
-                    // a false report. With the producer provably stuck in push
-                    // (program-order before its scope_end) AND the head COMPLETED,
-                    // all consumers released, scope still open (only scope_end
-                    // frees it), scope_end can never run -> provable head-of-line
-                    // deadlock. The producer-blocked gate also pins the head:
-                    // scope_end has not run, so the scope-gated head cannot be
-                    // CONSUMED/reset concurrently while we read it.
-                    if (!rss.dep_deadlock_reported && wiring.producer_blocked.load(std::memory_order_acquire) != 0) {
-                        int32_t last_alive = rss.last_task_alive;
-                        PTO2TaskSlotState &h = rss.ring->get_slot_state_by_task_id(last_alive);
-                        // Read the head under its fanout_lock: fanout_count is a
-                        // lock-protected field, and one snapshot keeps the check
-                        // and the report consistent.
-                        h.lock_fanout();
-                        int32_t state = h.task_state.load(std::memory_order_acquire);
-                        uint32_t fc = h.fanout_count;
-                        uint32_t rc = h.fanout_refcount.load(std::memory_order_acquire);
-                        h.unlock_fanout();
-                        bool head_scope_gated = (state == PTO2_TASK_COMPLETED) && (rc == (fc & ~PTO2_FANOUT_SCOPE_BIT));
-                        if (head_scope_gated) {
-                            rss.dep_deadlock_reported = true;
-                            report_wiring_deadlock(rss, wfanin, last_alive, state, fc, rc);
-                            // Latch the shared fatal so both sides exit fast off
-                            // one error code: the scheduler cold-path poll
-                            // (handle_orchestrator_exit) emergency_shutdowns, and
-                            // the orchestrator's push spin breaks out and unwinds.
-                            if (rss.dep_pool.error_code_ptr != nullptr) {
-                                int32_t expected = PTO2_ERROR_NONE;
-                                rss.dep_pool.error_code_ptr->compare_exchange_strong(
-                                    expected, PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_acq_rel
-                                );
-                            }
-                        }
-                    }
-                    break;  // not enough dep_pool space — keep remainder for next call
-                }
-            }
-
-            wiring.batch_index++;
-            wire_task(rss, ws, wfanin);
-            wired++;
-        }
+    void push_ready_routed(PTO2TaskSlotState *slot_state)
+    {
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        if (shape == PTO2ResourceShape::DUMMY) dummy_ready_queue.push(slot_state);
+        else ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+    }
 
-        return wired;
+    // Append slot to the tail of the pending FIFO.
+    void pending_push_back(PTO2TaskSlotState *s)
+    {
+        wiring.pending_buf[wiring.pending_tail_idx & wiring.pending_mask] = s;
+        wiring.pending_tail_idx++;
     }
 
-    // Tier-1 structural diagnostic for a provable wiring-queue deadlock (head
-    // COMPLETED + all consumers released + scope still open, dep_pool exhausted,
-    // orchestrator provably blocked in push). The head snapshot (state/fc/rc) is
-    // taken under fanout_lock by the caller and passed in, so the report agrees
-    // with the check and reads no lock-protected field unlocked.
-    void report_wiring_deadlock(
-        RingSchedState &rss, int32_t wfanin, int32_t last_alive, int32_t state, uint32_t fc, uint32_t rc
-    ) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Wiring-Queue Deadlock - Dep Pool Exhausted!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Head task %d COMPLETED, all consumers released, scope still open ->", last_alive);
-        LOG_ERROR("only scope_end can free it, but the orchestrator is blocked on a full wiring");
-        LOG_ERROR("queue (in push, before its scope_end). Provable head-of-line deadlock.");
-        LOG_ERROR(
-            "  Head task %d: state=%d, consumers=%u/%u, scope_released=%d", last_alive, state,
-            rc & ~PTO2_FANOUT_SCOPE_BIT, fc & ~PTO2_FANOUT_SCOPE_BIT, (rc & PTO2_FANOUT_SCOPE_BIT) ? 1 : 0
-        );
-        LOG_ERROR("  Dep pool:   used=%d/%d, needed=%d entries", rss.dep_pool.used(), rss.dep_pool.capacity, wfanin);
-        LOG_ERROR("Solution:");
-        LOG_ERROR("  The open scope's fanout exceeds the dep pool. Either split the scope, or");
-        LOG_ERROR("  raise PTO2_RING_DEP_POOL (compile-time PTO2_DEP_LIST_POOL_SIZE).");
-        LOG_ERROR("========================================");
+    // Pop the head of the pending FIFO (or nullptr).
+    PTO2TaskSlotState *pending_pop_front()
+    {
+        if (wiring.pending_empty()) return nullptr;
+        PTO2TaskSlotState *s = wiring.pending_buf[wiring.pending_head_idx & wiring.pending_mask];
+        wiring.pending_head_idx++;
+        return s;
     }
 
-    // Route a ready slot to the right global queue. Dummy tasks (empty
-    // active_mask) live in dummy_ready_queue; everything else goes to the
-    // per-shape ready_queues[]. Used by paths that do not have a thread-local
-    // ready buffer (e.g. wiring). See push_ready_routed_local for the
-    // dispatch-time fast path.
-    void push_ready_routed(PTO2TaskSlotState *slot_state) {
-        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-        if (shape == PTO2ResourceShape::DUMMY) {
-            dummy_ready_queue.push(slot_state);
-        } else {
-            ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+    bool fanin_satisfied(PTO2TaskSlotState *s) const
+    {
+        const PTO2TaskPayload &p = *s->payload;
+        for (int32_t i = 0; i < p.fanin_count; i++)
+        {
+            const auto &prod_ring = *ring_sched_states[p.fanin_ring_ids[i]].ring;
+            if (prod_ring.completion_flags[p.fanin_local_ids[i] & prod_ring.task_window_mask].load(std::memory_order_acquire) == 0) return false;
         }
+        return true;
     }
 
-    /**
-     * Wire fanout edges for a single task. Sets fanin_count, acquires each
-     * producer's fanout_lock, allocates dep_pool entries for live producers,
-     * pushes the task to the ready queue once its fanin refcount is satisfied.
-     */
-    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
-        PTO2TaskPayload *wp = ws->payload;
-        ws->fanin_count = wfanin + 1;
-
-        if (wfanin != 0) {
-            int32_t early_finished = 0;
-            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
-                producer->lock_fanout();
-                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
-                if (pstate >= PTO2_TASK_COMPLETED) {
-                    early_finished++;
-                } else {
-                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
-                }
-                producer->unlock_fanout();
-            });
-
-            // Seed dispatch_fanin with producers already complete at wiring
-            // time (e.g. buffer-creator tasks that finished before this
-            // consumer entered the graph). Such producers never dispatch at
-            // runtime, so they can never bump dispatch_fanin via the fanout
-            // walk; without this seed the candidate compare
-            // (dispatch_fanin == fanin_actual_count) would be unreachable
-            // whenever any producer is pre-completed. Mirrors the
-            // early_finished seed that ready_fanin gets via init_rc.
-            if (early_finished != 0) {
-                wp->dispatch_fanin.fetch_add(early_finished, std::memory_order_acq_rel);
-            }
-
-            int32_t init_rc = early_finished + 1;
-            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
-            if (new_rc >= ws->fanin_count) {
-                push_ready_routed(ws);
+    // First-unmet classification used by the pending poll and wake_list
+    // drain. Returns:
+    //   -1: all fanins met (route directly to ready)
+    //   ≥0: index of the first unmet fanin (register on its producer's
+    //       wake list). The polling-only path used to distinguish
+    //       "exactly-1 unmet" from "2+ unmet" so the 2+ case could be
+    //       re-queued for the next polling cycle; the wake-list-only
+    //       redesign instead always registers on the first unmet (rescan
+    //       on wake via on_mixed_task_complete), eliminating the
+    //       O(pending × fanin) per-iteration polling cost.
+    int classify_fanin_state(PTO2TaskSlotState *s) const
+    {
+        const PTO2TaskPayload &p = *s->payload;
+        for (int32_t i = 0; i < p.fanin_count; i++)
+        {
+            const auto &prod_ring = *ring_sched_states[p.fanin_ring_ids[i]].ring;
+            if (prod_ring.completion_flags[p.fanin_local_ids[i] & prod_ring.task_window_mask].load(std::memory_order_acquire) == 0)
+            {
+                return i;
             }
-        } else {
-            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
-            push_ready_routed(ws);
         }
-
-        ws->dep_pool_mark = rss.dep_pool.top;
-#if PTO2_PROFILING
-        if (is_scope_stats_enabled()) {
-            rss.publish_dep_pool_snapshot();
+        return -1;
+    }
+
+    // (e) Register `consumer` on `producer`'s wake list. If producer has
+    // already completed (head == WAKE_LIST_SENTINEL), route consumer directly
+    // to a ready queue via push_ready_routed. Otherwise CAS-push it onto the head.
+    void register_wake(PTO2TaskSlotState *producer, PTO2TaskSlotState *consumer)
+    {
+        PTO2TaskSlotState *expected = producer->wake_list_head.load(std::memory_order_relaxed);
+        while (true)
+        {
+            if (expected == WAKE_LIST_SENTINEL)
+            {
+                // Producer already completed and drained its wake list.
+                // NOTE(review): fanins after this one are not rechecked here -- confirm.
+                push_ready_routed(consumer);
+                return;
+            }
+            consumer->next_in_wake_list = expected;
+            if (producer->wake_list_head.compare_exchange_weak(expected, consumer, std::memory_order_acq_rel, std::memory_order_relaxed))
+            {
+                return;  // registered
+            }
+            // CAS failed: compare_exchange_weak reloaded `expected` with the current head. Loop.
         }
-#endif
     }
 
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
-        // Read fanout_refcount/fanout_count and flip COMPLETED->CONSUMED under
-        // fanout_lock. The orchestrator claims producers (fanout_count++) under the
-        // same lock, so the consume decision is serialized against a concurrent
-        // claim: either the ++ lands first (count then exceeds refcount, so we do
-        // not consume and the producer stays pinned until released) or the consume
-        // lands first (the orchestrator then observes CONSUMED and skips the
-        // claim). Without this lock a claim racing the consume desyncs the slot's
-        // refcount and wedges in-order reclaim.
-        bool became_consumed = false;
-        slot_state.lock_fanout();
-        if (slot_state.fanout_refcount.load(std::memory_order_acquire) == slot_state.fanout_count) {
-            PTO2TaskState expected = PTO2_TASK_COMPLETED;
-            became_consumed = slot_state.task_state.compare_exchange_strong(
-                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
-            );
+    // Thread 0 entry point: drain SPSC into the pending FIFO, then scan each
+    // pending task once: ready -> ready queue, else -> a producer's wake list.
+    // Returns >0 if anything moved (SPSC drained OR tasks routed to ready);
+    // 0 signals no productive work.
+    //
+    // Sub-phase timing pointers (optional). If non-null, cumulative cycle/
+    // iteration counters for Stage 1 (SPSC drain) and Stage 2 (pending poll)
+    // are accumulated into them.
+    int drain_wiring_queue(bool force_drain = false,
+                           uint64_t *spsc_cyc_out = nullptr, uint64_t *spsc_iters_out = nullptr,
+                           uint64_t *poll_cyc_out = nullptr, uint64_t *poll_iters_out = nullptr)
+    {
+        // Stage 1: drain SPSC → pending FIFO tail
+        uint64_t t0 = spsc_cyc_out ? get_sys_cnt_aicpu() : 0;
+        int drained = wiring.queue.pop_batch(wiring.drain_buf, PendingState::DRAIN_BATCH);
+        for (int i = 0; i < drained; i++) pending_push_back(wiring.drain_buf[i]);
+        if (spsc_cyc_out)
+        {
+            *spsc_cyc_out += get_sys_cnt_aicpu() - t0;
+            if (spsc_iters_out) (*spsc_iters_out)++;
+        }
+
+        // Backoff when nothing to do and orchestrator isn't pressing
+        if (drained == 0 && wiring.pending_empty())
+        {
+            if (!force_drain && !wiring.orch_needs_drain.load(std::memory_order_acquire) && wiring.backoff_counter < PendingState::BACKOFF_LIMIT)
+            {
+                wiring.backoff_counter++;
+                return 0;
+            }
         }
-        slot_state.unlock_fanout();
-        if (!became_consumed) return;
-
-#if PTO2_SCHED_PROFILING
-        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
-#endif
-
-        int32_t ring_id = slot_state.ring_id;
-        // advance_ring_pointers (and the reset_for_reuse it triggers) MUST run
-        // outside fanout_lock: reset_for_reuse stores fanout_lock=0 and would
-        // clobber a held lock. Safe here — the slot is CONSUMED and quiescent.
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
-                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
-            )) {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+        wiring.backoff_counter = 0;
+
+        // Stage 2: drain pending FIFO. Each task gets scanned exactly once
+        // here — its state is either "all met → ready_queue" or "register
+        // on the first unmet producer's wake_list and leave". Tasks never
+        // re-enter pending FIFO; re-scans happen lazily on wake via
+        // on_mixed_task_complete's wake_list drain (see below). This
+        // eliminates the O(pending × fanin) per-iteration polling cost
+        // that hurt host time under chains of multi-fanin tasks.
+        uint64_t t1 = poll_cyc_out ? get_sys_cnt_aicpu() : 0;
+        int routed = 0;
+        int to_visit = static_cast<int>(wiring.pending_count());
+        if (to_visit > PendingState::POLL_MAX_PER_ITER) to_visit = PendingState::POLL_MAX_PER_ITER;
+        for (int i = 0; i < to_visit; i++)
+        {
+            PTO2TaskSlotState *s = pending_pop_front();
+            if (s == nullptr) break;
+            int state = classify_fanin_state(s);
+            if (state < 0)
+            {
+                push_ready_routed(s);
+            }
+            else
+            {
+                // First unmet at index `state`; register on that producer
+                // and leave the FIFO. Producer is in fanin_ring_ids[state]
+                // (may differ from the consumer's ring under multi-ring
+                // fanin). When the producer completes, its wake_list drain
+                // rescans this task and either pushes it to ready or
+                // re-registers it on the next unmet producer.
+                int32_t prod_local = s->payload->fanin_local_ids[state];
+                uint8_t prod_ring = s->payload->fanin_ring_ids[state];
+                auto &ring = *ring_sched_states[prod_ring].ring;
+                PTO2TaskSlotState *producer = &ring.get_slot_state_by_task_id(prod_local);
+                register_wake(producer, s);
+            }
+            routed++;
         }
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        // See the non-profiling overload for why the read + COMPLETED->CONSUMED
-        // flip is serialized against the orchestrator's claim under fanout_lock.
-        bool became_consumed = false;
-        slot_state.lock_fanout();
-        atomic_count += 1;  // lock CAS
-        uint32_t fc = slot_state.fanout_count;
-        uint32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);
-        atomic_count += 1;  // fanout_refcount.load (fanout_count is a plain read under lock)
-        if (rc == fc) {
-            PTO2TaskState expected = PTO2_TASK_COMPLETED;
-            became_consumed = slot_state.task_state.compare_exchange_strong(
-                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            atomic_count += 1;  // CAS
+        if (poll_cyc_out)
+        {
+            *poll_cyc_out += get_sys_cnt_aicpu() - t1;
+            if (poll_iters_out) (*poll_iters_out)++;
         }
-        slot_state.unlock_fanout();
-        atomic_count += 1;  // unlock store
-        if (!became_consumed) return;
-
-#if PTO2_SCHED_PROFILING
-        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
-#endif
 
-        int32_t ring_id = slot_state.ring_id;
-        // advance_ring_pointers + reset_for_reuse run outside fanout_lock (reset
-        // stores fanout_lock=0). Safe — the slot is CONSUMED and quiescent.
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
-                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
-            )) {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
-            atomic_count += 2;  // try-lock CAS + unlock store
-        } else {
-            atomic_count += 1;  // failed try-lock CAS
-        }
+        return drained + routed;
     }
-#endif
 
-    void release_producer(PTO2TaskSlotState &slot_state) {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        check_and_handle_consumed(slot_state);
-    }
-
-    // Scope-end release: sets bit31 (PTO2_FANOUT_SCOPE_BIT) instead of bumping a
-    // consumer ref. Called exactly once per task from on_scope_end. Keeping it a
-    // distinct add lets the deadlock detector tell "waiting only on scope_end"
-    // (head COMPLETED, refcount == fanout_count & ~SCOPE_BIT) apart from
-    // "waiting on a consumer".
-    void release_producer_scope(PTO2TaskSlotState &slot_state) {
-        slot_state.fanout_refcount.fetch_add(PTO2_FANOUT_SCOPE_BIT, std::memory_order_acq_rel);
-        check_and_handle_consumed(slot_state);
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        atomic_count += 1;  // fanout_refcount.fetch_add
-        check_and_handle_consumed(slot_state, atomic_count);
+    int get_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
+    {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) out[count++] = local_buf.slot_states[--local_buf.count];
+        int remaining = max_count - count;
+        if (remaining > 0) count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
+        return count;
     }
 
-    void release_producer_scope(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        slot_state.fanout_refcount.fetch_add(PTO2_FANOUT_SCOPE_BIT, std::memory_order_acq_rel);
-        atomic_count += 1;  // fanout_refcount.fetch_add
-        check_and_handle_consumed(slot_state, atomic_count);
-    }
-#endif
-
-    // Speculative early-dispatch release. If the now-ready task was pre-staged
-    // (gated on a core), ring its DATA_MAIN_BASE high-32 doorbell RIGHT HERE in
-    // the completion path — the moment its last producer's FIN satisfies fanin —
-    // instead of routing it through the ready queue and waiting for the dispatch
-    // pass to pop it. Returns true if the task is fully handled (caller must NOT
-    // push to the ready queue). Returns false when the caller must route C
-    // normally: either it was never pre-staged, OR it is a SPMD consumer only
-    // PARTIALLY pre-staged — the gated blocks are released by the doorbells rung
-    // here, and the remaining (next_block_idx .. logical_block_num) blocks
-    // dispatch normally off the ready queue. Lock-free claim shared with Hook 1
-    // (the stager): CAS NONE->DISPATCHED wins => not pre-staged; lose => STAGED
-    // (spin past the brief STAGING window so the mask is visible), then ring.
-
-    // Per-core speculative doorbell table. Hook 1 records each gated core's
-    // (reg_addr, dispatch token) here at stage time; the completion-path release
-    // reads it back for the cores set in the consumer's staged_core_mask. One
-    // global table indexed by core_id (not per-task): gated cores in flight are
-    // bounded by the chip's core count (no two-level pre-dispatch), so this is the
-    // natural capacity and removes the old per-task 3-doorbell cap.
-    struct SpecDoorbell {
-        uint64_t addr{0};
-        uint32_t token{0};
-    };
-    SpecDoorbell spec_doorbell_table[PTO2_SPEC_CORE_MASK_WORDS * 64]{};
-
-    // Cross-thread early-dispatch work queue (a PTO2ReadyQueue MPMC instance,
-    // arena-backed — reserved/wired in pto_runtime2_init alongside the ready queues).
-    // A consumer's SPMD blocks span cores owned by several AICPU threads, but only a
-    // thread RUNNING the consumer's producer discovers it (via the producer's
-    // fanout). When that producer is thread-local (e.g. a 16-block AIV op filling one
-    // thread's cores), the other threads never see the consumer and its blocks on
-    // their cores can't pre-stage. The first claimer pushes the partially-staged
-    // consumer here; every idle thread's early_dispatch pass pops one, stages a range onto
-    // ITS OWN cores (range-claim via next_block_idx), and re-pushes if blocks remain
-    // — exactly mirroring how a partially-dispatched SPMD task is re-pushed to the
-    // ready queue (scheduler_dispatch: pop -> claim -> re-push). A stale/released
-    // entry fails the STAGING check on pop and is dropped; a push that overflows is
-    // logged and the consumer's blocks fall back to normal dispatch.
-    PTO2ReadyQueue early_dispatch_queue;
-
-    static inline void ring_one_doorbell(uint64_t reg_addr, uint32_t token) {
-        volatile uint64_t *dmb = reinterpret_cast<volatile uint64_t *>(get_reg_ptr(reg_addr, RegId::DATA_MAIN_BASE));
-        uint64_t tk = static_cast<uint64_t>(token);
-        *dmb = (tk << 32) | tk;  // 64-bit STR: high=low=token releases the gated AICore
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state)
+    {
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return (prev + 1) == slot_state.total_required_subtasks;
     }
 
-    // auto-chain depth cap: a candidate inherits the flag only while depth < this.
-    static constexpr uint8_t PTO2_SPEC_CHAIN_MAX = 4;
-
-    // Event-driven candidate detection (the dual of fanin_refcount/ready). Call when a
-    // FLAGGED producer `p` DISPATCHES (starts running): walk its fanout and bump each
-    // consumer's dispatch_fanin. A consumer whose dispatch_fanin reaches
-    // fanin_actual_count (= every producer is either flagged-and-dispatched, or was
-    // already complete when the consumer was wired) is an early-dispatch candidate:
-    // CAS NONE->STAGING (exactly-once) and push to early_dispatch_queue for the idle drain to
-    // pre-stage. Once-guarded per producer so an SPMD producer's block-by-block
-    // dispatch propagates once. Replaces the old per-iteration pass-1 PULL scan.
-    void propagate_dispatch_fanin(PTO2TaskSlotState &p) {
-        if (!(p.payload->allow_early_resolve || p.payload->spec_chain_active.load(std::memory_order_acquire)))
-            return;  // only flagged (codegen or inherited) producers propagate
-        if (p.payload->dispatch_propagated.exchange(1, std::memory_order_acq_rel) != 0)
-            return;  // already propagated once
-        uint8_t child_depth = static_cast<uint8_t>(p.payload->spec_chain_depth + 1);
-        p.lock_fanout();
-        PTO2DepListEntry *edge = p.fanout_head;  // snapshot head, walk lock-free (fanout stable by dispatch)
-        p.unlock_fanout();
-        for (; edge != nullptr; edge = edge->next) {
-            PTO2TaskSlotState *c = edge->slot_state;
-            // Compare to fanin_actual_count (the real producer-edge count), NOT
-            // fanin_count: fanin_count = fanin_actual_count + 1 (a self/wiring +1 that
-            // ready_fanin gets but dispatch_fanin does not). dispatch_fanin starts at
-            // the wiring-time early_finished seed (producers already complete) and is
-            // bumped here by flagged producers; reaching fanin_actual_count means every
-            // producer is flagged-dispatched or was pre-completed.
-            int32_t nf = c->payload->dispatch_fanin.fetch_add(1, std::memory_order_acq_rel) + 1;
-            if (nf != c->payload->fanin_actual_count) continue;
-            if (c->active_mask.requires_sync_start()) continue;  // sync_start can't be block-by-block pre-staged
-            PTO2ResourceShape shape = c->active_mask.to_shape();
-            if (shape != PTO2ResourceShape::AIC && shape != PTO2ResourceShape::AIV && shape != PTO2ResourceShape::MIX)
-                continue;
-            uint8_t expect = PTO2_SPEC_NONE;  // exactly-once: only the CAS winner enqueues
-            if (!c->payload->spec_state.compare_exchange_strong(
-                    expect, PTO2_SPEC_STAGING, std::memory_order_seq_cst, std::memory_order_seq_cst
-                ))
+    // Publish this slot as COMPLETED, then advance the per-ring monotonic
+    // completed_watermark — the highest local_id W such that every task
+    // 0..W has reached COMPLETED. Reclamation in advance_ring_pointers gates
+    // on watermark >= producer.last_consumer_local_id, so no consumer→producer
+    // notification edge is needed.
+    void on_mixed_task_complete(PTO2TaskSlotState &slot_state)
+    {
+        // (m) Skip slot_state.task_state.store here; completion_flags below is
+        // the single source of truth. Saves one atomic release store per task.
+        const int32_t my_id = static_cast<int32_t>(slot_state.task->task_id.local());
+        int32_t ring_id = slot_state.ring_id;
+        auto &rss = ring_sched_states[ring_id];
+        auto &ring = *rss.ring;
+
+        // Publish to the polling-fast completion array. Release ordering
+        // makes the producer's output writes visible to consumers that
+        // acquire-load this byte in fanin_satisfied.
+        ring.completion_flags[my_id & ring.task_window_mask].store(1, std::memory_order_release);
+
+        // Drain the wake list. Each consumer registered on this slot was
+        // waiting on at least one unmet fanin (this one). After
+        // completion_flag is set above, atomic-exchange wake_list_head to
+        // SENTINEL (refusing any future registrations) and process each
+        // waiter: rescan its fanin, route to ready_queue if all met, else
+        // re-register on the new first-unmet producer. Ordering:
+        // completion_flag is set BEFORE the exchange, so any consumer that
+        // races a registration against our exchange and observes a SENTINEL
+        // during retry will see completion_flag=1 and either rescan-and-route
+        // or self-register on the next unmet.
+        PTO2TaskSlotState *waiter = slot_state.wake_list_head.exchange(WAKE_LIST_SENTINEL, std::memory_order_acq_rel);
+        while (waiter != nullptr && waiter != WAKE_LIST_SENTINEL)
+        {
+            PTO2TaskSlotState *next = waiter->next_in_wake_list;
+            waiter->next_in_wake_list = nullptr;
+            // Fast path: single-fanin waiters were waiting on *us* (the only
+            // possible fanin). No rescan needed — push straight to ready.
+            // Saves one classify_fanin_state call (a byte read in
+            // completion_flags) per waiter. Skips the cache-miss-prone
+            // multi-ring lookup for the common chain-task case where each
+            // task has exactly one predecessor.
+            if (waiter->payload->fanin_count == 1)
+            {
+                push_ready_routed(waiter);
+                waiter = next;
                 continue;
-            if (child_depth < PTO2_SPEC_CHAIN_MAX) {  // auto-chain: C propagates to ITS consumers
-                c->payload->spec_chain_depth = child_depth;
-                c->payload->spec_chain_active.store(1, std::memory_order_release);
             }
-            early_dispatch_queue.push(c);
-        }
-    }
-
-    // Collects consumers released via the speculative-doorbell path during a
-    // single on_task_complete fanout walk, so their dispatch_fanin
-    // propagation runs AFTER the walk — never between two siblings' doorbells.
-    struct SpecReleaseSink {
-        static constexpr int CAP = 32;
-        PTO2TaskSlotState *items[CAP];
-        int n = 0;
-        inline bool push(PTO2TaskSlotState *s) {
-            if (n >= CAP) return false;
-            items[n++] = s;
-            return true;
-        }
-    };
-
-    inline bool try_speculative_release(PTO2TaskSlotState &slot_state, SpecReleaseSink *sink = nullptr) {
-        // Never staged => CAS NONE->DISPATCHED wins => dispatch normally.
-        uint8_t expect = PTO2_SPEC_NONE;
-        if (slot_state.payload->spec_state.compare_exchange_strong(
-                expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
-            )) {
-            return false;
-        }
-        // Staged (STAGING). Flip STAGING->DISPATCHED, THEN read the mask. seq_cst
-        // gives a total order with the concurrent stagers, each of which OR-s its
-        // core into the mask and THEN loads spec_state: a stager whose bit lands
-        // before this CAS is read here and rung; a stager whose bit lands after
-        // sees DISPATCHED and rings that core itself (self-ring in
-        // stage_consumer_blocks). Either way every gated core's doorbell fires once
-        // (a double-ring is harmless — the AICore already matched). This replaces
-        // the old transient-STAGING spin: STAGING is now the stable gated state.
-        expect = PTO2_SPEC_STAGING;
-        slot_state.payload->spec_state.compare_exchange_strong(
-            expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
-        );
-        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
-            uint64_t bits = slot_state.payload->staged_core_mask[w].load(std::memory_order_seq_cst);
-            while (bits != 0) {
-                int core_id = w * 64 + __builtin_ctzll(bits);
-                bits &= bits - 1;
-                ring_one_doorbell(spec_doorbell_table[core_id].addr, spec_doorbell_table[core_id].token);
+            int state = classify_fanin_state(waiter);
+            if (state < 0)
+            {
+                push_ready_routed(waiter);
             }
-        }
-        // This pre-staged consumer was just released by its doorbell — it starts
-        // running NOW, so propagate dispatch_fanin to ITS consumers (auto-chain,
-        // knob A). Defer it via the sink so it runs after the whole fanout walk:
-        // doing it inline here would delay the doorbells of later consumers in the
-        // same producer's fanout. Fallback to inline if no sink / sink full.
-        if (sink == nullptr || !sink->push(&slot_state)) {
-            propagate_dispatch_fanin(slot_state);
-        }
-        // No explicit removal from the cross-thread queue: a still-queued entry for
-        // this consumer is now DISPATCHED and is dropped when a peer pops it.
-        // Fully pre-staged => skip the ready queue. Partially staged SPMD consumer =>
-        // fall through so the caller pushes C; dispatch resumes from next_block_idx.
-        return slot_state.next_block_idx.load(std::memory_order_seq_cst) >= slot_state.logical_block_num;
-    }
-
-    bool release_fanin_and_check_ready(
-        PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
-    ) {
-        // Atomically increment fanin_refcount and check if all producers are done
-        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
-        // init release, making fanin_count visible — plain load suffices.
-        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-
-        if (new_refcount == slot_state.fanin_count) {
-            // Speculative early-dispatch: pre-staged tasks are released by doorbell
-            // here, skipping the ready-queue round-trip entirely.
-            if (try_speculative_release(slot_state, sink)) return true;
-            // Local-first: try per-CoreType thread-local buffer before global queue
-            // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1]
-            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
-            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
-            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
-            if (shape == PTO2ResourceShape::DUMMY) {
-                dummy_ready_queue.push(&slot_state);
-            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
-                ready_queues[static_cast<int32_t>(shape)].push(&slot_state);
+            else
+            {
+                // Still some fanin unmet — re-register on the new first
+                // unmet producer's wake list.
+                int32_t prod_local = waiter->payload->fanin_local_ids[state];
+                uint8_t prod_ring = waiter->payload->fanin_ring_ids[state];
+                auto &prod_ring_hdr = *ring_sched_states[prod_ring].ring;
+                PTO2TaskSlotState *producer = &prod_ring_hdr.get_slot_state_by_task_id(prod_local);
+                register_wake(producer, waiter);
+            }
+            waiter = next;
+        }
+
+        // CAS-advance the watermark one slot at a time, never past my_id (whose
+        // flag we just published above). The walk stops at the first slot whose
+        // completion flag is still 0; the task that fills that gap resumes the
+        // walk when it completes, again only up to its own id.
+        int32_t w = ring.completed_watermark.load(std::memory_order_acquire);
+        while (w < my_id)
+        {
+            int32_t next = w + 1;
+            if (ring.completion_flags[next & ring.task_window_mask].load(std::memory_order_acquire) == 0) break;
+            if (ring.completed_watermark.compare_exchange_weak(w, next, std::memory_order_acq_rel, std::memory_order_acquire))
+            {
+                w = next;
             }
-            return true;
         }
-        return false;
-    }
 
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    bool release_fanin_and_check_ready(
-        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
-        PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
-    ) {
-        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-        atomic_count += 1;  // fanin_refcount.fetch_add
-
-        if (new_refcount == slot_state.fanin_count) {
-            // Speculative early-dispatch: pre-staged tasks are released by doorbell
-            // here, skipping the ready-queue round-trip entirely.
-            if (try_speculative_release(slot_state, sink)) return true;
-            // Local-first: try per-CoreType thread-local buffer before global queue.
-            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
-            // and go straight to dummy_ready_queue; use the profiling-aware push so
-            // atomic_count / push_wait stay consistent with the non-dummy path.
-            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
-            if (shape == PTO2ResourceShape::DUMMY) {
-                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
-            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
-                ready_queues[static_cast<int32_t>(shape)].push(&slot_state, atomic_count, push_wait);
+        // Try to retire slots whose last consumer has reached COMPLETED.
+        // Gate the try-lock + advance walk on a lag threshold: most
+        // completions advance the watermark by 1 slot; firing the try-lock
+        // per completion costs ~10-30 ns × ~65K completions × N threads of
+        // wasted CAS attempts. With the gate, the try-lock fires ~32× less
+        // often. Empirically 32 is the sweet spot — bigger thresholds let
+        // the allocator stall more often waiting for reclamation. The lag
+        // read of last_task_alive is non-atomic but monotonic and only used
+        // as a hint — stale-but-OK.
+        if (w - rss.last_task_alive >= 32)
+        {
+            int32_t expected_lock = 0;
+            if (rss.advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed))
+            {
+                rss.advance_ring_pointers();
+                rss.advance_lock.store(0, std::memory_order_release);
             }
-            return true;
         }
-        return false;
     }
-#endif
 
-    int get_ready_tasks_batch(
-        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
-    ) {
-        int count = 0;
-        while (count < max_count && local_buf.count > 0) {
-            out[count++] = local_buf.slot_states[--local_buf.count];
-        }
-        int remaining = max_count - count;
-        if (remaining > 0) {
-            count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
-        }
-        return count;
-    }
+    // === Cold-path API ===
 
-#if PTO2_SCHED_PROFILING
-    int get_ready_tasks_batch(
-        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
-        uint64_t &atomic_count, uint64_t &wait_cycle
-    ) {
-        int count = 0;
-        while (count < max_count && local_buf.count > 0) {
-            out[count++] = local_buf.slot_states[--local_buf.count];
-        }
-        int remaining = max_count - count;
-        if (remaining > 0) {
-            count +=
-                ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
-        }
-        return count;
-    }
-#endif
-
-    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
-#if PTO2_ORCH_PROFILING
-        extern uint64_t g_orch_scope_end_atomic_count;
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++) {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer_scope(*task_slot_states[i], g_orch_scope_end_atomic_count);
-        }
-#else
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++) {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer_scope(*task_slot_states[i]);
-        }
-#endif
-    }
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t /*dep_pool_capacity*/)
+    {
+        PTO2SchedulerLayout layout{};
+        layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+        layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+        layout.pending_capacity = PTO2_TASK_WINDOW_SIZE;  // bounded by per-ring slot window
 
-    /**
-     * Subtask completion: atomic counter model.
-     * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block.
-     * Atomically increments completed_subtasks and checks whether all subtasks
-     * across all blocks are done.
-     *
-     * @return true if this was the last subtask, completing the entire task.
-     */
-    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
-        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
-        return (prev + 1) == slot_state.total_required_subtasks;
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+        layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+        layout.off_pending_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+        layout.off_pending_buffer = arena.reserve(layout.pending_capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
+        return layout;
     }
 
-    /**
-     * Two-stage completion: second stage.
-     * Called exactly once when all subtasks of a task are done (i.e.,
-     * on_subtask_complete returned true). Walks the consumer (fanout) list,
-     * decrements each consumer's fanin, pushes newly-ready ones, and rings
-     * doorbells for speculative hits.
-     *
-     * Non-PROFILING returns the consumer-walk count (= edges traversed). The
-     * Resolve swimlane bar reads it to label the bar with how many successors
-     * actually got resolved. PROFILING returns the richer CompletionStats
-     * whose `fanout_edges` carries the same number.
-     */
-#if PTO2_SCHED_PROFILING
-    CompletionStats
-#else
-    uint32_t
-#endif
-    on_task_complete(
-        PTO2TaskSlotState &slot_state,
-#if PTO2_SCHED_PROFILING
-        int thread_idx,
-#endif
-
-        PTO2LocalReadyBuffer *local_bufs = nullptr
-    ) {
-#if PTO2_SCHED_PROFILING
-        CompletionStats stats = {0, 0, 0, true};
-#else
-        uint32_t consumer_walk_count = 0;
-#endif
-#if PTO2_SCHED_PROFILING
-        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
-        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
-        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
-        uint64_t lock_atomics = 0, lock_wait = 0;
-        PTO2_SCHED_CYCLE_START();
-#endif
-
-#if PTO2_SCHED_PROFILING
-        slot_state.lock_fanout(lock_atomics, lock_wait);
-#else
-        slot_state.lock_fanout();
-#endif
-        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
-        slot_state.unlock_fanout();
-
-#if PTO2_SCHED_PROFILING
-        lock_atomics += 2;  // state.store + unlock.store
-        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
-        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
-        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
-#endif
-
-        // Fanout: notify consumers. A pre-staged consumer that becomes ready has
-        // its doorbell rung INLINE (db = nullptr) the moment its node is reached,
-        // not batched to after the whole walk — so a flagged consumer near the
-        // front of the list starts immediately and overlaps the remaining
-        // release_fanin work for the other consumers, instead of waiting for the
-        // full O(fanout-degree) walk (~5us for a 50-consumer producer).
-        //
-        // Safe on silicon: the producer's slot is already COMPLETED here — every
-        // SPMD block has FIN'd AND dcci-flushed its output to HBM before
-        // on_task_complete runs — so a released consumer never reads stale
-        // producer output. (Batching used to align the released wave, but pushed
-        // every doorbell to the end of the walk, defeating the whole point of
-        // speculative early-dispatch: minimal producer-end -> consumer-start.)
-#if PTO2_SCHED_PROFILING
-        uint64_t fanout_atomics = 0, push_wait = 0;
-#endif
-        // Doorbells for released pre-staged consumers fire INLINE in the walk
-        // below; their dispatch_fanin propagation is collected here and replayed
-        // after the walk, so no consumer's doorbell waits on a sibling's propagate.
-        SpecReleaseSink rel_sink;
-        while (current != nullptr) {
-            PTO2TaskSlotState &consumer_slot = *current->slot_state;
-#if PTO2_SCHED_PROFILING
-            stats.fanout_edges++;
-            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs, &rel_sink)) {
-                stats.tasks_enqueued++;
-            }
-#else
-            consumer_walk_count++;
-            release_fanin_and_check_ready(consumer_slot, local_bufs, &rel_sink);
-#endif
-            current = current->next;
-        }
-        for (int i = 0; i < rel_sink.n; i++) {
-            propagate_dispatch_fanin(*rel_sink.items[i]);
-        }
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base)
+    {
+        PTO2SchedulerState *sched = this;
+        sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
 
-#if PTO2_SCHED_PROFILING
-        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
-        g_sched_push_wait_cycle[thread_idx] += push_wait;
-        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
-        return stats;
-#else
-        return consumer_walk_count;
-#endif
-    }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) return false;
+
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++)
+            if (!ready_queue_init_data_from_layout(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity)) return false;
+        if (!ready_queue_init_data_from_layout(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity)) return false;
+
+        if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_pending_spsc_buffer, layout.spsc_capacity)) return false;
 
-    /**
-     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
-     * Returns fanin edge count for profiling.
-     */
-
-#if PTO2_SCHED_PROFILING
-    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
-        PTO2_SCHED_CYCLE_START();
-        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
-        extern uint64_t g_sched_self_atomic_count[];
-        extern uint64_t g_sched_self_consumed_cycle[];
-        extern uint64_t g_sched_complete_count[];
-        uint64_t fanin_atomics = 0;
-#else
-    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
-#endif
-        PTO2TaskPayload *payload = slot_state.payload;
-        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
-#if PTO2_SCHED_PROFILING
-            release_producer(*producer_slot_state, fanin_atomics);
-#else
-            release_producer(*producer_slot_state);
-#endif
-        });
-#if PTO2_SCHED_PROFILING
-        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
-        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
-#endif
-
-        // Self consumed check
-#if PTO2_SCHED_PROFILING
-        uint64_t self_atomics = 0;
-        check_and_handle_consumed(slot_state, self_atomics);
-        g_sched_self_atomic_count[thread_idx] += self_atomics;
-        PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]);
-        g_sched_complete_count[thread_idx]++;
-#else
-        check_and_handle_consumed(slot_state);
-#endif
-        return payload->fanin_actual_count;
+        if (layout.pending_capacity == 0 || (layout.pending_capacity & (layout.pending_capacity - 1)) != 0) return false;
+        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
+        sched->wiring.pending_cap = static_cast<uint32_t>(layout.pending_capacity);
+        sched->wiring.pending_mask = sched->wiring.pending_cap - 1;
+        sched->wiring.pending_head_idx = 0;
+        sched->wiring.pending_tail_idx = 0;
+        sched->wiring.backoff_counter = 0;
+
+        return true;
     }
 
-    // === Cold-path API (defined in pto_scheduler.cpp) ===
-
-    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
-    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
-    // Capacities are baked into the returned layout; init_data_from_layout uses
-    // the same values.
-    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
-    static PTO2SchedulerLayout
-    reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]);
-
-    // Phase 3a: write everything *except* arena-internal pointer fields.
-    // `sm_dev_base` is the device address of the SM (only stored, never
-    // dereferenced here). Safe to call on a host arena that holds the
-    // prebuilt image buffer. (The orchestrator counterpart takes
-    // task_window_size for ring task_descriptors address arithmetic; the
-    // scheduler only needs the SM header / ring header base addresses,
-    // both window-size-independent.)
-    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
-    void reset_for_reuse(const PTO2SchedulerLayout &layout, void *sm_dev_base);
-
-    // Phase 3b: write the arena-internal pointer fields
-    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
-    // ring, wiring.queue.buffer_). Called on both host and device sides.
-    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena)
+    {
+        PTO2SchedulerState *sched = this;
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
+        ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+        sched->wiring.queue.wire_arena_pointers(arena, layout.off_pending_spsc_buffer);
+        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
+    }
 
     // Forget per-region pointers; arena owns the backing memory.
-    void destroy();
-    void print_stats();
-    void print_queues();
+    void destroy()
+    {
+        PTO2SchedulerState *sched = this;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) sched->ring_sched_states[r].destroy();
+        sched->wiring.queue.destroy();
+        sched->wiring.pending_buf = nullptr;
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
+        ready_queue_destroy(&sched->dummy_ready_queue);
+    }
 };
 
 // Scheduler cold-path API is declared as PTO2SchedulerState member functions.
-// See init()/destroy()/print_stats()/print_queues() below the struct definition.
-
-// try_inline_complete_locked: short-circuit NotDeferred completions seen during
-// drain so they don't grow entries[]. Defined here (not in pto_async_wait.h)
-// because PTO2SchedulerState's on_task_complete signature is only known
-// after its full definition above.
-//
-// When the deferred_release_slot_states[] buffer is full, drain it via
-// on_task_release before appending — mirrors the same overflow-drain idiom
-// that scheduler_completion.cpp's inline NotDeferred path uses, so high task
-// rates don't surface as ASYNC_WAIT_OVERFLOW errors.
-inline bool
-AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
-    // Return value (CompletionStats / consumer-walk count) discarded:
-    // async-wait drain path has no Resolve swimlane bar attached.
-#if PTO2_SCHED_PROFILING
-    (void)sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
-#else
-    (void)sink.sched->on_task_complete(slot_state, sink.local_bufs);
-#endif
-    if (*sink.deferred_release_count >= sink.deferred_release_capacity) {
-        while (*sink.deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-            (void)sink.sched->on_task_release(
-                *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx
-            );
-#else
-            sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
-#endif
-        }
-    }
-    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
+// See init_data_from_layout()/wire_arena_pointers()/destroy() inside the struct definition above.
+
+inline bool AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state)
+{
+    sink.sched->on_mixed_task_complete(slot_state);
     sink.inline_completed++;
     return true;
 }
 
 template <bool Profiling>
-inline AsyncPollResult AsyncWaitList::poll_and_complete(
-    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
-    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
-#if PTO2_SCHED_PROFILING
-    ,
-    int thread_idx
-#endif
-) {
+inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched)
+{
     AsyncPollResult result;
     if (!try_lock()) return result;
 
     AsyncWaitList::DrainCompletionSink sink{};
     sink.sched = sched;
-    sink.local_bufs = local_bufs;
-    sink.deferred_release_slot_states = deferred_release_slot_states;
-    sink.deferred_release_count = &deferred_release_count;
-    sink.deferred_release_capacity = deferred_release_capacity;
-#if PTO2_SCHED_PROFILING
-    sink.thread_idx = thread_idx;
-#endif
 
     int32_t drain_err = PTO2_ERROR_NONE;
     drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
-    if (drain_err != PTO2_ERROR_NONE) {
+    if (drain_err != PTO2_ERROR_NONE)
+    {
         result.error_code = drain_err;
         unlock();
         return result;
     }
     result.completed += sink.inline_completed;
 
-    for (int32_t i = count - 1; i >= 0; --i) {
+    for (int32_t i = count - 1; i >= 0; --i)
+    {
         AsyncWaitEntry &entry = entries[i];
         uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
-        for (int32_t c = 0; c < entry.condition_count; c++) {
+        for (int32_t c = 0; c < entry.condition_count; c++)
+        {
             CompletionCondition &cond = entry.conditions[c];
             if (cond.satisfied) continue;
-            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr)
+            {
                 uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
-                if (counter_line != last_invalidated_counter_line) {
+                if (counter_line != last_invalidated_counter_line)
+                {
                     cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
                     last_invalidated_counter_line = counter_line;
                 }
             }
             CompletionPollResult poll = cond.test();
-            if (poll.state == CompletionPollState::FAILED) {
+            if (poll.state == CompletionPollState::FAILED)
+            {
                 result.error_code = poll.error_code;
                 result.failed_slot_state = entry.slot_state;
                 unlock();
                 return result;
             }
-            if (poll.state == CompletionPollState::READY) {
+            if (poll.state == CompletionPollState::READY)
+            {
                 cond.satisfied = true;
                 cond.retire();
                 entry.waiting_completion_count--;
             }
         }
 
-        if (entry.normal_done && entry.waiting_completion_count <= 0) {
-            // Return value (CompletionStats / consumer-walk count) discarded:
-            // deferred-completion drain has no Resolve swimlane bar attached.
-#if PTO2_SCHED_PROFILING
-            (void)sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs);
-#else
-            (void)sched->on_task_complete(*entry.slot_state, local_bufs);
-#endif
-            // Drain deferred_release in place when the buffer fills — same
-            // overflow-drain idiom used by complete_slot_task's inline path
-            // and by try_inline_complete_locked. Without this, large bursts
-            // of completable wait_list entries in a single poll surfaced as
-            // ASYNC_WAIT_OVERFLOW under the MPSC model.
-            if (deferred_release_count >= deferred_release_capacity) {
-                while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-                }
-            }
-            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
+        if (entry.normal_done && entry.waiting_completion_count <= 0)
+        {
+            sched->on_mixed_task_complete(*entry.slot_state);
             result.completed++;
 
             int32_t last = count - 1;
@@ -1577,37 +866,3 @@ inline AsyncPollResult AsyncWaitList::poll_and_complete(
     unlock();
     return result;
 }
-
-// =============================================================================
-// Scheduler Profiling Data
-// =============================================================================
-
-#if PTO2_SCHED_PROFILING
-struct PTO2SchedProfilingData {
-    // Sub-phase cycle breakdown within on_task_complete
-    uint64_t lock_cycle;           // lock_fanout + state store + unlock
-    uint64_t fanout_cycle;         // fanout traversal
-    uint64_t fanin_cycle;          // fanin traversal
-    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
-
-    // Wait times
-    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
-    uint64_t push_wait_cycle;  // CAS contention in push()
-    uint64_t pop_wait_cycle;   // CAS contention in pop()
-
-    // Atomic counts per sub-phase
-    uint64_t lock_atomic_count;
-    uint64_t fanout_atomic_count;
-    uint64_t fanin_atomic_count;
-    uint64_t self_atomic_count;
-    uint64_t pop_atomic_count;
-
-    int64_t complete_count;
-};
-
-/**
- * Get and reset scheduler profiling data for a specific thread.
- * Returns accumulated profiling data and resets counters.
- */
-PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 8e1813367..0dd10cd45 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -8,1102 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-#include "scheduler_context.h"
 
-#include <cinttypes>
-#include <cstdio>
-
-#include "common/unified_log.h"
-#include "aicpu/dep_gen_collector_aicpu.h"
-#include "aicpu/device_phase_aicpu.h"
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/platform_regs.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-#include "common/memory_barrier.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "pto_shared_memory.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// =============================================================================
-// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache)
-// =============================================================================
-
-// Returns true iff this call won the first-writer CAS for sched_error_code — the
-// caller may then write companion fields (e.g. the stall detail) knowing they
-// describe the same observation that owns the latched code.
-static bool latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) {
-    if (header == nullptr || error_code == PTO2_ERROR_NONE) {
-        return false;
-    }
-    // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads.
-    int32_t expected = PTO2_ERROR_NONE;
-    bool won = header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
-    if (won) {
-        header->sched_error_thread.store(thread_idx, std::memory_order_release);
-    }
-    if (thread_idx >= 0 && thread_idx < 32) {
-        header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
-    }
-    return won;
-}
-
-LoopAction SchedulerContext::handle_orchestrator_exit(
-    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count
-) {
-    if (completed_.load(std::memory_order_acquire)) {
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
-    if (orch_err != PTO2_ERROR_NONE) {
-        LOG_ERROR(
-            "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. "
-            "completed_tasks=%d, total_tasks=%d",
-            thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_
-        );
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
-    if (sched_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-
-    bool orch_done = orchestrator_done_.load(std::memory_order_acquire);
-    if (!orch_done) return LoopAction::NONE;
-
-    task_count = total_tasks_;
-    if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) {
-        completed_.store(true, std::memory_order_release);
-        LOG_INFO_V0(
-            "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed),
-            task_count
-        );
-        return LoopAction::BREAK_LOOP;
-    }
-    return LoopAction::NONE;
-}
-
-LoopAction
-SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
-    if (completed_.load(std::memory_order_acquire)) {
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
-    if (orch_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
-    if (sched_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    return LoopAction::NONE;
-}
-
-// =============================================================================
-// Stall diagnostic log format.
-//
-// Every line is self-contained — when scheduler threads emit concurrently and
-// device_log interleaves their output, each line still carries enough context
-// to identify which thread / iteration / object it belongs to.
-//
-// Prefix on every line:
-//   [STALL thread=N idle_iterations=K] CATEGORY ...
-//
-// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL
-// together, so lines with the same idle_iterations belong to one diagnostic
-// round; grep "idle_iterations=N" groups one round's output.
-//
-// Categories (and which thread emits them):
-//   SUMMARY  — completed / total counts and scan totals               (thread 0 only)
-//   TASK     — one per non-completed task scanned from shared rings   (thread 0 only)
-//              - state=RUNNING: includes running_on=[...] cross-ref
-//              - state=READY:   fanin satisfied but no idle core yet
-//              - state=WAIT:    includes missing_deps=N
-//   CLUSTER  — one per cluster owned by this thread                   (every thread)
-//              - busy slot shows kernel + task_id + cond_reg_state;
-//                ANOMALY suffix when COND register is fin while software
-//                still has the slot marked busy.
-//
-// Reader workflow:
-//   1. grep SUMMARY                          -> overall completion status
-//   2. grep "idle_iterations=N TASK"         -> stuck RUNNING task and which
-//                                               core/thread it is on
-//   3. grep "idle_iterations=N CLUSTER.*task=<id>" -> cross-check via the
-//                                                     cluster line (or just
-//                                                     read running_on in step 2)
-// =============================================================================
-
-namespace {
-
-// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines.
-// Layout (idle):    coreN(idle)
-// Layout (busy):    coreN(busy kernel=K task=T cond_reg_state=ack)
-// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY)
-//
-// Healthy busy: COND register reports ack (AICore still executing). fin means
-// AICore wrote completion but AICPU hasn't recycled the running slot yet —
-// either a completion-poll bug or the diagnostic raced the recycle.
-void format_core_status(
-    char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond
-) {
-    if (idle) {
-        snprintf(buf, buf_size, "core%d(idle)", core_id);
-        return;
-    }
-    int32_t kernel = -1;
-    int64_t task_id_raw = -1;
-    if (core_state && core_state->running_slot_state) {
-        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
-        kernel = core_state->running_slot_state->task->kernel_id[subslot];
-        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
-    }
-    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
-    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
-    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
-    if (hw_state == TASK_ACK_STATE) {
-        snprintf(
-            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw,
-            cond_reg_state_str
-        );
-    } else {
-        snprintf(
-            buf, buf_size,
-            "core%d(busy kernel=%d task=%" PRId64
-            " cond_reg_state=%s ANOMALY cond_tok=%d running_tok=%d pending_tok=%d)",
-            core_id, kernel, task_id_raw, cond_reg_state_str, EXTRACT_TASK_ID(cond_reg),
-            core_state->running_reg_task_id, core_state->pending_reg_task_id
-        );
-    }
-}
-
-}  // namespace
-
-int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        const int32_t *ids = core_trackers_[t].core_ids();
-        int32_t n = core_trackers_[t].core_num();
-        for (int32_t i = 0; i < n; i++) {
-            if (ids[i] == core_id) return t;
-        }
-    }
-    return -1;
-}
-
-bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
-    const int32_t *cores = core_trackers_[thread_idx].core_ids();
-    int32_t core_num = core_trackers_[thread_idx].core_num();
-    for (int32_t i = 0; i < core_num; i++) {
-        if (core_exec_states_[cores[i]].running_slot_state != nullptr) {
-            return true;
-        }
-    }
-    return false;
-}
-
-bool SchedulerContext::no_thread_owns_running_task() const {
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        if (self_owns_running_task(t)) return false;
-    }
-    return true;
-}
-
-void SchedulerContext::log_stall_diagnostics(
-    int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-
-    // T0 owns the shared-ring scan; printing it from other threads would
-    // produce identical TASK lines once per scheduler thread.
-    if (thread_idx == 0) {
-        int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
-            int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
-            submitted_in_ring += ring_task_count;
-            // Scan only live task_ids [last_task_alive, current_task_index): slots
-            // wrap (slot = task_id % window), so starting at 0 re-reads each live
-            // slot once per earlier task_id and inflates the scan_* counts.
-            int32_t ring_task_start = ring.fc.last_task_alive.load(std::memory_order_relaxed);
-            for (int32_t si = ring_task_start; si < ring_task_count; si++) {
-                PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
-                PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
-                int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
-                int32_t fi = slot_state.fanin_count;
-                int32_t kid_aic = slot_state.task->kernel_id[0];
-                int32_t kid_aiv0 = slot_state.task->kernel_id[1];
-                int32_t kid_aiv1 = slot_state.task->kernel_id[2];
-                int64_t task_id = static_cast<int64_t>(slot_state.task->task_id.raw);
-                if (st >= PTO2_TASK_COMPLETED) continue;
-                // task_state has no intermediate ready/running value — it
-                // stays PENDING until the worker stores COMPLETED. Classify
-                // by the ground truth instead: a slot is RUNNING iff some
-                // core has it as running_slot_state. A task occupies at most
-                // 3 cores (one cluster), all under the same owner thread by
-                // construction of assign_cores_to_threads.
-                char running_on[192] = {0};
-                int32_t owner = -1;
-                int32_t pos = 0;
-                bool is_running = false;
-                for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) {
-                    if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
-                    is_running = true;
-                    if (owner < 0) owner = find_core_owner_thread(cid);
-                    const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
-                    int32_t written = snprintf(
-                        running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname
-                    );
-                    if (written > 0) pos += written;
-                }
-
-                if (is_running) {
-                    cnt_running++;
-                    if (cnt_running > STALL_DUMP_READY_MAX) continue;
-                    LOG_INFO_V9(
-                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                        " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] "
-                        "running_on=[owner_thread=%d cores=[%s]]",
-                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on
-                    );
-                    continue;
-                }
-                if (rc >= fi) {
-                    cnt_ready++;
-                    if (cnt_ready > STALL_DUMP_READY_MAX) continue;
-                    LOG_INFO_V9(
-                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                        " state=READY   fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]",
-                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1
-                    );
-                    continue;
-                }
-                cnt_waiting++;
-                if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
-                LOG_INFO_V9(
-                    "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                    " state=WAIT    fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d",
-                    thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc
-                );
-            }
-        }
-        int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring;
-        int32_t c = completed_tasks_.load(std::memory_order_relaxed);
-        LOG_INFO_V9(
-            "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d "
-            "scan_ready=%d scan_waiting=%d scan_running=%d",
-            thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running
-        );
-    }
-
-    // CLUSTER lines: one per cluster this thread owns.
-    // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
-    // round-robin assignment in assign_cores_to_threads.
-    int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
-    for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
-        int32_t offset = cli * 3;
-        int32_t aic_id = tracker.get_aic_core_id(offset);
-        int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
-        int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
-        bool aic_idle = tracker.is_aic_core_idle(offset);
-        bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
-        bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
-        int32_t cluster_id = cli * ast + thread_idx;
-        char aic_buf[192], aiv0_buf[192], aiv1_buf[192];
-        format_core_status(
-            aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr
-        );
-        format_core_status(
-            aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id],
-            core_exec_states_[aiv0_id].reg_addr
-        );
-        format_core_status(
-            aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id],
-            core_exec_states_[aiv1_id].reg_addr
-        );
-        LOG_INFO_V9(
-            "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx,
-            idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf
-        );
-    }
-}
-
-void SchedulerContext::log_shutdown_stall_snapshot(
-    int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
-) {
-    LOG_WARN(
-        "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] "
-        "dumping all scheduler threads before emergency shutdown",
-        trigger_thread_idx, trigger_idle_iterations
-    );
-    int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
-    if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) {
-        LOG_ERROR(
-            "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx,
-            thread_count, MAX_AICPU_THREADS
-        );
-        thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
-    }
-    for (int32_t t = 0; t < thread_count; t++) {
-        log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count);
-    }
-}
-
-SchedulerContext::StallClassification SchedulerContext::classify_stall_reason() const {
-    StallClassification cls{};
-    cls.stuck_task_id = -1;
-    cls.stuck_core = -1;
-    int32_t cnt_running = 0, cnt_ready = 0, cnt_waiting = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
-        int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
-        // Active task_ids live in [last_task_alive, current_task_index); slots wrap
-        // (slot = task_id % window), so scanning from 0 re-reads each live slot once
-        // per earlier task_id that mapped to it -- inflating the counts to O(history).
-        // Start at the tail so each live slot is visited exactly once (O(window)).
-        int32_t ring_task_start = ring.fc.last_task_alive.load(std::memory_order_relaxed);
-        for (int32_t si = ring_task_start; si < ring_task_count; si++) {
-            PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
-            PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
-            if (st >= PTO2_TASK_COMPLETED) continue;
-            // Same ground truth as log_stall_diagnostics: task_state stays PENDING
-            // until COMPLETED, so RUNNING is read from core ownership, not the slot.
-            int32_t run_core = -1;
-            for (int32_t cid = 0; cid < cores_total_num_; cid++) {
-                if (core_exec_states_[cid].running_slot_state == &slot_state) {
-                    run_core = cid;
-                    break;
-                }
-            }
-            if (run_core >= 0) {
-                if (cnt_running == 0) {
-                    // Snapshot the non-atomic task pointer once: it can be null on a
-                    // torn slot, and a concurrent writer may flip it mid-read.
-                    PTO2TaskDescriptor *task_ptr = slot_state.task;
-                    cls.stuck_task_id = (task_ptr != nullptr) ? static_cast<int64_t>(task_ptr->task_id.raw) : -1;
-                    cls.stuck_core = run_core;
-                }
-                cnt_running++;
-                continue;
-            }
-            int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
-            int32_t fi = slot_state.fanin_count;
-            if (rc >= fi) {
-                cnt_ready++;
-                continue;
-            }
-            cnt_waiting++;
-        }
-    }
-    cls.cnt_running = cnt_running;
-    cls.cnt_ready = cnt_ready;
-    cls.cnt_waiting = cnt_waiting;
-    cls.completed = completed_tasks_.load(std::memory_order_relaxed);
-    cls.total = total_tasks_;
-    cls.orch_done = orchestrator_done_ ? 1 : 0;
-    cls.detail = classify_stall_detail(cnt_running, cnt_ready, cnt_waiting, cls.orch_done);
-    return cls;
-}
-
-int32_t SchedulerContext::handle_timeout_exit(
-    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
-    int32_t last_progress_count
-#if PTO2_PROFILING
-    ,
-    uint64_t sched_start_ts
-#endif
-) {
-    StallClassification cls = classify_stall_reason();
-    LOG_ERROR(
-        "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d sub_class=%s "
-        "completed=%d/%d running=%d ready=%d waiting=%d orch_done=%d stuck_task_id=%" PRId64 " stuck_core=%d",
-        thread_idx, idle_iterations, idle_iterations, stall_detail_name(cls.detail), cls.completed, cls.total,
-        cls.cnt_running, cls.cnt_ready, cls.cnt_waiting, cls.orch_done, cls.stuck_task_id, cls.stuck_core
-    );
-    // Only the thread that wins the code-100 latch publishes the detail/locators,
-    // keeping the host-visible sub-class consistent with the latched code.
-    if (latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT) && header != nullptr) {
-        header->sched_stall_completed.store(cls.completed, std::memory_order_relaxed);
-        header->sched_stall_total.store(cls.total, std::memory_order_relaxed);
-        header->sched_stall_cnt_running.store(cls.cnt_running, std::memory_order_relaxed);
-        header->sched_stall_cnt_ready.store(cls.cnt_ready, std::memory_order_relaxed);
-        header->sched_stall_cnt_waiting.store(cls.cnt_waiting, std::memory_order_relaxed);
-        header->sched_stall_orch_done.store(cls.orch_done, std::memory_order_relaxed);
-        header->sched_stall_task_id.store(cls.stuck_task_id, std::memory_order_relaxed);
-        header->sched_stall_core.store(cls.stuck_core, std::memory_order_relaxed);
-        // detail published last (release) so a host reading a non-NONE detail
-        // sees the locators above already settled.
-        header->sched_stall_detail.store(cls.detail, std::memory_order_release);
-    }
-    if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-        log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count);
-#if PTO2_PROFILING
-        // Capture the in-flight kernels' partial output before signalling the
-        // cores to exit, so the dump reflects the live stuck state.
-        if (is_dump_args_enabled()) {
-            dump_running_task_outputs<PTO2_SUBTASK_SLOT_COUNT>(
-                thread_idx, cores_total_num_,
-                [this](int32_t cid) {
-                    return core_exec_states_[cid].running_slot_state;
-                },
-                [](ActiveMask active_mask, int raw_subtask_id) {
-                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-                },
-                [this](int32_t func_id) {
-                    return get_function_bin_addr(func_id);
-                }
-            );
-        }
-#endif
-        emergency_shutdown(runtime);
-    }
-#if PTO2_PROFILING
-    uint64_t sched_timeout_ts = get_sys_cnt_aicpu();
-    aicpu_phase_set_window(
-        AicpuPhase::SchedWindow, static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts)
-    );
-#if PTO2_SCHED_PROFILING
-    LOG_INFO_V9(
-        "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts),
-        cycles_to_us(sched_timeout_ts - sched_start_ts)
-    );
-#endif
-#endif
-    return -PTO2_ERROR_SCHEDULER_TIMEOUT;
-}
-
-#if PTO2_PROFILING
-void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, [[maybe_unused]] int32_t cur_thread_completed) {
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-    uint64_t sched_end_ts = get_sys_cnt_aicpu();
-    // Ride the sched window home to the host phase buffer (the host reduces
-    // across sched threads → the `Sched` [STRACE] marker). The verbose
-    // per-thread device-log line below is now opt-in deep-dive.
-    aicpu_phase_set_window(
-        AicpuPhase::SchedWindow, static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts)
-    );
-#if PTO2_SCHED_PROFILING
-    LOG_INFO_V9(
-        "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
-        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
-    );
-
-    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
-                           l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
-    if (sched_total == 0) sched_total = 1;
-
-    {
-        PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
-        uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
-        uint64_t complete_poll =
-            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
-                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
-                0;
-        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
-                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
-                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
-                                      l2_swimlane.sched_dispatch_setup_cycle) :
-                                     0;
-
-        LOG_INFO_V9(
-            "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
-            cycles_to_us(sched_total), cur_thread_completed
-        );
-
-        // fanout / fanin per-thread aggregates live in
-        // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
-        // × core_to_thread).
-        LOG_INFO_V9(
-            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
-            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
-        );
-
-        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
-        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
-                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
-                                           0;
-        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
-                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
-                                       0.0;
-        LOG_INFO_V9(
-            "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
-            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
-            complete_hit_rate
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_lock     : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
-            static_cast<uint64_t>(sp.lock_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_fanout   : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
-            static_cast<uint64_t>(sp.fanout_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_fanin    : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.fanin_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_self     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.self_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
-            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
-        );
-
-        LOG_INFO_V9(
-            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
-            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
-        );
-
-        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
-        LOG_INFO_V9(
-            "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
-            dispatch_poll * 100.0 / d_parent
-        );
-        LOG_INFO_V9(
-            "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
-            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
-            static_cast<uint64_t>(sp.pop_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
-            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
-        );
-
-#if PTO2_SCHED_PROFILING
-        LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
-            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
-            l2_swimlane.phase_wiring_count
-        );
-#else
-        LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
-            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
-        );
-#endif
-
-        LOG_INFO_V9(
-            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
-            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
-        );
-
-        if (cur_thread_completed > 0) {
-            LOG_INFO_V9(
-                "Thread %d:   avg/complete   : %.3fus", thread_idx,
-                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
-            );
-        }
-    }
-    LOG_INFO_V9(
-        "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
-        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
-    );
-#endif
-}
-#endif
-
-// =============================================================================
-// Shutdown: deinit AICore regs for this thread's cores (and PMU finalize if enabled).
-// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op.
-// platform_deinit_aicore_regs is idempotent; safe to call after early completion.
-// =============================================================================
-int32_t SchedulerContext::shutdown(int32_t thread_idx) {
-    const int32_t *cores = core_trackers_[thread_idx].core_ids();
-    int32_t core_num = core_trackers_[thread_idx].core_num();
-    if (core_num == 0) return 0;
-
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_finalize(cores, core_num);
-    }
-#endif
-
-    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num);
-    int32_t rc = 0;
-    for (int32_t i = 0; i < core_num; i++) {
-        int32_t core_id = cores[i];
-        uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
-        if (reg_addr != 0) {
-            // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
-            if (platform_deinit_aicore_regs(reg_addr) != 0) {
-                LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
-                rc = -1;
-            }
-        } else {
-            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
-        }
-    }
-    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
-    return rc;
-}
-
-// =============================================================================
-// Handshake with all AICore workers; discover core type and reg address.
-// =============================================================================
-int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->dev.workers);
-    cores_total_num_ = runtime->dev.worker_count;
-
-    // Validate cores_total_num_ before using as array index
-    if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) {
-        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER);
-        return -1;
-    }
-
-    aic_count_ = 0;
-    aiv_count_ = 0;
-
-    LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
-
-    // Step 1: Write per-core payload addresses and send handshake signal.
-    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
-    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
-        OUT_OF_ORDER_STORE_BARRIER();
-        all_handshakes[i].aicpu_ready = 1;
-    }
-    OUT_OF_ORDER_STORE_BARRIER();
-
-    // Get platform physical cores count for validation
-    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
-
-    // Step 2: Wait for all cores to respond, collect core type and register addresses
-    bool handshake_failed = false;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-
-        while (hank->aicore_regs_ready == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        uint32_t physical_core_id = hank->physical_core_id;
-
-        if (physical_core_id >= max_physical_cores_count) {
-            LOG_ERROR(
-                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
-                max_physical_cores_count
-            );
-            handshake_failed = true;
-            continue;
-        }
-
-        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
-        uint64_t reg_addr = regs[physical_core_id];
-
-        // Initialize AICore registers after discovery (first round)
-        platform_init_aicore_regs(reg_addr);
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-
-        OUT_OF_ORDER_STORE_BARRIER();
-
-        while (hank->aicore_done == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        CoreType type = hank->core_type;
-
-        core_exec_states_[i].reg_addr = reg_addr;
-        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
-
-#if PTO2_PROFILING
-        // Record physical_core_id for PMU init later (CoreExecState has no room
-        // for this field under PTO2_PROFILING).
-        physical_core_ids_[i] = physical_core_id;
-#endif
-#if !PTO2_PROFILING
-        core_exec_states_[i].worker_id = i;
-        core_exec_states_[i].physical_core_id = physical_core_id;
-        core_exec_states_[i].core_type = type;
-#endif
-
-        if (type == CoreType::AIC) {
-            aic_worker_ids_[aic_count_++] = i;
-            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        } else {
-            aiv_worker_ids_[aiv_count_++] = i;
-            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        }
-    }
-
-    if (handshake_failed) {
-        emergency_shutdown(runtime);
-        return -1;
-    }
-
-    LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
-    return 0;
-}
-
-// =============================================================================
-// Assign discovered cores to scheduler threads (cluster-aligned round-robin).
-// =============================================================================
-bool SchedulerContext::assign_cores_to_threads() {
-    // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
-    // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
-    active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-    int32_t cluster_count = aic_count_;
-
-    // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
-    int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
-    int32_t thread_cores_num = max_clusters_per_thread * 3;
-
-    if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) {
-        LOG_ERROR("Can't assign more then 64 cores in per scheduler");
-        return false;
-    }
-
-    LOG_INFO_V0(
-        "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count,
-        active_sched_threads_, aic_count_, aiv_count_
-    );
-
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // Count clusters per thread first (round-robin may distribute unevenly)
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % active_sched_threads_]++;
-    }
-    for (int32_t i = 0; i < active_sched_threads_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % active_sched_threads_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
-
-        LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid);
-    }
-
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        LOG_INFO_V0(
-            "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count()
-        );
-    }
-
-    LOG_INFO_V0(
-        "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num
-    );
-    return true;
-}
-
-// =============================================================================
-// Emergency shutdown: broadcast exit signal to every handshake'd core and
-// deinit their AICore register blocks. Idempotent.
-// =============================================================================
-void SchedulerContext::emergency_shutdown(Runtime *runtime) {
-    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->dev.workers);
-    int32_t timeout_count = 0;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-        if (core_exec_states_[i].reg_addr != 0) {
-            if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) {
-                timeout_count++;
-            }
-        }
-    }
-    if (timeout_count > 0) {
-        LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count);
-    }
-    LOG_WARN("Emergency shutdown complete");
-}
-
-// =============================================================================
-// Lifecycle: init / deinit
-// =============================================================================
-int32_t
-SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) {
-    always_assert(runtime != nullptr);
-
-    // Zero all per-core execution state before handshake
-    memset(core_exec_states_, 0, sizeof(core_exec_states_));
-
-    // Wire thread/transition configuration that handshake/assign need to read.
-    aicpu_thread_num_ = aicpu_thread_num;
-    sched_thread_num_ = sched_thread_num;
-    regs_ = regs_base;
-
-#if PTO2_PROFILING
-    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
-    // header — must be called BEFORE caching the level, otherwise the cached
-    // value would still be 0 (only the binary enable bit has been seeded by
-    // kernel.cpp at this point). Reset the cached level on disabled runs so a
-    // prior enabled launch's level can't leak into the phase-record gates in
-    // scheduler_dispatch.
-    if (is_l2_swimlane_enabled()) {
-        l2_swimlane_aicpu_init(runtime->dev.worker_count);
-        l2_swimlane_level_ = get_l2_swimlane_level();
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            // Sched-phase pool count must match the dump_args_init thread count
-            // below. This block runs before assign_cores_to_threads, so the
-            // active_sched_threads_ member isn't set yet — recompute the same
-            // normalization locally: sched_thread_num_ <= 0 means "use all AICPU
-            // threads as scheduler threads" (see assign_cores_to_threads'
-            // active_sched_threads_). Without it, init_phase would prime zero
-            // sched pools and all sched_phase emits would silently drop.
-            const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-            // Orchestration is always single-threaded, so orch-phase is one pool
-            // (ordinal 0) — see record_orch_phase.
-            const int orch_phase_threads = 1;
-            l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads);
-        }
-    } else {
-        l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
-    }
-#endif
-
-    // Discover cores and assign to scheduler threads.
-    int32_t rc = handshake_all_cores(runtime);
-    if (rc != 0) {
-        LOG_ERROR("handshake_all_cores failed");
-        return rc;
-    }
-    if (!assign_cores_to_threads()) {
-        return -1;
-    }
-
-    // Profiling-subsystem buffer/state init: single-threaded cold path, so the
-    // "do it once" guarantee is structural (no CAS needed). Runs after
-    // handshake_all_cores / assign_cores_to_threads because pmu_aicpu_init needs
-    // physical_core_ids_ / cores_total_num_. Mirrors the l2_swimlane_aicpu_init
-    // convention above; the per-thread *_set_orch_thread_idx setters stay on the
-    // orchestrator thread (see aicpu_executor.cpp).
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        dump_args_init(active_sched_threads_);
-    }
-    if (is_pmu_enabled()) {
-        pmu_aicpu_init(physical_core_ids_, cores_total_num_);
-        LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
-    }
-    // dep_gen is host-driven (SubmitTrace) — runtime-gated by the host flag —
-    // and compiles out with the other profiling subsystems at PTO2_PROFILING=0.
-    // init() only pops the initial buffer from instance 0's free_queue; the
-    // orchestrator thread still records its idx via
-    // dep_gen_aicpu_set_orch_thread_idx() before the first record_submit.
-    if (is_dep_gen_enabled()) {
-        dep_gen_aicpu_init();
-    }
-#endif
-
-    // Initialize task counters. Task count comes from PTO2 shared memory.
-    if (runtime->get_gm_sm_ptr()) {
-        auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
-        // Read at one-time boot init, before the SM is reset for the run, so a
-        // ring not yet written holds uninitialized memory (0xbe... under ASAN's
-        // malloc-fill). Sum in int64 and only count rings whose value is a
-        // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold
-        // more than the scope cap. This rejects any garbage pattern (negative
-        // or positive), so uninitialized rings contribute 0 (the correct boot
-        // count) while valid counts still add up, with no signed overflow.
-        int64_t pto2_count = 0;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-            if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
-        }
-        total_tasks_ = static_cast<int32_t>(pto2_count);
-    } else {
-        total_tasks_ = 0;
-    }
-    completed_tasks_.store(0, std::memory_order_release);
-
-    // Device orchestration: the orchestrator thread flips this when the graph is built.
-    orchestrator_done_.store(false, std::memory_order_release);
-
-    // Clear per-core dispatch payloads
-    memset(payload_per_core_, 0, sizeof(payload_per_core_));
-    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
-
-    // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
-    // This is done once at startup and never modified afterwards.
-    for (int32_t t = 0; t < sched_thread_num_; t++) {
-        CoreTracker &tracker = core_trackers_[t];
-        for (int32_t c = 0; c < tracker.get_cluster_count(); c++) {
-            int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
-            auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
-            auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
-            payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
-            payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
-            payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
-            payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
-        }
-    }
-
-    func_id_to_addr_ = runtime->dev.func_id_to_addr_;
-
-    return 0;
-}
-
-void SchedulerContext::deinit() {
-    // Reset all per-core execution state
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i] = {};
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // No per-core memset of payload_per_core_ / deferred_slab_per_core_ here
-    // (~300 KB across all cores). Both are fully re-initialized at dispatch
-    // before they can be read: dispatch_task sets deferred_slab->count = 0 /
-    // error_code = NONE and build_payload() overwrites every payload field
-    // (function addr, args[], contexts, not_ready) on the exact [core][buf_idx]
-    // about to run. The consumer side cannot reach a stale slot either: the
-    // drain only services a core's running_reg_task_id, and the loop above
-    // already reset every core_exec_states_[].running/pending_reg_task_id to
-    // AICPU_TASK_INVALID — so no FIN for an undispatched slot is processed, and
-    // the count-gated consumer never reads entries[] past the fresh count.
-
-    // Reset sync-start drain coordination — a previous run that aborted mid-drain
-    // would otherwise leave dirty pending/elected/ack state for the next reuse.
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-    drain_state_.pending_task.store(nullptr, std::memory_order_release);
-
-    // Reset task counters and orchestrator state
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_ = 0;
-    orchestrator_done_.store(false, std::memory_order_release);
-    completed_.store(false, std::memory_order_release);
-
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    aicpu_thread_num_ = 0;
-    sched_thread_num_ = 0;
-    active_sched_threads_ = 0;
-    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
-        core_trackers_[t] = CoreTracker{};
-    }
-
-    regs_ = 0;
-    sched_ = nullptr;
-    rt_ = nullptr;
-    func_id_to_addr_ = nullptr;
-}
-
-void SchedulerContext::bind_runtime(PTO2Runtime *rt) {
-    rt_ = rt;
-    sched_ = &rt->scheduler;
-}
-
-void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runtime, int32_t thread_idx) {
-    while (!orchestration_done() && !completed_.load(std::memory_order_acquire)) {
-        if (thread_idx == 0 && sched_ != nullptr) {
-            // Use the wiring subsystem's normal batch/backoff policy while
-            // waiting. This still honors orch_needs_drain/producer_blocked
-            // signals without force-draining an empty queue every spin.
-            int wired = sched_->drain_wiring_queue(/*force_drain=*/false);
-            if (wired > 0) {
-                continue;
-            }
-        }
-        if (sched_ != nullptr && sched_->sm_header != nullptr &&
-            check_idle_fatal_error(thread_idx, sched_->sm_header, runtime) == LoopAction::BREAK_LOOP) {
-            break;
-        }
-        SPIN_WAIT_HINT();
-    }
-}
-
-// =============================================================================
-// Post-orchestration bookkeeping. Runs on the orchestrator thread once the
-// build phase finishes; folds inline-completed tasks, flips orchestrator_done_,
-// and drives the orchestrator → scheduler core transition (or fatal shutdown).
-// =============================================================================
-void SchedulerContext::on_orchestration_done(
-    Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks
-) {
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
-        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
-        // The orchestrator has no scheduler-phase pool of its own — those belong
-        // to the scheduler threads and are flushed in scheduler_dispatch.
-        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
-    }
-#endif
-
-    total_tasks_ = total_tasks;
-
-    // Fold tasks completed inline during orchestration
-    int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
-    if (inline_completed > 0) {
-        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
-#if PTO2_SCHED_PROFILING
-        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
-#endif
-    }
-    orchestrator_done_.store(true, std::memory_order_release);
-
-    // Check for fatal error from orchestration; if so, shut down immediately.
-    int32_t orch_err = 0;
-    if (sched_->sm_header) {
-        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
-    }
-    if (orch_err != PTO2_ERROR_NONE) {
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-    }
-
-#if PTO2_PROFILING
-    // Write the core-to-thread mapping so the profiling data reflects the
-    // scheduler threads' final core distribution.
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
-        for (int32_t t = 0; t < active_sched_threads_; t++) {
-            l2_swimlane_aicpu_write_core_assignments_for_thread(
-                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
-            );
-        }
-    }
-#endif
-}
+// Polling redesign: completion / dispatch / cold-path logic is now inlined in
+// scheduler/scheduler_context.h and scheduler/pto_scheduler.h. This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
index 774589865..0dd10cd45 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
@@ -8,607 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-#include "scheduler_context.h"
 
-#include <algorithm>
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/platform_regs.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/memory_barrier.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// Performance profiling headers
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-
-// =============================================================================
-// Dual-slot state machine helpers
-// =============================================================================
-
-namespace {
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-}
-
-// Pure function: read register result -> SlotTransition (no side effects).
-SlotTransition SchedulerContext::decide_slot_transition(
-    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated
-) {
-    SlotTransition t;
-    if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id) {
-        t.matched = true;
-        t.running_done = true;  // Serial execution: pending event implies running done
-        t.running_freed = true;
-        t.pending_freed = true;
-        if (reg_state == TASK_FIN_STATE) {
-            t.pending_done = true;  // Case 1: pending FIN
-        }
-        // else: Case 2: pending ACK (pending_done stays false)
-    } else if (reg_task_id == running_id) {
-        if (reg_state == TASK_FIN_STATE) {
-            if (pending_id == AICPU_TASK_INVALID) {
-                // Case 3.2: running FIN, no pending -> core goes idle
-                t.matched = true;
-                t.running_done = true;
-                t.running_freed = true;
-            } else if (pending_gated) {
-                // Case 3.3: running FIN, pending is a SPECULATIVE GATED task. The
-                // Case 3.1 "wait for the pending's ack" shortcut assumes the AICore
-                // immediately runs the pending task; a gated task instead spins on
-                // its doorbell and never acks until its producer completes — and
-                // that producer's completion depends on collecting THIS running FIN.
-                // Waiting would deadlock. Complete the running FIN now and promote
-                // the gated task (it then skip-gates until its doorbell). pending is
-                // NOT freed (it promotes, not retires) so the bitmap update keeps the
-                // core off-limits — no second gated block, no doorbell overwrite.
-                t.matched = true;
-                t.running_done = true;
-                t.running_freed = true;
-            }
-            // Case 3.1: running FIN, NON-gated pending exists -> skip (transient
-            // state). Case 1/2 (pending ack/FIN) completes running implicitly.
-        } else {
-            // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
-            t.matched = true;
-            t.pending_freed = true;
-        }
-    }
-    return t;
-}
-
-// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
-void SchedulerContext::complete_slot_task(
-    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
-    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
-    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
-#if PTO2_PROFILING
-    ,
-    uint64_t dispatch_ts, uint64_t finish_ts
-#endif
-) {
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#else
-    (void)hank;
-#endif
-    // MPSC fast-path is opt-in per task: only tasks with at least one subtask
-    // that registered a deferred condition route through the mailbox. Pure
-    // non-deferred tasks complete inline on this thread (matching pre-MPSC
-    // behavior — keeps the common case parallelized across scheduler threads
-    // instead of serializing through the single consumer). The
-    // any_subtask_deferred flag on slot_state is the discriminator; it's set
-    // (release) before on_subtask_complete and read (acquire) after, so the
-    // last subtask sees flag writes from any earlier subtask of the same task.
-    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
-    bool defer_completion_to_consumer = false;
-
-    if (slot_state.payload != nullptr) {
-        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
-        int32_t slab_err = deferred_slab->error_code;
-        if (slab_err != PTO2_ERROR_NONE) {
-            int32_t expected = PTO2_ERROR_NONE;
-            sched_->sm_header->sched_error_code.compare_exchange_strong(
-                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            completed_.store(true, std::memory_order_release);
-            return;
-        }
-
-        uint32_t cond_count = deferred_slab->count;
-        if (cond_count > MAX_COMPLETIONS_PER_TASK) {
-            int32_t expected = PTO2_ERROR_NONE;
-            sched_->sm_header->sched_error_code.compare_exchange_strong(
-                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            completed_.store(true, std::memory_order_release);
-            return;
-        }
-
-        if (cond_count > 0) {
-            // Publish "this task is deferred" before on_subtask_complete so the
-            // acq_rel fetch_add inside on_subtask_complete makes the flag
-            // visible to whichever subtask sees task_complete=true (which may
-            // be this thread or a later one).
-            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
-
-            const PTO2TaskId token = slot_state.task->task_id;
-            for (uint32_t i = 0; i < cond_count; ++i) {
-                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
-                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
-                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
-                    SPIN_WAIT_HINT();
-                }
-            }
-        }
-    }
-
-    bool task_complete = sched_->on_subtask_complete(slot_state);
-
-#if PTO2_PROFILING
-    // Sub-block retire that did not finish the slot: record it so the poll
-    // iteration becomes visible on the scheduler lane (the SPMD harvest tail).
-    if (!task_complete && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        l2_swimlane.phase_subretire_count++;
-    }
-#endif
-
-    if (task_complete && slot_state.payload != nullptr &&
-        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
-        // Some subtask of this task registered conditions; finish the
-        // registration by handing the slot_state off to the consumer.
-        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state))) {
-            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
-            SPIN_WAIT_HINT();
-        }
-        defer_completion_to_consumer = true;
-    }
-
-    if (task_complete && !defer_completion_to_consumer) {
-#if PTO2_PROFILING
-        if (is_dump_args_enabled()) {
-            dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
-                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
-                [](ActiveMask active_mask, int raw_subtask_id) {
-                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-                },
-                [this](int32_t func_id) {
-                    return get_function_bin_addr(func_id);
-                }
-            );
-        }
-#endif
-#if PTO2_PROFILING
-        // Time Resolve (walk the consumer list, decrement each consumer's
-        // fanin, push the newly-ready ones, ring doorbells for speculative
-        // hits) so it renders as a child bar nested inside this iteration's
-        // Complete bar. The 1 µs floor below filters out the ~88% of tasks
-        // with 1-2 consumers (~500 ns Resolve) so only the long broadcast /
-        // reduction walks stand out on the lane.
-        uint64_t resolve_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-        // [[maybe_unused]] silences -Werror=unused-but-set-variable on the
-        // profiling-flags-smoke build path where PTO2_PROFILING is OFF and
-        // the Resolve emit below is excluded.
-        [[maybe_unused]] uint32_t consumers_resolved = 0;
-#if PTO2_SCHED_PROFILING
-        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
-        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
-        // by the otc_* log lines). It returns CompletionStats whose
-        // `fanout_edges` is the consumer-walk count.
-        consumers_resolved = sched_->on_task_complete(slot_state, thread_idx, local_bufs).fanout_edges;
-#else
-        consumers_resolved = sched_->on_task_complete(slot_state, local_bufs);
-#endif
-#if PTO2_PROFILING
-        if (resolve_t0 != 0) {
-            uint64_t resolve_t1 = get_sys_cnt_aicpu();
-            // Filter: drop Resolve bars under 1 µs so the lane shows only
-            // resolves that did meaningful work (high consumer counts or
-            // doorbells). 50 cycles @ 50 MHz = 1 µs (PLATFORM_PROF_SYS_CNT_FREQ
-            // is the device sys-cnt frequency).
-            constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
-            if (resolve_t1 - resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Resolve, resolve_t0, resolve_t1, l2_swimlane.sched_loop_count,
-                    consumers_resolved
-                );
-            }
-        }
-        l2_swimlane.phase_complete_count++;
-#endif
-        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
-            deferred_release_slot_states[deferred_release_count++] = &slot_state;
-        } else {
-            LOG_INFO_V9("Thread %d: release", thread_idx);
-            while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                // SCHED_PROFILING variant takes thread_idx for the per-thread
-                // atomic counter side-effects. The return value is unused.
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-            deferred_release_slot_states[deferred_release_count++] = &slot_state;
-        }
-        completed_this_turn++;
-    }
-
-#if PTO2_PROFILING
-    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
-    // {start, end, task_token_raw}, host resolves func_id/core_type from
-    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
-    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
-    // timestamps via complete_task. Bypassing here saves the per-completion
-    // hot-path cost (counter inc + ring lookup + record store + wmb + buffer
-    // rotation bookkeeping) for runs that only want AICore timing.
-    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-#if PTO2_SCHED_PROFILING
-        uint64_t t_perf_start = get_sys_cnt_aicpu();
-#endif
-
-        if (l2_swimlane_aicpu_complete_task(
-                core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), dispatch_ts, finish_ts
-            ) != 0) {
-            LOG_ERROR(
-                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
-                static_cast<uint64_t>(slot_state.task->task_id.raw)
-            );
-        }
-#if PTO2_SCHED_PROFILING
-        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
-#endif
-    }
-
-    if (is_pmu_enabled()) {
-        pmu_aicpu_record_task(
-            core_id, thread_idx, slot_state.task->task_id.raw,
-            slot_state.task->kernel_id[static_cast<int32_t>(subslot)], hank[core_id].core_type
-        );
-    }
-#endif
-}
-
-// Promote pending slot data to running slot. Clears pending fields.
-void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
-    core.running_slot_state = core.pending_slot_state;
-    core.running_reg_task_id = core.pending_reg_task_id;
-    core.running_subslot = core.pending_subslot;
-#if PTO2_PROFILING
-    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
-#endif
-    core.pending_slot_state = nullptr;
-    core.pending_reg_task_id = AICPU_TASK_INVALID;
-}
-
-// Clear running slot (core becomes idle).
-void SchedulerContext::clear_running_slot(CoreExecState &core) {
-    core.running_slot_state = nullptr;
-    core.running_reg_task_id = AICPU_TASK_INVALID;
-}
-
-void SchedulerContext::check_running_cores_for_completion(
-    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
-    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-    PTO2LocalReadyBuffer *local_bufs
-) {
-#if PTO2_SCHED_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#endif
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    auto running_core_states = tracker.get_all_running_cores();
-    while (running_core_states.has_value()) {
-        int32_t bit_pos = running_core_states.pop_first();
-        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
-        CoreExecState &core = core_exec_states_[core_id];
-
-        // Skip gated speculative cores. A STAGED task is parked on this core
-        // waiting for its doorbell — it physically cannot ACK/FIN yet, so
-        // reading its COND (MMIO, and the core is hot-spinning on its own SPR)
-        // every poll is pure waste that drags out the completion phase. The
-        // doorbell (try_speculative_release) flips spec_state to DISPATCHED, at
-        // which point the core becomes pollable again and its FIN is caught.
-        // Cheap cacheable load; no MMIO. Pending slot is empty while gated.
-        {
-            PTO2TaskSlotState *rs = core.running_slot_state;
-            if (rs != nullptr && rs->payload != nullptr &&
-                rs->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) {
-                continue;
-            }
-        }
-
-        // --- Judgment phase: read register, derive transition ---
-        // Use the precomputed cond_ptr (resolved once in handshake) to skip
-        // the reg_offset switch and reg_addr addition on every poll.
-        uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
-        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
-        // rmb() pins any AICore-published cacheable reads downstream of the
-        // FIN observation. Replaces the post-`__sync_synchronize` that the
-        // old read_reg() helper carried implicitly.
-        rmb();
-        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
-        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
-
-#if PTO2_SCHED_PROFILING
-        if (l2_swimlane.l2_swimlane_enabled) {
-            l2_swimlane.complete_probe_count++;
-        }
-#endif
-
-        // A pending task is "gated" when it is a speculative pre-stage still
-        // waiting on its doorbell (STAGED): it will not ack on the producer's FIN,
-        // so the Case 3.1 wait-for-pending-ack shortcut would deadlock. Detect it
-        // so decide_slot_transition completes the running FIN and promotes it.
-        bool pending_gated =
-            (core.pending_slot_state != nullptr && core.pending_slot_state->payload != nullptr &&
-             core.pending_slot_state->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING);
-        SlotTransition t = decide_slot_transition(
-            reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id, pending_gated
-        );
-        if (!t.matched) continue;
-
-#if PTO2_SCHED_PROFILING
-        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
-            l2_swimlane.complete_hit_count++;
-        }
-#endif
-
-#if PTO2_PROFILING
-        // Capture finish_ts at the FIN observation point — right after rmb()
-        // above pinned the cacheable AICore reads downstream of the register
-        // load, and BEFORE any fanin / deferred-release work. Anything later
-        // (slot transition apply, complete_slot_task fanin processing) would
-        // charge AICPU completion-processing cost to the (end → finish)
-        // span, masking the actual FIN-delivery latency.
-        uint64_t finish_ts = 0;
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
-            finish_ts = get_sys_cnt_aicpu();
-        }
-#endif
-
-        // --- Apply phase: execute actions based on transition ---
-
-        // 1. Complete finished tasks (capture pointers before modifying core state)
-        if (t.pending_done) {
-            complete_slot_task(
-                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
-                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
-#if PTO2_PROFILING
-                ,
-                core.pending_dispatch_timestamp, finish_ts
-#endif
-            );
-            cur_thread_completed++;
-        }
-        if (t.running_done) {
-            complete_slot_task(
-                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
-                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
-#if PTO2_PROFILING
-                ,
-                core.running_dispatch_timestamp, finish_ts
-#endif
-            );
-            cur_thread_completed++;
-        }
-
-        // 2. Update slot data
-        if (t.running_freed) {
-            if (core.pending_slot_state != nullptr && !t.pending_done) {
-                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
-            } else {
-                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
-                if (t.pending_done) {
-                    // Case 1: pending FIN observed directly -- clear stale pending fields.
-                    // Without this, pending_reg_task_id retains a stale value that blocks
-                    // clear_pending_occupied and permanently degrades pipelining.
-                    core.pending_slot_state = nullptr;
-                    core.pending_reg_task_id = AICPU_TASK_INVALID;
-                }
-            }
-        }
-
-        // 3. Update tracker bitmap
-        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
-        if (is_idle) {
-            tracker.change_core_state(bit_pos);       // Mark idle
-            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
-        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
-            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
-            // when no pending task is currently held. Otherwise pending slot is occupied
-            // by a pre-loaded task and must stay protected.
-            tracker.clear_pending_occupied(bit_pos);
-        }
-
-        // 4. Progress signal (only when running task completes)
-        if (t.running_done) {
-            made_progress = true;
-        }
-    }
-}
-
-// =============================================================================
-// sync_start drain protocol
-// =============================================================================
-
-// Take ownership of slot_state and signal all threads to enter drain mode.
-// Returns true if this thread won the CAS and owns the drain slot.
-// Returns false if another thread already holds drain; caller must re-push slot_state.
-//
-// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and
-// reset election flag, then release-store block_num.  Other threads acquire-load
-// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible.
-bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
-    int32_t expected = 0;
-    if (!drain_state_.sync_start_pending.compare_exchange_strong(
-            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
-        )) {
-        return false;  // Another thread already holds the drain slot.
-    }
-    // We own the drain slot.  Store the task and reset election flag before making it visible.
-    drain_state_.pending_task.store(slot_state, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
-    // Release store: all stores above are now visible to any thread that
-    // acquire-loads sync_start_pending and sees block_num > 0.
-    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
-    return true;
-}
-
-// Count total available resources across all scheduler threads for a given shape.
-int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
-    int32_t total = 0;
-    for (int32_t t = 0; t < active_sched_threads_; t++) {
-        if (shape == PTO2ResourceShape::MIX) {
-            total += core_trackers_[t].count_mix_running_clusters(core_mask);
-        } else {
-            total += core_trackers_[t].get_idle_core_offset_states(shape).count();
-        }
-    }
-    return total;
-}
-
-// Drain worker: dispatch all blocks in one pass across all threads' trackers.
-// Called only when global resources >= block_num, so one pass always suffices.
-// All other threads are spinning -- the drain worker has exclusive tracker access.
-void SchedulerContext::drain_worker_dispatch(int32_t block_num) {
-    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
-    if (!slot_state) {
-        drain_state_.sync_start_pending.store(0, std::memory_order_release);
-        return;
-    }
-    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-    uint8_t core_mask = slot_state->active_mask.core_mask();
-
-    for (int32_t t = 0;
-         t < active_sched_threads_ && slot_state->next_block_idx.load(std::memory_order_relaxed) < block_num; t++) {
-        auto valid = (shape == PTO2ResourceShape::MIX) ?
-                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
-                         core_trackers_[t].get_idle_core_offset_states(shape);
-        int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
-        int32_t remaining = slot_state->logical_block_num - start;
-        int32_t claim = std::min(valid.count(), remaining);
-        slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int handle_count = 0;
-        for (int32_t b = 0; b < claim; b++) {
-            auto core_offset = valid.pop_first();
-            handle_count += prepare_block_for_dispatch(
-                t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]
-            );
-        }
-        wmb();
-        uint64_t dispatch_ts = 0;
-#if PTO2_PROFILING
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-            dispatch_ts = get_sys_cnt_aicpu();
-        }
-#endif
-        for (int i = 0; i < handle_count; i++) {
-            publish_subtask_to_core(handles[i], dispatch_ts);
-        }
-    }
-
-    // All blocks dispatched -- clear drain state.
-    // Release fence ensures tracker mutations are visible to threads that
-    // acquire-load sync_start_pending == 0 and resume normal operation.
-    std::atomic_thread_fence(std::memory_order_release);
-    drain_state_.pending_task.store(nullptr, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-}
-
-// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
-//
-// Protocol (single-stage ack barrier):
-//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
-//      until all ack bits are set.
-//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
-//   2. Election: one thread wins the CAS and becomes the drain worker.
-//      If resources are insufficient, reset ack/election fields and return --
-//      all threads resume completion polling to free running cores, then retry.
-//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
-//      Non-elected threads spin-wait until sync_start_pending == 0.
-//      During dispatch the elected thread has exclusive tracker access.
-void SchedulerContext::handle_drain_mode(int32_t thread_idx) {
-    // Every spin in this function honors is_completed(): once the run latches
-    // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave
-    // the dispatch loop and stop participating in the drain. A thread parked in a
-    // drain spin would then wait forever for acks / a gate-open that can no longer
-    // arrive -- the AICPU watchdog never fires here because these spins live
-    // outside the dispatch loop's wall-clock budget, so the hang escalates straight
-    // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on
-    // completed_ is always safe: any pending sync_start task is either already
-    // dispatched (a stale re-popped slot) or moot under teardown, and deinit()
-    // resets drain_state_ before the next run, so leaving it dirty is harmless.
-    // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
-    int32_t block_num;
-    do {
-        if (is_completed()) return;
-        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
-    } while (block_num < 0);
-    if (block_num == 0) return;
-
-    uint32_t all_acked = (1u << active_sched_threads_) - 1;
-
-    // Ack barrier -- signal this thread has stopped dispatch.
-    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
-
-    // Spin until all threads have acked.
-    // If our bit is cleared while waiting, elected reset due to insufficient resources.
-    while (true) {
-        if (is_completed()) return;
-        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
-        if ((ack & all_acked) == all_acked) break;
-        if ((ack & (1u << thread_idx)) == 0) return;
-        SPIN_WAIT_HINT();
-    }
-
-    // Election -- exactly one thread wins the CAS.
-    int32_t expected = 0;
-    drain_state_.drain_worker_elected.compare_exchange_strong(
-        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
-    );
-
-    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
-        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
-        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
-            if (is_completed()) return;
-            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
-            SPIN_WAIT_HINT();
-        }
-        return;
-    }
-
-    // Elected: check if global resources are sufficient.
-    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
-    if (slot_state == nullptr) {
-        // pending_task is observed null only when a concurrent drain completion
-        // already cleared it (drain_worker_dispatch nulls it before reopening the
-        // gate). That drain is done and this is a stale-elected thread, so just
-        // release the election lock and return. Do NOT clear drain_ack_mask or
-        // sync_start_pending: a *new* drain run may already be active and
-        // accumulating acks, and zeroing them would corrupt it into a hang.
-        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-        return;
-    }
-    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-    int32_t available = count_global_available(shape, slot_state->active_mask.core_mask());
-
-    if (available < block_num) {
-        // Insufficient resources -- reset drain fields so threads can resume
-        // completion polling to free running cores, then retry.
-        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-        return;
-    }
-
-    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
-    drain_worker_dispatch(block_num);
-}
+// Polling redesign: completion / dispatch / cold-path logic is now inlined in
+// scheduler/scheduler_context.h and scheduler/pto_scheduler.h. This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 02962864d..91e779e02 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -13,17 +13,62 @@
 
 #include "aicpu/platform_regs.h"
 #include "common/l2_swimlane_profiling.h"
-#include "common/unified_log.h"
-#include "scheduler_types.h"
+#include "scheduler/scheduler_types.h"
 
 #include "scheduler/pto_scheduler.h"
 
 #include "aicore_completion_mailbox.h"
 #include "pto2_dispatch_payload.h"
 
-// These macros are defined in runtime.h, but we cannot include it here
-// (it pulls in Handshake which we only forward-declare).  Mirror the
-// authoritative values so the class layout compiles standalone.
+#include <cinttypes>
+#include <cstdio>
+#include "runtime.h"
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+#include "aicpu/device_time.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "common/unified_log.h"
+#include "spin_hint.h"
+// SchedulerThreadProfile is defined in scheduler_types.h (above) so the
+// drain_wiring_queue method in pto_scheduler.h can take a pointer to it.
+
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+// Latch a scheduler error into the shared-memory header: no-op for a null header or PTO2_ERROR_NONE; the caller whose CAS from PTO2_ERROR_NONE succeeds also records its thread index; every caller with thread_idx in [0, 32) ORs its bit into sched_error_bitmap.
+inline void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code)
+{
+    if (!header || error_code == PTO2_ERROR_NONE) return;
+    if (int32_t no_error = PTO2_ERROR_NONE; header->sched_error_code.compare_exchange_strong(no_error, error_code, std::memory_order_acq_rel)) header->sched_error_thread.store(thread_idx, std::memory_order_release);
+    if (const auto bit = static_cast<uint32_t>(thread_idx); bit < 32U) header->sched_error_bitmap.fetch_or(1U << bit, std::memory_order_acq_rel);
+}
+
+// Write a one-line status for core_id into buf: "core<N>(idle)" when idle; otherwise the running slot's kernel and task id (-1 when no slot is attached) plus the state decoded from the COND register, which is read only on the busy path.
+inline void format_core_status(char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond)
+{
+    if (idle)
+    {
+        snprintf(buf, buf_size, "core%d(idle)", core_id);
+        return;
+    }
+    int32_t kernel_id = -1;
+    int64_t task_raw = -1;
+    if (core_state != nullptr && core_state->running_slot_state != nullptr)
+    {
+        kernel_id = core_state->running_slot_state->task->kernel_id[static_cast<int32_t>(core_state->running_subslot)];
+        task_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
+    }
+    uint64_t cond_val = read_reg(reg_addr_for_cond, RegId::COND);
+    int32_t cond_state = EXTRACT_TASK_STATE(cond_val);
+    // Any state other than ACK is labelled "fin" and tagged ANOMALY.
+    if (cond_state != TASK_ACK_STATE) snprintf(buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s ANOMALY)", core_id, kernel_id, task_raw, "fin");
+    else snprintf(buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel_id, task_raw, "ack");
+}
+
 #ifndef RUNTIME_MAX_WORKER
 #define RUNTIME_MAX_WORKER 72
 #endif
@@ -36,83 +81,381 @@ class Runtime;
 struct Handshake;
 struct PTO2Runtime;
 
-/**
- * SchedulerContext: owns all scheduler-side state and methods.
- *
- * Held as a member of AicpuExecutor (sched_ctx_).  The single public entry
- * point is resolve_and_dispatch(), called once per scheduler thread.
- *
- * All dispatch/completion/drain/cold-path logic is implemented as private
- * member methods, split across three .cpp files by responsibility:
- *   - scheduler_completion.cpp  (completion polling, drain protocol)
- *   - scheduler_cold_path.cpp   (exit checks, stall diagnostics, profiling)
- *   - scheduler_dispatch.cpp    (task dispatch loop and helpers)
- */
-class SchedulerContext {
+class SchedulerContext
+{
 public:
-    // =========================================================================
-    // Lifecycle
-    // =========================================================================
-
-    // Initialize scheduler state from the given runtime and thread layout.
-    // - Discovers cores via handshake_all_cores()
-    // - Assigns cores to scheduler threads
-    // - Resets task counters, payloads, per-core GlobalContext
-    // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
-    // - Captures AICore-register base (consumed by handshake_all_cores())
-    // Returns 0 on success, negative on failure (handshake / assignment error).
-    int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base);
+    int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base)
+    {
+        always_assert(runtime != nullptr);
+        // Per-run setup. Returns 0 on success, the handshake's non-zero rc, or -1 if core assignment fails.
+        // Zero all per-core execution state before handshake
+        memset(core_exec_states_, 0, sizeof(core_exec_states_));
+
+        // Wire thread/transition configuration that handshake/assign need to read.
+        aicpu_thread_num_ = aicpu_thread_num;
+        sched_thread_num_ = sched_thread_num;
+        regs_ = regs_base;
+
+        // Discover cores and assign to scheduler threads.
+        int32_t rc = handshake_all_cores(runtime);
+        if (rc != 0) return rc;
+        if (!assign_cores_to_threads()) return -1;
+
+        // Initialize task counters. Task count comes from PTO2 shared memory.
+        if (runtime->get_gm_sm_ptr())
+        {
+            auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
+            int64_t pto2_count = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+                if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;  // rings outside (0, cap] are not counted
+            }
+            total_tasks_ = static_cast<int32_t>(pto2_count);
+        }
+        else
+        {
+            total_tasks_ = 0;  // no shared-memory pointer: start from zero
+        }
+        completed_tasks_.store(0, std::memory_order_release);
+
+        // Device orchestration: the orchestrator thread flips this when the graph is built.
+        orchestrator_done_ = false;
+
+        // Clear per-core dispatch payloads
+        memset(payload_per_core_, 0, sizeof(payload_per_core_));
+        memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+        // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
+        // This is done once at startup and never modified afterwards.
+        for (int32_t t = 0; t < sched_thread_num_; t++)
+        {
+            CoreTracker &tracker = core_trackers_[t];
+            for (int32_t c = 0; c < tracker.get_cluster_count(); c++)
+            {
+                int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
+                auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
+                auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
+                payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;  // both payload buffers of AIV0 get id 0
+                payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
+                payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;  // both payload buffers of AIV1 get id 1
+                payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
+            }
+        }
+
+        func_id_to_addr_ = runtime->dev.func_id_to_addr_;  // cache the runtime's function-id -> address table pointer
+
+        return 0;
+    }
 
     // Reset all SchedulerContext-owned state to its post-construction defaults.
     // Called by AicpuExecutor::deinit() during per-run teardown.
-    void deinit();
+    void deinit()
+    {
+        // Reset all per-core execution state
+        for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++)
+        {
+            core_exec_states_[i] = {};  // value-reset first, then mark both register task ids invalid
+            core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+            core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+        }
 
-    // =========================================================================
-    // Per-thread execution entry points (called by AicpuExecutor::run)
-    // =========================================================================
+        // Clear per-core dispatch payloads
+        memset(payload_per_core_, 0, sizeof(payload_per_core_));
+        memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+        // Reset sync-start drain coordination — a previous run that aborted mid-drain
+        // would otherwise leave dirty pending/elected/ack state for the next reuse.
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.pending_task.store(nullptr, std::memory_order_release);
+
+        // Reset task counters and orchestrator state
+        completed_tasks_.store(0, std::memory_order_release);
+        total_tasks_ = 0;
+        orchestrator_done_ = false;
+        pto2_init_done_.store(false, std::memory_order_release);  // re-open the one-time init gate used by resolve_and_dispatch()
+        pto2_init_complete_.store(false, std::memory_order_release);
+
+        completed_.store(false, std::memory_order_release);  // re-arm the loop-exit flag polled by resolve_and_dispatch()
+
+        // Reset core discovery and assignment state
+        aic_count_ = 0;
+        aiv_count_ = 0;
+        cores_total_num_ = 0;
+        aicpu_thread_num_ = 0;
+        sched_thread_num_ = 0;
+        active_sched_threads_ = 0;
+        for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) core_trackers_[t] = CoreTracker{};
+
+        regs_ = 0;  // drop every binding captured by init() / bind_runtime()
+        sched_ = nullptr;
+        rt_ = nullptr;
+        func_id_to_addr_ = nullptr;
+    }
 
     // Main scheduler thread entry: poll completion + dispatch ready tasks.
-    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
-
-    // Shutdown AICore registers for this thread's assigned cores.
-    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
-    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
-    int32_t shutdown(int32_t thread_idx);
-
-    // Run all post-orchestration scheduler bookkeeping:
-    //  - publishes core assignments to the perf collector (PTO2_PROFILING)
-    //  - latches submitted task count from PTO2 shared memory
-    //  - folds inline_completed_tasks into completed_tasks_
-    //  - flips orchestrator_done_ and triggers core transition
-    //    (skipped on fatal error — emergency_shutdown runs instead)
-    // Callers must invoke rt_orchestration_done(rt) before this — that
-    // step belongs to the orchestrator lifecycle, not the scheduler.
-    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx)
+    {
+        always_assert(sched_ != nullptr);
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        // Returns cur_thread_completed on normal exit, -1 without a header, or handle_timeout_exit()'s result.
+        PTO2SharedMemoryHeader *header = sched_->sm_header;
+        if (!header) return -1;
+
+        // One-time init gate: the first thread through publishes pto2_init_complete_; later threads wait for it.
+        if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) pto2_init_complete_.store(true, std::memory_order_release);
+        else
+            while (!pto2_init_complete_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+
+        int32_t cur_thread_completed = 0;
+        int32_t idle_iterations = 0;
+
+        constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
+        PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
+        PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
+        for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
+
+        const bool pmu_active = is_pmu_enabled();
+
+        uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
+        // Profile reset + total-cycle start. Reset here so each
+        // resolve_and_dispatch call (≈ one kernel launch) records its own
+        // breakdown. The dump happens at loop exit, well outside the hot path.
+        SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
+        profile.reset();
+        const uint64_t profile_loop_start = get_sys_cnt_aicpu();
+
+        while (true)
+        {
+            if (completed_.load(std::memory_order_acquire)) break;
+            bool made_progress = false;
+            profile.total_iters++;
+            if (!tracker.has_any_running_cores())
+            {
+                LoopAction action = handle_orchestrator_exit(header, runtime);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            // Phase 1: Check running cores for completion
+            int32_t completed_this_turn = 0;
+
+            if (tracker.has_any_running_cores())
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                check_running_cores_for_completion(thread_idx, completed_this_turn, cur_thread_completed, made_progress);
+                profile.completion_cycles += get_sys_cnt_aicpu() - t0;
+                profile.completion_iters++;
+            }
+            if (completed_this_turn > 0)
+            {
+                completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
+            }
+
+            uint64_t t0_async = 0;
+            if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending()))
+            {
+                t0_async = get_sys_cnt_aicpu();
+                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_);
+                if (poll_result.error_code != PTO2_ERROR_NONE)
+                {
+                    // Latch through the shared helper so the failing thread is recorded alongside the error code.
+                    latch_scheduler_error(header, thread_idx, poll_result.error_code);
+                    completed_.store(true, std::memory_order_release);
+                    break;
+                }
+                if (poll_result.completed > 0)
+                {
+                    completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
+                    made_progress = true;
+                }
+                profile.async_wait_cycles += get_sys_cnt_aicpu() - t0_async;
+                profile.async_wait_iters++;
+            }
+
+            // Phase 2 drain check
+            if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
+            {
+                handle_drain_mode(thread_idx);
+                continue;
+            }
+
+            // Phase 3: Drain wiring queue (thread 0 only). Pass cumulative
+            // sub-phase counters (SPSC drain stage 1 / pending-FIFO poll
+            // stage 2) so drain_wiring_queue accumulates into them.
+            if (thread_idx == 0)
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                int wired = sched_->drain_wiring_queue(orchestrator_done_,
+                    &profile.spsc_drain_cycles, &profile.spsc_drain_iters,
+                    &profile.pending_poll_cycles, &profile.pending_poll_iters);
+                if (wired > 0) made_progress = true;
+                profile.drain_wiring_cycles += get_sys_cnt_aicpu() - t0;
+                profile.drain_wiring_iters++;
+            }
+
+            if (thread_idx == 0)
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                constexpr int DUMMY_DRAIN_BATCH = 16;
+                PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
+                int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+                for (int di = 0; di < dummy_got; di++)
+                {
+                    PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
+                    sched_->on_mixed_task_complete(dummy_slot);
+                    completed_tasks_.fetch_add(1, std::memory_order_relaxed);
+                    cur_thread_completed++;
+                }
+                if (dummy_got > 0) made_progress = true;
+                profile.dummy_drain_cycles += get_sys_cnt_aicpu() - t0;
+                profile.dummy_drain_iters++;
+            }
+
+            // Phase 4: MIX-strict-priority dispatch with phase-split and
+            // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress);
+                profile.dispatch_cycles += get_sys_cnt_aicpu() - t0;
+                profile.dispatch_iters++;
+            }
+
+            if (made_progress)
+            {
+                idle_iterations = 0;
+                last_progress_ts = get_sys_cnt_aicpu();
+            }
+            else
+            {
+                uint64_t t0_idle = get_sys_cnt_aicpu();
+                idle_iterations++;
+
+                if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0)
+                {
+                    LoopAction action = check_idle_fatal_error(header, runtime);
+                    if (action == LoopAction::BREAK_LOOP) break;
+                }
+
+                if (idle_iterations % STALL_LOG_INTERVAL == 0) log_stall_diagnostics(thread_idx);
+                if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES)
+                {
+                    bool self_owns = self_owns_running_task(thread_idx);
+                    bool global_stuck = !self_owns && total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && no_thread_owns_running_task();
+                    if (self_owns || global_stuck) return handle_timeout_exit(thread_idx, header, runtime);
+                    last_progress_ts = get_sys_cnt_aicpu();
+                }
+                SPIN_WAIT_HINT();
+                profile.idle_spin_cycles += get_sys_cnt_aicpu() - t0_idle;
+                profile.idle_iters++;
+            }
+        }
+
+        // Dump profile breakdown for this thread. Logged AFTER the hot loop
+        // exits, so this adds no overhead to the measured phases.
+        profile.total_cycles = get_sys_cnt_aicpu() - profile_loop_start;
+        LOG_INFO_V9(
+            "CLAUDE_PROFILING thread=%d total_cyc=%lu iters=%lu compl_cyc=%lu compl_n=%lu ctask_cyc=%lu ctask_n=%lu cores_scan=%lu async_cyc=%lu async_n=%lu drain_cyc=%lu drain_n=%lu spsc_cyc=%lu spsc_n=%lu poll_cyc=%lu poll_n=%lu poll_skipped=%lu dummy_cyc=%lu dummy_n=%lu dispatch_cyc=%lu dispatch_n=%lu idle_cyc=%lu idle_n=%lu",
+            (int)thread_idx,
+            (unsigned long)profile.total_cycles, (unsigned long)profile.total_iters,
+            (unsigned long)profile.completion_cycles, (unsigned long)profile.completion_iters,
+            (unsigned long)profile.complete_task_cycles, (unsigned long)profile.complete_task_calls,
+            (unsigned long)profile.cores_scanned,
+            (unsigned long)profile.async_wait_cycles, (unsigned long)profile.async_wait_iters,
+            (unsigned long)profile.drain_wiring_cycles, (unsigned long)profile.drain_wiring_iters,
+            (unsigned long)profile.spsc_drain_cycles, (unsigned long)profile.spsc_drain_iters,
+            (unsigned long)profile.pending_poll_cycles, (unsigned long)profile.pending_poll_iters,
+            (unsigned long)profile.pending_poll_skipped,
+            (unsigned long)profile.dummy_drain_cycles, (unsigned long)profile.dummy_drain_iters,
+            (unsigned long)profile.dispatch_cycles, (unsigned long)profile.dispatch_iters,
+            (unsigned long)profile.idle_spin_cycles, (unsigned long)profile.idle_iters);
+
+        return cur_thread_completed;
+    }
+    // Deinit the AICore register block of every core owned by thread_idx; returns -1 if any deinit call failed.
+    int32_t shutdown(int32_t thread_idx)
+    {
+        const int32_t *cores = core_trackers_[thread_idx].core_ids();
+        int32_t core_num = core_trackers_[thread_idx].core_num();
+        if (core_num == 0) return 0;  // this thread owns no cores: nothing to shut down
+
+        int32_t rc = 0;
+        for (int32_t i = 0; i < core_num; i++)
+        {
+            int32_t core_id = cores[i];
+            uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
+            if (reg_addr != 0)
+            {
+                // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
+                if (platform_deinit_aicore_regs(reg_addr) != 0) rc = -1;  // remember the failure but keep going
+            }
+            else
+            {}
+        }
+        return rc;
+    }
+
+    // Upstream-compatible overload. Parameter order matches the upstream declaration
+    // (runtime, rt, thread_idx, total_tasks); thread_idx is ignored by the polling scheduler.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t /*thread_idx*/, int32_t total_tasks)
+    {
+        on_orchestration_done(runtime, rt, total_tasks);
+    }
+    // Post-orchestration bookkeeping: latch the task total, fold inline completions, flag orchestration done.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t total_tasks)
+    {
+        total_tasks_ = total_tasks;
+
+        // Fold tasks completed inline during orchestration
+        int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
+        if (inline_completed > 0) completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+        orchestrator_done_ = true;
+
+        // Check for fatal error from orchestration; if so, shut down immediately.
+        int32_t orch_err = 0;
+        if (sched_->sm_header) orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);  // only the caller that flips completed_ runs the shutdown
+        }
+
+    }
 
     // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
     // mode where rt is created by the orchestrator thread after init().
-    void bind_runtime(PTO2Runtime *rt);
-
-    // Serial orch->sched mode pre-dispatch wait. Thread 0 may drain deferred
-    // wiring to keep the bounded wiring queue from back-pressuring orchestration,
-    // but no AICore dispatch happens before orchestrator_done_.
-    void wait_for_orchestration_done_before_dispatch(Runtime *runtime, int32_t thread_idx);
+    void bind_runtime(PTO2Runtime *rt)
+    {
+        rt_ = rt;
+        sched_ = &rt->scheduler;  // takes the address of rt's scheduler member, so rt is expected to be valid here
+    }
 
-    // =========================================================================
-    // State queries / external synchronization points
-    // =========================================================================
+    int32_t aic_count() const
+    {
+        return aic_count_;  // cores that reported CoreType::AIC during handshake_all_cores()
+    }
+    int32_t aiv_count() const
+    {
+        return aiv_count_;  // cores that reported any non-AIC type during handshake_all_cores()
+    }
+    bool is_completed() const
+    {
+        return completed_.load(std::memory_order_acquire);  // the flag that makes resolve_and_dispatch() leave its loop
+    }
+    int32_t completed_tasks_count() const
+    {
+        return completed_tasks_.load(std::memory_order_acquire);  // running total accumulated across scheduler threads
+    }
 
-    int32_t aic_count() const { return aic_count_; }
-    int32_t aiv_count() const { return aiv_count_; }
-    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
-    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
-    bool orchestration_done() const { return orchestrator_done_.load(std::memory_order_relaxed); }
+    // Block until the first scheduler thread has finished one-time PTO2 init.
+    // Called by the orchestrator thread in device-orch mode.
+    void wait_pto2_init_complete() const
+    {
+        while (!pto2_init_complete_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();  // flag is stored in resolve_and_dispatch()
+    }
 
 private:
-    // =========================================================================
-    // State
-    // =========================================================================
-
     // --- Scheduler binding & per-core runtime state ---
     alignas(64) PTO2SchedulerState *sched_{nullptr};
     PTO2Runtime *rt_{nullptr};
@@ -122,32 +465,23 @@ class SchedulerContext {
 
     // Cluster-ordered core trackers, one per scheduler thread
     CoreTracker core_trackers_[MAX_AICPU_THREADS];
+    SchedulerThreadProfile thread_profiles_[MAX_AICPU_THREADS];
 
     // Per-core dispatch payload storage: dual-buffer for pipelining.
     // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
     PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
 
-    // Per-core deferred-completion software registration storage.  This has
-    // the same runtime lifetime as payload_per_core_, but is kept out of the
-    // dispatch payload so normal task dispatch layout and cache footprint stay
-    // unchanged.
     DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
 
     // sync_start drain coordination
     SyncStartDrainState drain_state_;
 
-#if PTO2_PROFILING
-    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
-    // Cached once at init() from get_l2_swimlane_level(), AFTER
-    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
-    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
-#endif
-
     // --- Task-execution tracking ---
     std::atomic<int32_t> completed_tasks_{0};
     int32_t total_tasks_{0};
     // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
-    std::atomic<bool> orchestrator_done_{false};
+    // volatile only stops the compiler hoisting the load out of spin loops; unlike std::atomic it gives no inter-thread ordering.
+    volatile bool orchestrator_done_{false};
     std::atomic<bool> completed_{false};
     uint64_t *func_id_to_addr_{nullptr};
 
@@ -166,38 +500,167 @@ class SchedulerContext {
     // Platform AICore-register base array (set by AicpuExecutor before init()).
     uint64_t regs_{0};
 
-#if PTO2_PROFILING
-    // PMU profiling: physical core IDs for PMU MMIO base resolution.
-    // Separate storage because CoreExecState's 64-byte budget has no room for
-    // physical_core_id when PTO2_PROFILING=1.
-    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
-#endif
-
-    // =========================================================================
-    // Core management (scheduler_cold_path.cpp)
-    // =========================================================================
+    // --- One-time init coordination ---
+    std::atomic<bool> pto2_init_done_{false};
+    std::atomic<bool> pto2_init_complete_{false};
 
     // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
-    int32_t handshake_all_cores(Runtime *runtime);
+    int32_t handshake_all_cores(Runtime *runtime)
+    {
+        Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->dev.workers);
+        cores_total_num_ = runtime->dev.worker_count;
+
+        // Validate cores_total_num_ before using as array index
+        if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) return -1;
+
+        aic_count_ = 0;
+        aiv_count_ = 0;
+        // Step 1: hand every core the address of its first payload buffer, then raise aicpu_ready.
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
+            OUT_OF_ORDER_STORE_BARRIER();  // barrier between writing task and raising aicpu_ready
+            all_handshakes[i].aicpu_ready = 1;
+        }
+        OUT_OF_ORDER_STORE_BARRIER();
+
+        // Get platform physical cores count for validation
+        uint32_t max_physical_cores_count = platform_get_physical_cores_count();
+
+        // Step 2: Wait for all cores to respond, collect core type and register addresses
+        bool handshake_failed = false;
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            Handshake *hank = &all_handshakes[i];
+
+            while (hank->aicore_regs_ready == 0) SPIN_WAIT_HINT();
+
+            uint32_t physical_core_id = hank->physical_core_id;
+
+            if (physical_core_id >= max_physical_cores_count)
+            {
+                handshake_failed = true;
+                continue;  // skip this core; the failure is reported after the loop
+            }
+
+            uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);  // regs_ holds the address of a per-physical-core table
+            uint64_t reg_addr = regs[physical_core_id];
+
+            // Initialize AICore registers after discovery (first round)
+            platform_init_aicore_regs(reg_addr);
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
+
+            OUT_OF_ORDER_STORE_BARRIER();
+
+            while (hank->aicore_done == 0) SPIN_WAIT_HINT();
+
+            CoreType type = hank->core_type;
+
+            core_exec_states_[i].reg_addr = reg_addr;
+            core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+
+            core_exec_states_[i].worker_id = i;
+            core_exec_states_[i].physical_core_id = physical_core_id;
+            core_exec_states_[i].core_type = type;
+
+            if (type == CoreType::AIC) aic_worker_ids_[aic_count_++] = i;
+            else aiv_worker_ids_[aiv_count_++] = i;  // every non-AIC type is recorded as AIV
+        }
+
+        if (handshake_failed)
+        {
+            emergency_shutdown(runtime);
+            return -1;
+        }
+
+        return 0;
+    }
 
     // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
-    bool assign_cores_to_threads();
+    bool assign_cores_to_threads()
+    {
+        // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
+        // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
+        // Returns false when the thread count is unusable or a thread would exceed MAX_CORE_PER_THREAD.
+        active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+        // Guard the division/modulo below and the MAX_AICPU_THREADS-sized per-thread arrays.
+        if (active_sched_threads_ <= 0 || active_sched_threads_ > MAX_AICPU_THREADS) return false;
+        int32_t cluster_count = aic_count_;
+
+        // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
+        int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
+        int32_t thread_cores_num = max_clusters_per_thread * 3;
+
+        if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) return false;
+
+        for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++)
+        {
+            core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+            core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+        }
+
+        // Count clusters per thread first (round-robin may distribute unevenly)
+        int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++) clusters_per_thread[ci % active_sched_threads_]++;
+        for (int32_t i = 0; i < active_sched_threads_; i++) core_trackers_[i].init(clusters_per_thread[i]);
+
+        int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+
+        for (int32_t ci = 0; ci < cluster_count; ci++)
+        {
+            int32_t t = ci % active_sched_threads_;
+
+            int32_t aic_wid = aic_worker_ids_[ci];
+            int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+            int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+            core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
+        }
+
+        return true;
+    }
 
     // Emergency shutdown: broadcast exit signal to every handshake'd core and
     // deinit their AICore register blocks. Idempotent.
-    void emergency_shutdown(Runtime *runtime);
-
-    // =========================================================================
-    // Dispatch (scheduler_dispatch.cpp)
-    // =========================================================================
+    void emergency_shutdown(Runtime *runtime)
+    {
+        Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->dev.workers);
+        int32_t timeout_count = 0;  // tallied below; nothing in this function consumes the total
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            Handshake *hank = &all_handshakes[i];
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
+            if (core_exec_states_[i].reg_addr != 0)  // only cores whose register address was recorded
+            {
+                if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) timeout_count++;
+            }
+        }
+        if (timeout_count > 0)
+        {}
+    }
 
-    static const char *shape_name(PTO2ResourceShape shape);
+    static const char *shape_name(PTO2ResourceShape shape)
+    {
+        // Maps a resource shape to its upper-case label.
+        // The fallback is set up front and overwritten by the matching case,
+        // so a value outside the four enumerators below yields "UNKNOWN".
+        const char *label = "UNKNOWN";
+        switch (shape)
+        {
+        case PTO2ResourceShape::AIC: label = "AIC"; break;
+        case PTO2ResourceShape::AIV: label = "AIV"; break;
+        case PTO2ResourceShape::MIX: label = "MIX"; break;
+        case PTO2ResourceShape::DUMMY: label = "DUMMY"; break;
+        }
+        return label;
+    }
 
-    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
-    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
-    // convention already established in the stall log family.
-    static inline const char *subslot_name(PTO2SubtaskSlot s) {
-        switch (s) {
+    static inline const char *subslot_name(PTO2SubtaskSlot s)
+    {
+        switch (s)
+        {
         case PTO2SubtaskSlot::AIC:
             return "aic";
         case PTO2SubtaskSlot::AIV0:
@@ -208,220 +671,794 @@ class SchedulerContext {
         return "?";
     }
 
-    int pop_ready_tasks_batch(
-        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
-        int max_count
-    );
-
-    void build_payload(
-        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-        const AsyncCtx &async_ctx, int32_t block_idx
-    );
-
-    // Batched-dispatch primitives. prepare_* builds the payload and per-core
-    // state; publish_* issues the MMIO register write. Callers must wmb()
-    // between the prepare batch and the publish batch, then sample
-    // get_sys_cnt_aicpu() once and pass it to publish_* for every handle.
-    //
-    // dispatch_timestamp_slot points to the CoreExecState slot
-    // (pending_dispatch_timestamp / running_dispatch_timestamp) selected at
-    // prepare time, or nullptr when L2 swimlane is below AICPU_TIMING and no
-    // dispatch timestamp is being recorded.
-    struct PublishHandle {
+    int pop_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)  // thin forwarder to the scheduler's batch pop
+    {
+        return sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);  // returned unchanged; dispatch_shape uses the result as the number of entries written to out
+    }
+
+    void build_payload(PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, const AsyncCtx &async_ctx, int32_t block_idx)  // fills dispatch_payload for one subtask of slot_state
+    {
+        int32_t slot_idx = static_cast<int32_t>(subslot);  // the subslot value doubles as the index into task->kernel_id[]
+        uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+        const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);  // NOTE(review): dereferenced on the next line without a null check -- confirm the lookup cannot yield 0 for a dispatched kernel
+        dispatch_payload.function_bin_addr = callable->resolved_addr();
+        auto &payload = *slot_state.payload;
+        int n = 0;  // next free index in dispatch_payload.args; NOTE(review): never checked against the array capacity or the two fixed context indices -- confirm upstream limits
+        for (int32_t i = 0; i < payload.tensor_count; i++) dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);  // tensors are passed as the address of each payload entry
+        for (int32_t i = 0; i < payload.scalar_count; i++) dispatch_payload.args[n++] = payload.scalars[i];  // scalars follow the tensors and are passed by value
+        dispatch_payload.local_context.block_idx = block_idx;
+        dispatch_payload.local_context.block_num = slot_state.logical_block_num;
+        dispatch_payload.local_context.async_ctx = async_ctx;
+        dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);  // the two context pointers live at fixed arg indices and point into dispatch_payload itself
+        dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
+    }
+
+    struct PublishHandle  // returned by prepare_subtask_to_core and consumed by publish_subtask_to_core
+    {
         uint64_t reg_addr;
         uint32_t reg_task_id;
         int32_t core_offset;
         uint64_t *dispatch_timestamp_slot;
     };
 
-    PublishHandle prepare_subtask_to_core(
-        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-        bool to_pending, int32_t block_idx
-    );
+    SchedulerContext::PublishHandle prepare_subtask_to_core(int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending, int32_t block_idx)  // stages one subtask on a core: builds its payload and records it in the core's running or pending slot; the register write is left to publish_subtask_to_core
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        auto core_id = tracker.get_core_id_by_offset(core_offset);
+        CoreExecState &core_exec_state = core_exec_states_[core_id];
+
+        core_exec_state.dispatch_seq++;
+        uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;  // register task id = masked per-core dispatch sequence
+        static_assert((TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity");
+        if (reg_task_id >= AICORE_EXIT_SIGNAL)  // ids at or above AICORE_EXIT_SIGNAL are never handed out
+        {
+            core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);  // advance by the distance from reg_task_id to TASK_ID_MASK + 1
+            reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+        }
 
-    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts) {
-        if (h.dispatch_timestamp_slot != nullptr) {
-            *h.dispatch_timestamp_slot = dispatch_ts;
+        uint32_t buf_idx = reg_task_id & 1u;  // id parity selects which of the two per-core buffers is used
+        PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+        DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+        deferred_slab->count = 0;  // slab is reset before it is attached to async_ctx
+        deferred_slab->error_code = PTO2_ERROR_NONE;
+        AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+        build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+        if (to_pending)
+        {
+            core_exec_state.pending_subslot = subslot;
+            core_exec_state.pending_slot_state = &slot_state;
+            core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
+        }
+        else
+        {
+            core_exec_state.running_subslot = subslot;
+            core_exec_state.running_slot_state = &slot_state;
+            core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
+            tracker.change_core_state(core_offset);  // only the running path changes the tracker's core state
         }
+        tracker.set_pending_occupied(core_offset);  // set on both the pending and the running path
+
+        uint64_t *dispatch_timestamp_slot = nullptr;  // always nullptr here, so publish_subtask_to_core never stores a timestamp for this handle
+
+        return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
+    }
+
+    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts)  // writes the handle's task id to the core's DATA_MAIN_BASE register
+    {
+        if (h.dispatch_timestamp_slot != nullptr) *h.dispatch_timestamp_slot = dispatch_ts;  // the timestamp is stored only when the handle carries a slot
         write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));
     }
 
     // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
     // caller-supplied handles buffer. Returns the number of handles written.
-    int prepare_block_for_dispatch(
-        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
-        bool to_pending, int32_t block_idx, PublishHandle *out_handles
-    );
-
-    void dispatch_shape(
-        int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
-        CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
-    );
-
-    // Speculative early-dispatch (Hook 1). After normal dispatch leaves idle
-    // cores spare, pre-stage the consumers of any RUNNING flagged producer onto
-    // those cores with not_ready=1 (gated). Touches no dependency state — the
-    // task is released by the doorbell at its normal ready-pop (Hook 2).
-    int32_t try_speculative_early_dispatch(int32_t thread_idx);
-
-    // Stage the already-claimed range [start, start+count) of consumer `c` onto
-    // thread_idx's idle (RUNNING slot) then pending (gated-pending, promote-on-FIN)
-    // cores from the provided free-core sets. The caller advances next_block_idx and
-    // re-pushes `c` BEFORE calling, so this expensive prepare+publish runs
-    // concurrently with peers (mirrors the normal SPMD dispatch path). Returns the
-    // number of blocks staged.
-    int32_t stage_consumer_blocks(
-        int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
-        CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
-    );
-
-    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
-    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
-    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
-    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
-    // skipped for the whole pass but MIX-PENDING still runs.
-    //
-    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
-    // current pass only. The next loop iteration re-evaluates after Phase 1
-    // completion polling and the global MIX queue draining (here or on any
-    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
-    // not unbounded — once mix completes on at least one cluster, the next
-    // pass either drains the residual or admits AIC/AIV.
-    void dispatch_ready_tasks(
-        int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
-        bool pmu_active, bool &made_progress, bool &try_pushed
-    );
-
-    // Returns true if any *other* scheduler thread currently has an idle core
-    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
-    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
-    // rationale and the safety argument against the drain worker.
-    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
-
-    // True if mix tasks remain anywhere this thread could see them: the caller's
-    // MIX local LIFO stack or the global MIX ready queue. Approximate —
-    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
-    // positions with std::memory_order_relaxed and may interleave with concurrent
-    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
-    // loads — that one isn't on this path. A stale read here causes at most one
-    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
-    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
+    int prepare_block_for_dispatch(int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending, int32_t block_idx, PublishHandle *out_handles)  // out_handles must have room for up to 3 entries (MIX case)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        if (shape == PTO2ResourceShape::MIX)
+        {
+            uint8_t cmask = slot_state.active_mask.core_mask();  // tested below against one PTO2_SUBTASK_MASK_* bit per subtask slot
+            int n = 0;  // handles written so far
+            if (cmask & PTO2_SUBTASK_MASK_AIC)
+            {
+                bool p = to_pending && !tracker.is_aic_core_idle(core_offset);  // the pending slot is used only if this core is not idle
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, p, block_idx);
+            }
+            if (cmask & PTO2_SUBTASK_MASK_AIV0)
+            {
+                bool p = to_pending && !tracker.is_aiv0_core_idle(core_offset);  // same per-core decision for AIV0
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, p, block_idx);
+            }
+            if (cmask & PTO2_SUBTASK_MASK_AIV1)
+            {
+                bool p = to_pending && !tracker.is_aiv1_core_idle(core_offset);  // same per-core decision for AIV1
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, p, block_idx);
+            }
+            return n;
+        }
+        else if (shape == PTO2ResourceShape::AIC)
+        {
+            out_handles[0] = prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);  // core_offset is used directly, with no cluster-to-core translation
+            return 1;
+        }
+        else  // every remaining shape is staged as a single AIV0 subtask
+        {
+            out_handles[0] = prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
+            return 1;
+        }
+    }
+
+    void dispatch_shape(int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress)  // pops ready tasks of one shape and dispatches their blocks onto the tracker's dispatchable cores for the given phase
+    {
+        if (entered_drain) return;  // no dispatch once drain has been entered
+
+        bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
+        auto cores = tracker.get_dispatchable_cores(shape, phase);
+        if (!cores.has_value()) return;  // no dispatchable core for this shape/phase
+
+        while (cores.has_value() && !entered_drain)
+        {
+            int want = cores.count();  // request at most one task per dispatchable core
+            PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];  // NOTE(review): assumes cores.count() never exceeds MAX_CLUSTERS * 3 -- confirm
+            int got = pop_ready_tasks_batch(shape, local_buf, batch, want);
+            if (got == 0) break;  // nothing ready for this shape
+
+            bool any_sync_start = false;  // true if any task in this batch requires sync start
+            for (int bi = 0; bi < got; bi++)
+            {
+                if (batch[bi]->active_mask.requires_sync_start())
+                {
+                    any_sync_start = true;
+                    break;
+                }
+            }
+
+            PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];  // NOTE(review): handles accumulate across tasks until the next flush; assumes the total stays within MAX_CLUSTERS * 3 -- confirm
+            int handle_count = 0;
+            bool dispatched_any = false;  // set once any task in this batch reaches the claim step
+
+            auto flush_publish = [&]() {  // publishes every staged handle, then resets the staging count
+                if (handle_count == 0) return;
+                wmb();  // barrier between the prepare writes and the register writes
+                uint64_t dispatch_ts = 0;  // a constant 0 is passed as the dispatch timestamp
+                for (int i = 0; i < handle_count; i++) publish_subtask_to_core(handles[i], dispatch_ts);
+                handle_count = 0;
+                made_progress = true;
+            };
+
+            for (int bi = 0; bi < got; bi++)
+            {
+                PTO2TaskSlotState *slot_state = batch[bi];
+
+                if (slot_state->active_mask.requires_sync_start())
+                {
+                    if (is_pending)  // sync-start tasks are not dispatched in the PENDING phase; back to the shared queue
+                    {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                        continue;
+                    }
+                    int32_t available = cores.count();
+                    if (available < slot_state->logical_block_num)  // fewer cores than the task has blocks
+                    {
+                        flush_publish();
+                        if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);  // re-queue the task if drain mode was not entered
+                        for (int rem = bi + 1; rem < got; rem++) sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);  // return the rest of the batch to the shared queue
+                        entered_drain = true;  // set even when enter_drain_mode returned false
+                        break;
+                    }
+                }
+
+                if (!cores.has_value())  // out of cores: publish what is staged, re-queue this task and the rest
+                {
+                    flush_publish();
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                    break;
+                }
+
+                dispatched_any = true;
+                int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
+                int32_t claim = std::min(cores.count(), remaining);  // blocks taken now = min(free cores, blocks left)
+                int32_t start = slot_state->next_block_idx;
+                slot_state->next_block_idx += claim;  // the range is claimed before the prepare loop below
+
+                if (slot_state->next_block_idx < slot_state->logical_block_num) sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);  // blocks remain: re-queue the task
+
+                for (int32_t b = 0; b < claim; b++)
+                {
+                    auto core_offset = cores.pop_first();
+                    handle_count += prepare_block_for_dispatch(thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]);
+                }
+
+                if (any_sync_start) flush_publish();  // batch holds a sync-start task: publish after every task instead of once at the end
+            }
+
+            flush_publish();
+
+            if (!dispatched_any) break;  // no task in this batch reached the claim step
+
+            if (!cores.has_value()) cores = tracker.get_dispatchable_cores(shape, phase);  // refresh the core set once it is exhausted
+        }
+    }
+
+    void dispatch_ready_tasks(int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress)  // one dispatch pass: spill of excess local work, IDLE stage (MIX, then AIC/AIV), flush, then PENDING stage
+    {
+        using Phase = CoreTracker::DispatchPhase;
+        constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
+
+        static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
+            {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
+            {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
+        };
+        const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];  // even thread indices try AIC first, odd ones AIV first
+
+        const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;  // NOTE(review): assumes active_sched_threads_ is non-zero -- confirm
+        const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {  // NOTE(review): three positional entries; presumably ordered AIC, AIV, MIX to match the enum values -- confirm
+            bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
+            bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
+            bd_per_thread,
+        };
+        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++)
+        {
+            auto &lb = local_bufs[s];
+            int32_t excess = lb.count - thread_capacity[s];  // entries beyond this thread's capacity for the shape
+            if (excess <= 0) continue;
+            if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;  // spill only when a peer thread has an idle core of this shape
+            sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);  // the tail of the local buffer goes to the shared queue
+            lb.count -= excess;
+        }
+
+        auto flush_local_bufs = [&]() {  // moves everything left in the local buffers to the shared ready queues
+            for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++)
+            {
+                auto &lb = local_bufs[s];
+                if (lb.count > 0)
+                {
+                    sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
+                    lb.count = 0;
+                }
+            }
+        };
+        struct FlushGuard  // its destructor runs flush_local_bufs on every exit path of this function
+        {
+            decltype(flush_local_bufs) &flush_fn;
+            ~FlushGuard()
+            {
+                flush_fn();
+            }
+        } flush_guard{flush_local_bufs};
+
+        bool entered_drain = false;
+
+        // ===== IDLE stage =====
+        dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress);
+        if (entered_drain) return;
+
+        bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);  // MIX work still outstanding after MIX-IDLE: AIC/AIV are skipped
+
+        if (!skip_aic_aiv)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                PTO2ResourceShape s = aic_aiv[i];
+                dispatch_shape(thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress);
+                if (entered_drain) return;
+            }
+        }
+
+        // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
+        // peer-thread reads see the IDLE-stage release_fanin output.
+        flush_local_bufs();
+
+        if (pmu_active) return;  // the PENDING stage is skipped entirely while pmu_active is set
+
+        if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX))  // MIX-PENDING runs only when no peer thread has an idle MIX core
+        {
+            dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, made_progress);
+            if (entered_drain) return;
+        }
+
+        // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
+        // it set; otherwise, escalate iff PENDING-MIX left residual.
+        if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) skip_aic_aiv = true;
+
+        if (skip_aic_aiv) return;
+
+        // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
+        // will pull from the global queue on its next IDLE pass.
+        for (int i = 0; i < 2; i++)
+        {
+            PTO2ResourceShape s = aic_aiv[i];
+            if (has_idle_in_other_threads(thread_idx, s)) continue;
+            dispatch_shape(thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress);
+            if (entered_drain) return;
+        }
+    }
+
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const  // true if any active scheduler thread other than self reports an idle core for `shape`
+    {
+        for (int32_t t = 0; t < active_sched_threads_; t++)
+        {
+            if (t == self_thread_idx) continue;  // the caller's own tracker is excluded
+            if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) return true;  // the first peer with an idle core ends the scan
+        }
+        return false;
+    }
+
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const  // true if MIX tasks remain in the given local buffer or in the shared MIX ready queue
+    {
         return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
     }
 
-    // =========================================================================
-    // Completion & drain (scheduler_completion.cpp)
-    // =========================================================================
+    static SlotTransition decide_slot_transition(int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id)  // maps one register observation (task id + state) against a core's running/pending ids to a SlotTransition; touches no scheduler state
+    {
+        SlotTransition t;  // NOTE(review): relies on SlotTransition's flags defaulting to false -- confirm in its definition
+        if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id)  // the pending match is tested first, ahead of the running match
+        {
+            t.matched = true;
+            t.running_done = true;  // Serial execution: pending event implies running done
+            t.running_freed = true;
+            t.pending_freed = true;
+            if (reg_state == TASK_FIN_STATE) t.pending_done = true;  // Case 1: pending FIN
+            // else: Case 2: pending ACK (pending_done stays false)
+        }
+        else if (reg_task_id == running_id)
+        {
+            if (reg_state == TASK_FIN_STATE)
+            {
+                if (pending_id == AICPU_TASK_INVALID)
+                {
+                    // Case 3.2: running FIN, no pending -> core goes idle
+                    t.matched = true;
+                    t.running_done = true;
+                    t.running_freed = true;
+                }
+                // Case 3.1: running FIN, pending exists -> skip (transient state).
+                // Case 1/2 (pending ACK/FIN) will complete running implicitly via running_done=true.
+            }
+            else
+            {
+                // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
+                t.matched = true;
+                t.pending_freed = true;
+            }
+        }
+        return t;  // t.matched stays unset when the id matches neither slot, or in Case 3.1
+    }
 
-    static SlotTransition decide_slot_transition(
-        int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated = false
-    );
+    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, int32_t &completed_this_turn)  // handles completion of one subtask: forwards any deferred-completion conditions to the mailbox, then reports the subtask to the scheduler
+    {
+        AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;  // NOTE(review): may be nullptr, yet it is dereferenced below without a check -- confirm rt_ and its mailbox are always set when conditions are registered
+        bool defer_completion_to_consumer = false;
+
+        if (slot_state.payload != nullptr)
+        {
+            volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];  // same parity-indexed slab that prepare_subtask_to_core reset for this dispatch
+            // (q) Read count first. AICore only writes error_code as part of a
+            // condition-registration attempt that also increments count, so
+            // count == 0 ⇒ no error and no conditions to forward. This is the
+            // common path for kernels that don't use async waits (paged
+            // attention, GEMM, etc.) and saves an L1 load + branch per call.
+            uint32_t cond_count = deferred_slab->count;
+            if (cond_count != 0)
+            {
+                int32_t slab_err = deferred_slab->error_code;
+                if (slab_err != PTO2_ERROR_NONE)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    sched_->sm_header->sched_error_code.compare_exchange_strong(expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire);  // only the first error is latched; later ones lose the compare-exchange
+                    completed_.store(true, std::memory_order_release);
+                    return;  // error path: on_subtask_complete is not called for this subtask
+                }
+                if (cond_count > MAX_COMPLETIONS_PER_TASK)  // more entries than the limit is treated as a registration failure
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    sched_->sm_header->sched_error_code.compare_exchange_strong(expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    return;  // error path: on_subtask_complete is not called for this subtask
+                }
+
+                slot_state.any_subtask_deferred.store(true, std::memory_order_release);  // read back below once on_subtask_complete reports true
+
+                const PTO2TaskId token = slot_state.task->task_id;
+                for (uint32_t i = 0; i < cond_count; ++i)
+                {
+                    volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                    while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type))  // spins until the mailbox accepts the condition
+                    {
+                        sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);  // counts each failed push attempt
+                        SPIN_WAIT_HINT();
+                    }
+                }
+            }
+        }
 
-    void complete_slot_task(
-        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
-        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
-        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-        PTO2LocalReadyBuffer *local_bufs
-#if PTO2_PROFILING
-        ,
-        uint64_t dispatch_ts, uint64_t finish_ts
-#endif
-    );
-
-    static void promote_pending_to_running(CoreExecState &core);
-    static void clear_running_slot(CoreExecState &core);
-
-    void check_running_cores_for_completion(
-        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
-        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-        PTO2LocalReadyBuffer *local_bufs
-    );
-
-    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
-    int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask);
-    void drain_worker_dispatch(int32_t block_num);
-    void handle_drain_mode(int32_t thread_idx);
-
-    // =========================================================================
-    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
-    // =========================================================================
-
-    __attribute__((noinline, cold)) LoopAction
-    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
-
-    __attribute__((noinline, cold)) LoopAction
-    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
-
-    __attribute__((noinline, cold)) void
-    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
-
-    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
-        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
-    );
-
-    // Reverse lookup: given a global core_id, find which scheduler thread's
-    // tracker owns it. Returns -1 if not found. Linear scan — only used on
-    // the cold diagnostic path.
-    int32_t find_core_owner_thread(int32_t core_id) const;
-
-    // Does this thread own any core with a RUNNING task (running_slot_state set)?
-    // Gates the scheduler timeout fatal latch: a thread without an owned
-    // RUNNING task has no first-hand evidence of a stuck dispatch and must
-    // not declare global fatal on its own idle observation. The thread that
-    // does own the stuck task will reach the budget on its own polls and
-    // latch with valid evidence (or recover when the COND register flips).
-    bool self_owns_running_task(int32_t thread_idx) const;
-
-    // Does *any* scheduler thread own a RUNNING task? Used as the second
-    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
-    // owns RUNNING work AND tasks remain incomplete, the system is in a
-    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
-    // ownerless idle threads are the only observers — let one of them latch.
-    bool no_thread_owns_running_task() const;
-
-    // One-glance classification of a no-progress timeout, derived from state the
-    // scheduler already holds at the stall. Reduces the multi-state snapshot to a
-    // dominant PTO2_STALL_DETAIL_* sub-class plus a few locator fields, which
-    // handle_timeout_exit propagates to host alongside the unchanged code 100.
-    struct StallClassification {
-        int32_t detail;         // PTO2_STALL_DETAIL_*
-        int32_t cnt_running;    // tasks observed RUNNING (on a core)
-        int32_t cnt_ready;      // fanin-satisfied but not dispatched
-        int32_t cnt_waiting;    // still waiting on fanin
-        int32_t completed;      // completed_tasks_ snapshot
-        int32_t total;          // total_tasks_ snapshot
-        int32_t orch_done;      // orchestrator_done flag (0/1)
-        int64_t stuck_task_id;  // S1: first RUNNING task's id (-1 if none)
-        int32_t stuck_core;     // S1: core hosting it (-1 if none)
-    };
+        bool mixed_complete = sched_->on_subtask_complete(slot_state);  // presumably true once every subtask of the task has completed -- confirm in the scheduler
+
+        if (mixed_complete && slot_state.payload != nullptr && slot_state.any_subtask_deferred.load(std::memory_order_acquire))
+        {
+            // Some subtask of this task registered conditions; finish the
+            // registration by handing the slot_state off to the consumer.
+            while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state)))  // spins until the mailbox accepts the hand-off
+            {
+                sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                SPIN_WAIT_HINT();
+            }
+            defer_completion_to_consumer = true;  // on_mixed_task_complete is skipped below for this task
+        }
 
-    // Scan the rings once (same ground truth as log_stall_diagnostics: a slot is
-    // RUNNING iff a core holds it as running_slot_state) and reduce to a
-    // StallClassification. Pure reads — safe to call from any scheduler thread.
-    __attribute__((noinline, cold)) StallClassification classify_stall_reason() const;
-
-    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
-        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
-        int32_t last_progress_count
-#if PTO2_PROFILING
-        ,
-        uint64_t sched_start_ts
-#endif
-    );
+        if (mixed_complete && !defer_completion_to_consumer)
+        {
+            sched_->on_mixed_task_complete(slot_state);
+            completed_this_turn++;  // incremented only when the task is completed here, not when it is handed off
+        }
+    }
 
-#if PTO2_PROFILING
-    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
-#endif
+    static void promote_pending_to_running(CoreExecState &core)  // copies the pending slot's fields into the running slot, then empties the pending slot
+    {
+        core.running_slot_state = core.pending_slot_state;
+        core.running_reg_task_id = core.pending_reg_task_id;
+        core.running_subslot = core.pending_subslot;
+        core.pending_slot_state = nullptr;
+        core.pending_reg_task_id = AICPU_TASK_INVALID;  // pending_subslot is left at its previous value
+    }
+    static void clear_running_slot(CoreExecState &core)  // marks the running slot empty; pending fields are not touched
+    {
+        core.running_slot_state = nullptr;
+        core.running_reg_task_id = AICPU_TASK_INVALID;  // running_subslot is left at its previous value
+    }
 
-    // =========================================================================
-    // Small inline helpers
-    // =========================================================================
+    void check_running_cores_for_completion(int32_t thread_idx, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress)  // polls each running core of this thread once and applies the resulting slot transition
+    {
+        SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        auto running_core_states = tracker.get_all_running_cores();  // set captured once; each core is visited at most once per call
+        while (running_core_states.has_value())
+        {
+            int32_t bit_pos = running_core_states.pop_first();
+            int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+            CoreExecState &core = core_exec_states_[core_id];
+            profile.cores_scanned++;
+
+            uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);  // single read; task id and state are both decoded from this one value
+            rmb();
+            int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+            int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+            SlotTransition t = decide_slot_transition(reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id);
+            if (!t.matched) continue;  // nothing to apply for this core in this pass
+
+            // --- Apply phase: execute actions based on transition ---
+
+            // 1. Complete finished tasks (capture pointers before modifying core state)
+            if (t.pending_done)  // when both flags are set, the pending task is completed before the running one
+            {
+                uint64_t tc0 = get_sys_cnt_aicpu();
+                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, completed_this_turn);
+                profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
+                profile.complete_task_calls++;
+                cur_thread_completed++;  // bumped once per complete_slot_task call, whereas completed_this_turn is bumped only inside it
+            }
+            if (t.running_done)
+            {
+                uint64_t tc0 = get_sys_cnt_aicpu();
+                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, completed_this_turn);
+                profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
+                profile.complete_task_calls++;
+                cur_thread_completed++;
+            }
+
+            // 2. Update slot data
+            if (t.running_freed)
+            {
+                if (core.pending_slot_state != nullptr && !t.pending_done)
+                {
+                    promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+                }
+                else
+                {
+                    clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                    if (t.pending_done)
+                    {
+                        core.pending_slot_state = nullptr;
+                        core.pending_reg_task_id = AICPU_TASK_INVALID;
+                    }
+                }
+            }
+
+            // 3. Update tracker bitmap
+            bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);  // evaluated after the slot update above
+            if (is_idle)
+            {
+                tracker.change_core_state(bit_pos);       // Mark idle
+                tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+            }
+            else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID)
+            {
+                tracker.clear_pending_occupied(bit_pos);
+            }
+
+            // 4. Progress signal (only when running task completes)
+            if (t.running_done) made_progress = true;
+        }
+    }
 
-    uint64_t get_function_bin_addr(int func_id) const {
-        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
-            return 0;
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num)  // Tries to claim the single drain slot for slot_state; returns false when it is already claimed.
+    {
+        int32_t expected = 0;  // the claim only succeeds while sync_start_pending is 0; -1 is written as an "initializing" sentinel
+        if (!drain_state_.sync_start_pending.compare_exchange_strong(expected, -1, std::memory_order_relaxed, std::memory_order_relaxed)) return false;  // Another thread already holds the drain slot.
+        // We own the drain slot.  Store the task and reset election flag before making it visible.
+        drain_state_.pending_task.store(slot_state, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        // Release store: all stores above are now visible to any thread that
+        // acquire-loads sync_start_pending and sees block_num > 0.
+        drain_state_.sync_start_pending.store(block_num, std::memory_order_release);  // NOTE(review): assumes block_num > 0; storing 0 would make the slot look unclaimed -- confirm callers
+        return true;
+    }
+    int32_t count_global_available(PTO2ResourceShape shape)  // Sums the idle-core counts for `shape` over the trackers of all active scheduler threads.
+    {
+        int32_t idle_sum = 0;
+        for (int32_t tid = 0; tid < active_sched_threads_; ++tid) idle_sum += core_trackers_[tid].get_idle_core_offset_states(shape).count();
+        return idle_sum;
+    }
+    void drain_worker_dispatch(int32_t block_num)  // Dispatches the blocks of the pending drain task across all trackers, then resets the drain state.
+    {
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+        if (!slot_state)
+        {
+            drain_state_.sync_start_pending.store(0, std::memory_order_release);  // nothing to dispatch: just clear the pending value
+            return;
+        }
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+
+        for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++)  // walk the trackers until block_num blocks are claimed
+        {
+            auto valid = core_trackers_[t].get_idle_core_offset_states(shape);
+            int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;  // NOTE(review): bounded by logical_block_num while the loop is bounded by block_num -- presumably equal, confirm
+            int32_t claim = std::min(valid.count(), remaining);
+            int32_t start = slot_state->next_block_idx;
+            slot_state->next_block_idx += claim;  // reserve block indices [start, start + claim)
+            PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+            int handle_count = 0;
+            for (int32_t b = 0; b < claim; b++)
+            {
+                auto core_offset = valid.pop_first();
+                handle_count += prepare_block_for_dispatch(t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]);  // `false` is the to_pending argument
+            }
+            wmb();  // write barrier between preparing the blocks and publishing them
+            uint64_t dispatch_ts = 0;  // never assigned a real timestamp in this function
+            for (int i = 0; i < handle_count; i++) publish_subtask_to_core(handles[i], dispatch_ts);
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        drain_state_.pending_task.store(nullptr, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);  // stored last: threads in handle_drain_mode spin until this reads 0
+    }
+    void handle_drain_mode(int32_t thread_idx)  // Drain rendezvous for one scheduler thread: ack, wait for all acks, elect one dispatcher.
+    {
+        // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
+        int32_t block_num;
+        do {
+            block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+        } while (block_num < 0);
+        if (block_num == 0) return;  // no drain pending
+        // Full mask of active threads. Shifting a 32-bit value by 32 is undefined, so a count of 32 or more saturates to all ones.
+        uint32_t all_acked = (active_sched_threads_ >= 32) ? ~0u : ((1u << active_sched_threads_) - 1);
+
+        // Ack barrier -- signal this thread has stopped dispatch.
+        drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+        // Spin until all threads have acked.
+        // If our bit is cleared while waiting, elected reset due to insufficient resources.
+        while (true)
+        {
+            uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+            if ((ack & all_acked) == all_acked) break;
+            if ((ack & (1u << thread_idx)) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+
+        // Election -- exactly one thread wins the CAS.
+        int32_t expected = 0;
+        drain_state_.drain_worker_elected.compare_exchange_strong(expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed);  // stores thread_idx + 1 so that 0 can mean "nobody elected"
+
+        if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1)
+        {
+            // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+            while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
+            {
+                if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+                SPIN_WAIT_HINT();
+            }
+            return;
+        }
+
+        // Elected: check if global resources are sufficient.
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+        if (slot_state == nullptr)
+        {
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);  // give up the election; waiters return when they read 0
+            return;
+        }
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        int32_t available = count_global_available(shape);
+
+        if (available < block_num)
+        {
+            // Insufficient resources -- reset drain fields so threads can resume
+            // completion polling to free running cores, then retry.
+            drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+
+        // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+        drain_worker_dispatch(block_num);
+    }
+
+    LoopAction handle_orchestrator_exit(PTO2SharedMemoryHeader *header, Runtime *runtime)
+    {
+        // Decides whether the scheduler loop should stop: BREAK_LOOP once the run is flagged
+        // complete, an error code is set in the header, or every task has completed.
+        if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+
+        // The scheduler error code is only read when the orchestrator code is clean (short-circuit).
+        const bool error_latched = header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE ||
+                                   header->sched_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE;
+        if (error_latched)
+        {
+            // Only the caller that flips completed_ from false to true runs the shutdown.
+            const bool already_completed = completed_.exchange(true, std::memory_order_acq_rel);
+            if (!already_completed) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+
+        // Normal completion needs the orchestrator to be done and a positive task total.
+        if (!orchestrator_done_ || total_tasks_ <= 0) return LoopAction::NONE;
+
+        const bool all_tasks_done = completed_tasks_.load(std::memory_order_relaxed) >= total_tasks_;
+        if (!all_tasks_done) return LoopAction::NONE;
+        completed_.store(true, std::memory_order_release);
+        return LoopAction::BREAK_LOOP;
+    }
+
+    LoopAction check_idle_fatal_error(PTO2SharedMemoryHeader *header, Runtime *runtime)  // Returns BREAK_LOOP if the run is already complete or an error code is set in the header; NONE otherwise.
+    {
+        if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+        int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);  // only the caller that flips completed_ runs the shutdown
+            return LoopAction::BREAK_LOOP;
         }
+        int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);  // read only when the orchestrator code is clean
+        if (sched_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        return LoopAction::NONE;
+    }
+
+    void log_stall_diagnostics(int32_t thread_idx)  // Walks ring task slots (thread 0 only) and this thread's clusters, classifying tasks and formatting core status.
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        // NOTE(review): no log/print call is visible in this block; the counters and text buffers built below are not consumed here.
+        // T0 owns the shared-ring scan; printing it from other threads would
+        // produce identical TASK lines once per scheduler thread.
+        if (thread_idx == 0)
+        {
+            int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;  // NOTE(review): dereferenced unconditionally -- assumes every ring pointer is set
+                int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
+                submitted_in_ring += ring_task_count;
+                for (int32_t si = 0; si < ring_task_count; si++)
+                {
+                    PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
+                    // (m) task_state retired; use completion_flags directly.
+                    bool fanin_ready = sched_->fanin_satisfied(&slot_state);
+                    if (ring.completion_flags[si & ring.task_window_mask].load(std::memory_order_relaxed) != 0) continue;  // non-zero flag: skip this task
+                    char running_on[192] = {0};
+                    int32_t owner = -1;
+                    int32_t pos = 0;
+                    bool is_running = false;
+                    for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++)  // stop once fewer than 32 bytes of headroom remain
+                    {
+                        if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
+                        is_running = true;
+                        if (owner < 0) owner = find_core_owner_thread(cid);  // owner of the first matching core only
+                        const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
+                        int32_t written = snprintf(running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname);
+                        if (written > 0) pos += written;
+                    }
+                    // Classification: running on some core, else ready (fan-in satisfied), else waiting.
+                    if (is_running)
+                    {
+                        cnt_running++;
+                        if (cnt_running > STALL_DUMP_READY_MAX) continue;  // redundant with the unconditional continue on the next line
+                        continue;
+                    }
+                    if (fanin_ready)
+                    {
+                        cnt_ready++;
+                        if (cnt_ready > STALL_DUMP_READY_MAX) continue;  // redundant with the unconditional continue on the next line
+                        continue;
+                    }
+                    cnt_waiting++;
+                    if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;  // last statement of the loop body, so this has no effect
+                }
+            }
+        }
+        // Per-cluster pass: three core offsets per cluster (AIC, AIV0, AIV1), capped at STALL_DUMP_CORE_MAX clusters.
+        for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++)
+        {
+            int32_t offset = cli * 3;
+            int32_t aic_id = tracker.get_aic_core_id(offset);
+            int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
+            int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
+            bool aic_idle = tracker.is_aic_core_idle(offset);
+            bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
+            bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
+            char aic_buf[128], aiv0_buf[128], aiv1_buf[128];  // filled by format_core_status; not read afterwards in this block
+            format_core_status(aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr);
+            format_core_status(aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id], core_exec_states_[aiv0_id].reg_addr);
+            format_core_status(aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id], core_exec_states_[aiv1_id].reg_addr);
+        }
+    }
+
+    void log_shutdown_stall_snapshot()  // Runs log_stall_diagnostics() once per scheduler thread index.
+    {
+        int32_t n_threads = (active_sched_threads_ > 0) ? active_sched_threads_ : aicpu_thread_num_;  // fall back to aicpu_thread_num_ when no active count is set
+        n_threads = (n_threads < 0) ? 0 : ((n_threads > MAX_AICPU_THREADS) ? MAX_AICPU_THREADS : n_threads);  // clamp into [0, MAX_AICPU_THREADS]
+        for (int32_t tid = 0; tid < n_threads; ++tid) log_stall_diagnostics(tid);
+    }
+
+    int32_t find_core_owner_thread(int32_t core_id) const  // Index of the first thread whose tracker lists core_id, or -1 when none does.
+    {
+        for (int32_t tid = 0; tid < aicpu_thread_num_; ++tid)
+        {
+            const auto &trk = core_trackers_[tid];
+            const int32_t *const owned = trk.core_ids();
+            const int32_t owned_n = trk.core_num();
+            for (int32_t k = 0; k < owned_n; ++k) if (owned[k] == core_id) return tid;
+        }
+        return -1;
+    }
+
+    bool self_owns_running_task(int32_t thread_idx) const  // True when any core listed by this thread's tracker has a running slot attached.
+    {
+        const auto &trk = core_trackers_[thread_idx];
+        const int32_t *const owned = trk.core_ids();
+        for (int32_t k = 0, owned_n = trk.core_num(); k < owned_n; ++k)
+            if (core_exec_states_[owned[k]].running_slot_state != nullptr) return true;
+        return false;
+    }
+
+    bool no_thread_owns_running_task() const  // True when self_owns_running_task() is false for every index in [0, aicpu_thread_num_).
+    {
+        bool any_running = false;
+        for (int32_t tid = 0; !any_running && tid < aicpu_thread_num_; ++tid) any_running = self_owns_running_task(tid);
+        return !any_running;
+    }
+
+    int32_t handle_timeout_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime)
+    {
+        // Record the timeout via latch_scheduler_error first, then let exactly one caller do the teardown.
+        latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
+        const bool already_completed = completed_.exchange(true, std::memory_order_acq_rel);
+        if (already_completed) return -PTO2_ERROR_SCHEDULER_TIMEOUT;
+        log_shutdown_stall_snapshot();  // the snapshot is taken before the shutdown call
+        emergency_shutdown(runtime);
+        return -PTO2_ERROR_SCHEDULER_TIMEOUT;
+    }
+
+    uint64_t get_function_bin_addr(int func_id) const  // Looks up func_id in func_id_to_addr_; returns 0 when the lookup is not possible.
+    {
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;  // null table, or id outside [0, RUNTIME_MAX_FUNC_ID)
         return func_id_to_addr_[func_id];
     }
 };
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index c4a10369d..0dd10cd45 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -8,1477 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-#include "scheduler_context.h"
 
-#include <algorithm>
-#include <cinttypes>
-#include <limits>
-
-#include "common.h"  // debug_assert
-
-#include "common/unified_log.h"
-#include "aicpu/aicpu_device_config.h"
-#include "aicpu/device_time.h"
-#include "aicpu/platform_regs.h"
-#include "callable.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/memory_barrier.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// Performance profiling headers
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-
-#ifndef unlikely
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#endif
-
-// =============================================================================
-// Dispatch helpers
-// =============================================================================
-
-namespace {
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-}
-
-// The speculative core bitmask (PTO2_SPEC_CORE_MASK_WORDS * 64 bits) must cover
-// every global core_id, and the per-core doorbell table is sized to match.
-static_assert(
-    RUNTIME_MAX_WORKER <= PTO2_SPEC_CORE_MASK_WORDS * 64, "staged_core_mask too small for RUNTIME_MAX_WORKER cores"
-);
-
-const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {
-    switch (shape) {
-    case PTO2ResourceShape::AIC:
-        return "AIC";
-    case PTO2ResourceShape::AIV:
-        return "AIV";
-    case PTO2ResourceShape::MIX:
-        return "MIX";
-    case PTO2ResourceShape::DUMMY:
-        return "DUMMY";
-    }
-    return "UNKNOWN";
-}
-
-bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
-    // Cross-thread read of peer trackers without explicit synchronization. The
-    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
-    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
-    // value is consumed only as a scheduling *hint* — a stale read at worst
-    // causes one missed/extra pending dispatch, corrected on the next iteration.
-    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
-    // barrier (all peers spin out of the dispatch path before any tracker
-    // mutation), so this routine is never racing the drain worker.
-    for (int32_t t = 0; t < active_sched_threads_; t++) {
-        if (t == self_thread_idx) continue;
-        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
-            return true;
-        }
-    }
-    return false;
-}
-
-int SchedulerContext::pop_ready_tasks_batch(
-    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
-) {
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#if PTO2_SCHED_PROFILING
-    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
-    uint64_t t_pop_start = get_sys_cnt_aicpu();
-    int count = sched_->get_ready_tasks_batch(
-        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
-    );
-    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
-#else
-    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-#endif
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        if (count > 0) {
-            l2_swimlane.pop_hit += count;
-        } else {
-            l2_swimlane.pop_miss++;
-        }
-    }
-#else
-    (void)thread_idx;
-    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-#endif
-    return count;
-}
-
-void SchedulerContext::build_payload(
-    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-    const AsyncCtx &async_ctx, int32_t block_idx
-) {
-    int32_t slot_idx = static_cast<int32_t>(subslot);
-    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
-    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
-    dispatch_payload.function_bin_addr = callable->resolved_addr();
-    auto &payload = *slot_state.payload;
-    int n = 0;
-    for (int32_t i = 0; i < payload.tensor_count; i++) {
-        dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
-    }
-    for (int32_t i = 0; i < payload.scalar_count; i++) {
-        dispatch_payload.args[n++] = payload.scalars[i];
-    }
-    dispatch_payload.local_context.block_idx = block_idx;
-    dispatch_payload.local_context.block_num = slot_state.logical_block_num;
-    dispatch_payload.local_context.async_ctx = async_ctx;
-    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
-    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
-    // Speculative early-dispatch: a task being staged (Hook 1 set spec_state to
-    // STAGING before this call) is gated — the AICore must wait for the
-    // DATA_MAIN_BASE high-32 doorbell. All other dispatches run on pickup.
-    dispatch_payload.not_ready =
-        (slot_state.payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) ? 1 : 0;
-}
-
-SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
-    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending,
-    int32_t block_idx
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    auto core_id = tracker.get_core_id_by_offset(core_offset);
-    CoreExecState &core_exec_state = core_exec_states_[core_id];
-
-    core_exec_state.dispatch_seq++;
-    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
-    static_assert(
-        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
-    );
-    if (reg_task_id >= AICORE_EXIT_SIGNAL) {
-        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
-        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
-    }
-
-    uint32_t buf_idx = reg_task_id & 1u;
-    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
-    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
-    deferred_slab->count = 0;
-    deferred_slab->error_code = PTO2_ERROR_NONE;
-    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
-    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
-
-    if (to_pending) {
-        core_exec_state.pending_subslot = subslot;
-        core_exec_state.pending_slot_state = &slot_state;
-        core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
-    } else {
-        core_exec_state.running_subslot = subslot;
-        core_exec_state.running_slot_state = &slot_state;
-        core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
-        tracker.change_core_state(core_offset);
-    }
-    tracker.set_pending_occupied(core_offset);
-
-    LOG_DEBUG(
-        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
-        " core_offset=%d core_id=%d reg_task_id=%u",
-        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
-        static_cast<int64_t>(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
-        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
-        core_offset, core_id, reg_task_id
-    );
-
-    // AICore buffer rotation lives on the dispatch path: count this dispatch
-    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
-    // boundary. The completion-before-dispatch invariant makes this race-free
-    // (all prior tasks on this core have FIN'd, so AICore has dcci'd their
-    // records out of the old buffer). Gated on the same enable bit as flush
-    // so level=1 (AICORE_TIMING-only) participates without needing complete_task.
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
-        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
-    }
-#endif
-
-    uint64_t *dispatch_timestamp_slot = nullptr;
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-        dispatch_timestamp_slot =
-            to_pending ? &core_exec_state.pending_dispatch_timestamp : &core_exec_state.running_dispatch_timestamp;
-    }
-#endif
-
-    return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
-}
-
-int SchedulerContext::prepare_block_for_dispatch(
-    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending,
-    int32_t block_idx, PublishHandle *out_handles
-) {
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
-            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
-            [](ActiveMask active_mask, int raw_subtask_id) {
-                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-            },
-            [this](int32_t func_id) {
-                return get_function_bin_addr(func_id);
-            }
-        );
-    }
-#endif
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    if (shape == PTO2ResourceShape::MIX) {
-        uint8_t cmask = slot_state.active_mask.core_mask();
-        int n = 0;
-        if (cmask & PTO2_SUBTASK_MASK_AIC) {
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, to_pending,
-                block_idx
-            );
-        }
-        if (cmask & PTO2_SUBTASK_MASK_AIV0) {
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, to_pending,
-                block_idx
-            );
-        }
-        if (cmask & PTO2_SUBTASK_MASK_AIV1) {
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, to_pending,
-                block_idx
-            );
-        }
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(cmask);
-#endif
-        return n;
-    } else if (shape == PTO2ResourceShape::AIC) {
-        out_handles[0] =
-            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
-#endif
-        return 1;
-    } else {
-        out_handles[0] =
-            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
-#endif
-        return 1;
-    }
-}
-
-void SchedulerContext::dispatch_shape(
-    int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
-    CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
-) {
-#if PTO2_SCHED_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#endif
-    if (entered_drain) return;
-
-    bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
-    bool is_mix = (shape == PTO2ResourceShape::MIX);
-    auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
-    if (!cores.has_value()) return;
-
-    while (cores.has_value() && !entered_drain) {
-        int want = cores.count();
-        PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
-        int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
-        if (got == 0) break;
-
-        // sync_start exclusion gate.
-        //
-        // When the popped batch contains a sync_start task we MUST publish each
-        // prior task with its own wmb so AICore receives them with time
-        // separation. The drain coordinator's `count_global_available()` check
-        // reads the per-thread CoreTracker, and although `prepare_block_for_dispatch`
-        // marks cores occupied synchronously, the head-start between successive
-        // tasks is what lets the surrounding completion loop catch up on FINs in
-        // the retry window when the sync_start task hits insufficient resources.
-        // Bursting all prior tasks at the end of the pop (cross-task batching)
-        // collapses that head-start and causes spmd_sync_start_stress to time
-        // out via 507018 on ~40% of runs — see
-        // docs/investigations/2026-06-cross-task-batched-publish.md.
-        //
-        // When the batch carries no sync_start task, no drain entry can happen
-        // in this pop, so we hoist `handles[]`, `wmb()`, and the publish loop
-        // out of the per-task body. One wmb amortizes across all tasks and one
-        // dispatch_ts is shared, which restores ~60 ns first-to-last AICore
-        // start span for single-block decode kernels (out_proj, q_proj, ...).
-        // Detection is a single mask check per task — cheap relative to even
-        // one register write.
-        bool any_sync_start = false;
-        for (int bi = 0; bi < got; bi++) {
-            if (batch[bi]->active_mask.requires_sync_start()) {
-                any_sync_start = true;
-                break;
-            }
-        }
-
-        // handles[] is sized for the MIX worst case: total claims across the
-        // pop bounded by `cores.count() ≤ MAX_CLUSTERS`, and each block
-        // contributes ≤ 3 subtasks for MIX.
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int handle_count = 0;
-        bool dispatched_any = false;
-        // Slots dispatched this pop whose dispatch_fanin must be propagated to
-        // consumers. Deferred until AFTER publish (below) so a flagged producer's
-        // fanout walk never sits between claiming cores and publishing its own
-        // blocks — doing it inline delays this thread's blocks while peer threads
-        // co-dispatching the same SPMD task publish immediately, misaligning the
-        // task's block starts. Bounded by cores.count() ≤ MAX_CLUSTERS dispatches.
-        PTO2TaskSlotState *prop_list[CoreTracker::MAX_CLUSTERS];
-        int prop_n = 0;
-#if PTO2_SCHED_PROFILING
-        uint64_t t_setup_start = get_sys_cnt_aicpu();
-#endif
-
-        // Flush prepared-but-unpublished handles. Required before
-        // `enter_drain_mode` so the drain coordinator sees cores as occupied,
-        // and at the per-task boundary when `any_sync_start` is true.
-        auto flush_publish = [&]() {
-            if (handle_count == 0) return;
-            wmb();
-            uint64_t dispatch_ts = 0;
-#if PTO2_PROFILING
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-                dispatch_ts = get_sys_cnt_aicpu();
-            }
-#endif
-            for (int i = 0; i < handle_count; i++) {
-                publish_subtask_to_core(handles[i], dispatch_ts);
-            }
-            handle_count = 0;
-            made_progress = true;
-        };
-
-        for (int bi = 0; bi < got; bi++) {
-            PTO2TaskSlotState *slot_state = batch[bi];
-            CoreTracker::BitStates selected_mix_clusters(0ULL);
-
-            if (is_mix) {
-                auto candidates = cores;
-                uint8_t cmask = slot_state->active_mask.core_mask();
-                auto wanted = is_pending ? CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING;
-                while (candidates.has_value()) {
-                    int32_t cluster_offset = candidates.pop_first();
-                    if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) {
-                        selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset);
-                    }
-                }
-                if (!selected_mix_clusters.has_value()) {
-                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    continue;
-                }
-            }
-
-            // (Speculative pre-staged tasks never reach this ready-pop: they are
-            // released by their doorbell in release_fanin_and_check_ready the
-            // instant their last producer completes — see try_speculative_release.)
-
-            if (slot_state->active_mask.requires_sync_start()) {
-                if (is_pending) {
-                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    continue;
-                }
-                int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
-                if (available < slot_state->logical_block_num) {
-                    flush_publish();
-                    if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
-                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    }
-                    for (int rem = bi + 1; rem < got; rem++) {
-                        sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
-                    }
-                    entered_drain = true;
-                    break;
-                }
-            }
-
-            if (!cores.has_value()) {
-                flush_publish();
-                sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
-                break;
-            }
-
-            dispatched_any = true;
-            try_pushed = true;
-            // Record for deferred dispatch_fanin propagation after this pop's
-            // blocks are published (see after the loop). propagate's own guard
-            // filters non-flagged slots, so recording unconditionally is cheap.
-            if (prop_n < static_cast<int>(sizeof(prop_list) / sizeof(prop_list[0]))) {
-                prop_list[prop_n++] = slot_state;
-            }
-            // Claim a contiguous range of blocks, hand the slot back to the
-            // ready queue immediately, then perform the expensive dispatches.
-            // This lets other schedulers concurrently claim and dispatch the
-            // remaining blocks of the same SPMD task instead of spinning while
-            // this thread fills all its own cores. Only local `start + b` is
-            // read after the push — `next_block_idx` may already be advanced
-            // by another scheduler that popped the slot.
-            int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
-            int32_t remaining = slot_state->logical_block_num - start;
-            int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
-            int32_t claim = std::min(available, remaining);
-            slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
-
-            if (start + claim < slot_state->logical_block_num) {
-                sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-            }
-
-            for (int32_t b = 0; b < claim; b++) {
-                auto core_offset = is_mix ? selected_mix_clusters.pop_first() : cores.pop_first();
-                if (is_mix) {
-                    cores.clear_bit(core_offset);
-                }
-                handle_count += prepare_block_for_dispatch(
-                    thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]
-                );
-            }
-
-            // Sync_start exclusion: flush per task so prior tasks have head-
-            // start time before any sync_start drain check. Normal batches
-            // fall through and accumulate for one cross-task flush at the
-            // end of the pop.
-            if (any_sync_start) {
-                flush_publish();
-            }
-        }
-
-        flush_publish();
-        // Blocks are published; now propagate dispatch_fanin for any flagged
-        // producers dispatched above (knob A: producer is running). Off the
-        // pre-publish path so it cannot delay or misalign their blocks.
-        for (int i = 0; i < prop_n; i++) {
-            sched_->propagate_dispatch_fanin(*prop_list[i]);
-        }
-#if PTO2_SCHED_PROFILING
-        l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
-#endif
-
-        if (!dispatched_any) break;
-
-        if (!cores.has_value()) {
-            cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
-        }
-    }
-}
-
-void SchedulerContext::dispatch_ready_tasks(
-    int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
-    bool pmu_active, bool &made_progress, bool &try_pushed
-) {
-    using Phase = CoreTracker::DispatchPhase;
-    constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
-
-    // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle
-    // through this 2-elem array, with order toggled by thread parity for
-    // shape-level load balancing across threads.
-    static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
-        {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
-        {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
-    };
-    const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
-
-    // Spill overflow from local_bufs to the shared ready queue BEFORE we start
-    // dispatching. release_fanin's fast path packs all newly-ready consumers
-    // into the producing thread's local_bufs (zero atomic, peer-invisible). For
-    // batch releases (e.g. attn_fence → 50 out_proj consumers) that
-    // overshoots this thread's slot budget so peers are starving while we
-    // hoard. The cross-thread invisibility window between "complete pushes 50
-    // to local" and "IDLE-AIC's mid-phase flush exposes overflow to shared"
-    // is what shows up in the swimlane as the multi-microsecond inter-thread
-    // stagger on out_proj's first wave.
-    //
-    // Gate conditions:
-    //   (a) local count exceeds this thread's per-shape block budget — we
-    //       can't dispatch them all even with both RUNNING+PENDING slots;
-    //   (b) at least one peer has idle cores in this shape — they want work.
-    // Both must hold to avoid wasting a CAS push when we could profitably
-    // self-dispatch the overflow. Condition (b) reads peer CoreTracker
-    // (plain 8-byte load on a rarely-contended cache line, ~5 ns) — we
-    // deliberately avoid ready_queues[s].size() here, which is two atomic
-    // loads on lines pushers + poppers actively bounce.
-    //
-    // Capacity derives from how cores are partitioned across sched threads:
-    //   per-shape budget = (PLATFORM_MAX_BLOCKDIM / active_sched_threads_)
-    //                       × cores_per_blockdim_for_that_shape
-    //   MIX is 1 cluster per block dim, so its budget equals the block-dim
-    //   share without multiplying.
-    //
-    // Push the trailing `excess` slot pointers — O(1) count decrement, no
-    // memmove. push_batch is one CAS for the whole excess; peers see the
-    // batch immediately and can race for them.
-    const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;
-    const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {
-        /*AIC=*/bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
-        /*AIV=*/bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
-        /*MIX=*/bd_per_thread,
-    };
-    for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-        auto &lb = local_bufs[s];
-        int32_t excess = lb.count - thread_capacity[s];
-        if (excess <= 0) continue;
-        if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;
-        sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
-        lb.count -= excess;
-    }
-
-    auto flush_local_bufs = [&]() {
-        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-            auto &lb = local_bufs[s];
-            if (lb.count > 0) {
-                sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
-                lb.count = 0;
-            }
-        }
-    };
-    // Every return path below must flush; wrap in RAII so we cannot forget.
-    // The mid-function flush between IDLE and PENDING is still called
-    // explicitly — guard only covers exit.
-    struct FlushGuard {
-        decltype(flush_local_bufs) &flush_fn;
-        ~FlushGuard() { flush_fn(); }
-    } flush_guard{flush_local_bufs};
-
-    bool entered_drain = false;
-
-    // ===== IDLE stage =====
-    dispatch_shape(
-        thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress,
-        try_pushed
-    );
-    if (entered_drain) return;
-
-    // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass.
-    // MIX-PENDING below still runs — that is the core of "mix strict priority":
-    // pending slots are spent on mix before AIC/AIV get any chance.
-    bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
-
-    if (!skip_aic_aiv) {
-        for (int i = 0; i < 2; i++) {
-            PTO2ResourceShape s = aic_aiv[i];
-            dispatch_shape(
-                thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
-                try_pushed
-            );
-            if (entered_drain) return;
-        }
-    }
-
-    // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
-    // peer-thread reads see the IDLE-stage release_fanin output.
-    flush_local_bufs();
-
-    if (pmu_active) return;
-
-    // ===== PENDING stage =====
-    // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that
-    // peer's next IDLE-MIX iteration will pull the mix task from the global
-    // queue (already flushed above) at lower latency than us pre-loading a
-    // pending slot here. Forward progress for MIX is preserved: at least one
-    // thread will run MIX-IDLE next pass and consume the residual.
-    //
-    // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain
-    // via pending slots on this thread when no peer is idle.
-    if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) {
-        dispatch_shape(
-            thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain,
-            made_progress, try_pushed
-        );
-        if (entered_drain) return;
-    }
-
-    // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
-    // it set; otherwise, escalate iff PENDING-MIX left residual.
-    if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) {
-        skip_aic_aiv = true;
-    }
-
-    // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin
-    // during in-flight completions; flush_guard ensures these don't carry
-    // across to the next iteration's IDLE stage.
-    if (skip_aic_aiv) return;
-
-    // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
-    // will pull from the global queue on its next IDLE pass.
-    for (int i = 0; i < 2; i++) {
-        PTO2ResourceShape s = aic_aiv[i];
-        if (has_idle_in_other_threads(thread_idx, s)) continue;
-        dispatch_shape(
-            thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
-            try_pushed
-        );
-        if (entered_drain) return;
-    }
-}
-
-// Stage the ALREADY-CLAIMED range [start, start+count) of consumer `c` onto
-// thread_idx's idle then pending cores. The caller (the queue drain) has advanced
-// next_block_idx by `count` under pop-exclusivity AND re-pushed `c` for peers
-// BEFORE calling this — so this, the expensive prepare+publish, runs CONCURRENTLY
-// with peers staging other ranges of the same consumer. This mirrors the normal
-// SPMD dispatch path (claim range -> store next_block_idx -> re-push -> dispatch).
-// `idle`/`pend` are this thread's free-core sets, sized so idle.count+pend.count >=
-// count (the caller clamped the claim to them), so all `count` blocks get a core.
-//
-// Rule 1: idle cores -> gated task in the RUNNING slot. Rule 2: PENDING slot of
-// cores running a real task -> promoted in when that task FINs (gated-pending Case
-// 3.3 in decide_slot_transition completes the running FIN + promotes instead of
-// waiting for an ack the gated task never sends). Each staged core stays
-// pending_occupied while gated, so no second gated block stacks on it.
-//
-// Self-ring: release flips STAGING->DISPATCHED then rings the mask. A block staged
-// after that flip isn't in the mask release read, so this thread rings it here. The
-// seq_cst order between "OR mask then load spec_state" (here) and "store DISPATCHED
-// then read mask" (release) guarantees every gated core's doorbell fires.
-int32_t SchedulerContext::stage_consumer_blocks(
-    int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
-    CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    // Stamp the real pre-stage time (NOT 0) so the swimlane shows these blocks
-    // dispatched during the producer's run, not at trace start.
-    uint64_t early_dispatch_ts = get_sys_cnt_aicpu();
-    uint64_t my_cores[PTO2_SPEC_CORE_MASK_WORDS] = {0};  // cores this thread gated (for self-ring)
-    int32_t staged = 0;
-    int32_t block = start;
-    auto stage_from = [&](CoreTracker::BitStates &avail, bool to_pending) {
-        // Mirror the normal flush_publish (scheduler_dispatch.cpp wmb()+publish loop):
-        // prepare all claimed blocks' payloads, one wmb(), then publish. The wmb
-        // guarantees the not_ready gate + args are globally visible before any
-        // DATA_MAIN_BASE token — without it a gated core can pick up the token and
-        // dcci a stale payload (the doorbell/release path mirrors normal dispatch).
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int n = 0;
-        while (count > 0 && avail.has_value()) {
-            int32_t core_offset = avail.pop_first();
-            n += prepare_block_for_dispatch(thread_idx, core_offset, *c, shape, to_pending, block, &handles[n]);
-            block++;
-            count--;
-            staged++;
-        }
-        if (n == 0) return;
-        wmb();
-        for (int i = 0; i < n; i++) {
-            publish_subtask_to_core(handles[i], early_dispatch_ts);
-            int32_t cid = tracker.get_core_id_by_offset(handles[i].core_offset);
-            sched_->spec_doorbell_table[cid].addr = handles[i].reg_addr;
-            sched_->spec_doorbell_table[cid].token = handles[i].reg_task_id;
-            my_cores[cid >> 6] |= (1ULL << (cid & 63));
-        }
-    };
-    if (idle.has_value()) stage_from(idle, /*to_pending=*/false);
-    if (pend.has_value()) stage_from(pend, /*to_pending=*/true);
-    // Publish all this thread's gated cores into the shared mask in one OR per word
-    // (vs one per subtask) so release sees them; seq_cst keeps the self-ring order.
-    for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
-        if (my_cores[w] != 0) c->payload->staged_core_mask[w].fetch_or(my_cores[w], std::memory_order_seq_cst);
-
-    // If release already flipped DISPATCHED, it may have read the mask before our
-    // bits landed — ring our own cores so none is left gated forever.
-    if (staged > 0 && c->payload->spec_state.load(std::memory_order_seq_cst) == PTO2_SPEC_DISPATCHED) {
-        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
-            uint64_t bits = my_cores[w];
-            while (bits != 0) {
-                int cid = w * 64 + __builtin_ctzll(bits);
-                bits &= bits - 1;
-                PTO2SchedulerState::ring_one_doorbell(
-                    sched_->spec_doorbell_table[cid].addr, sched_->spec_doorbell_table[cid].token
-                );
-            }
-        }
-    }
-    return staged;
-}
-
-// Early-dispatch drain (idle pass). Candidates are pushed to early_dispatch_queue
-// EVENT-DRIVEN by propagate_dispatch_fanin (a flagged producer's dispatch bumps its
-// consumers' dispatch_fanin; reaching fanin_count enqueues the consumer) — there is
-// no per-iteration PULL scan here anymore. This pass only DRAINS the queue.
-// Returns the number of blocks staged this pass (for the EarlyDispatch swimlane bar).
-int32_t SchedulerContext::try_speculative_early_dispatch(int32_t thread_idx) {
-    constexpr int PTO2_EARLY_DISPATCH_DRAIN_MAX = 8;  // bounded pops per pass
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    int32_t total_staged = 0;
-
-    // Drain the queue — mirrors the normal SPMD dispatch path. Pop a consumer,
-    // CLAIM a range sized to THIS thread's free cores by advancing next_block_idx with
-    // a CAS (atomic — next_block_idx is shared with normal dispatch, which also claims
-    // it if release routes the consumer to the ready queue, so a plain store could
-    // double-dispatch), RE-PUSH it for peers, THEN do the expensive prepare+publish.
-    // Re-pushing before staging lets peers claim the next range and stage CONCURRENTLY
-    // — a wide consumer (online_softmax, 48 blocks) is filled by all idle threads in
-    // parallel instead of a serial winner-then-peer daisy chain. Bounded pops/pass.
-    for (int n = 0; n < PTO2_EARLY_DISPATCH_DRAIN_MAX; n++) {
-        PTO2TaskSlotState *c = sched_->early_dispatch_queue.pop();
-        if (c == nullptr) break;
-        if (c->payload->spec_state.load(std::memory_order_acquire) != PTO2_SPEC_STAGING) continue;  // released
-        PTO2ResourceShape shape = c->active_mask.to_shape();
-        auto idle = tracker.get_idle_core_offset_states(shape);
-        auto pend = tracker.get_pending_core_offset_states(shape);
-        int32_t freecores = (idle.has_value() ? idle.count() : 0) + (pend.has_value() ? pend.count() : 0);
-        if (freecores == 0) {  // no free cores of this shape — give it back for peers and stop
-            sched_->early_dispatch_queue.push(c);
-            break;
-        }
-        // CAS-claim a contiguous range [start, start+claim) sized to this thread's
-        // free cores; CAS keeps it atomic against peers AND normal dispatch.
-        int32_t start = 0, claim = 0;
-        while (true) {
-            int16_t cur = c->next_block_idx.load(std::memory_order_relaxed);
-            if (cur >= c->logical_block_num) break;  // fully claimed
-            int32_t cnt = c->logical_block_num - cur;
-            if (cnt > freecores) cnt = freecores;
-            if (c->next_block_idx.compare_exchange_weak(
-                    cur, static_cast<int16_t>(cur + cnt), std::memory_order_seq_cst, std::memory_order_relaxed
-                )) {
-                start = cur;
-                claim = cnt;
-                break;
-            }
-        }
-        if (claim == 0) continue;  // nothing left to claim -> drop (no re-push)
-        // Re-push for concurrent peers BEFORE the expensive staging.
-        if (start + claim < c->logical_block_num) {
-            if (!sched_->early_dispatch_queue.push(c))
-                LOG_INFO_V9(
-                    "[SPEC] queue full on re-push, consumer=%" PRId64, static_cast<int64_t>(c->task->task_id.raw)
-                );
-        }
-        total_staged += stage_consumer_blocks(thread_idx, c, shape, start, claim, idle, pend);
-    }
-    return total_staged;
-}
-
-// =============================================================================
-// Main scheduler dispatch loop
-// =============================================================================
-
-int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
-    always_assert(sched_ != nullptr);
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
-
-    PTO2SharedMemoryHeader *header = sched_->sm_header;
-    if (!header) {
-        LOG_ERROR("PTO2 dispatch: header is null");
-        return -1;
-    }
-    LOG_INFO_V0(
-        "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast<void *>(header),
-        static_cast<uint64_t>(header->rings[0].task_descriptors_offset),
-        static_cast<uint64_t>(header->rings[0].task_window_size)
-    );
-
-    Handshake *hank = static_cast<Handshake *>(runtime->dev.workers);
-    LOG_INFO_V0(
-        "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast<void *>(hank),
-        static_cast<uint64_t>(header->rings[0].task_window_size)
-    );
-
-    LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num());
-    int32_t cur_thread_completed = 0;
-    // Non-zero once a scheduler-hang timeout latches; returned in place of the
-    // completed count so the caller still sees the negative error rc while the
-    // shared end-of-loop flush below runs.
-    int32_t timeout_rc = 0;
-    int32_t idle_iterations = 0;
-    int32_t last_progress_count = 0;
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-    l2_swimlane.reset();
-    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
-#endif
-
-    constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
-    PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
-    PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
-    for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
-    }
-    PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
-    int32_t deferred_release_count = 0;
-
-    // PMU runs require single-issue dispatch — overlapping in-flight tasks
-    // pollute per-task PMU counters, so skip the PENDING pre-load phase.
-    // Cached at function scope: is_pmu_enabled() is extern "C" and the
-    // compiler cannot hoist it across the dispatch loop on its own.
-#if PTO2_PROFILING
-    const bool pmu_active = is_pmu_enabled();
-#else
-    // PMU is definitionally off when profiling is compiled out; hard-set false
-    // so dispatch keeps its overlapping (non-single-issue) fast path.
-    constexpr bool pmu_active = false;
-#endif
-
-#if PTO2_PROFILING
-    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
-#endif
-
-#if PTO2_PROFILING
-    // Queue-depth snapshot carried across the iteration boundary: each phase
-    // emit consumes (phase_start_*) and refreshes them with its own end snapshot
-    // so the next phase's "at_start" equals the previous phase's "at_end".
-    //
-    // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX.
-    //
-    // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer)
-    // is a single int read on a register-cached stack — free. Shared depth
-    // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines
-    // that all peer sched threads also write to (enqueue_pos and dequeue_pos
-    // bounce on every flush_local_bufs + every pop). With both phases emitting
-    // per iter that's 12 cross-core loads × thousands of iters per run, a
-    // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared
-    // snapshot, refreshed at most once per iteration. The complete-emit and
-    // dispatch-emit in the same iter both reuse the same shared sample; the
-    // big transitions (local→shared flush) still show up across iter boundaries.
-    static_assert(
-        L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES,
-        "queue snapshot width must match runtime resource shape count"
-    );
-    int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    bool iter_shared_sampled = false;
-    auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
-        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-            local_out[s] = static_cast<int16_t>(local_bufs[s].count);
-        }
-    };
-    auto get_or_sample_shared = [&]() -> const int16_t * {
-        if (!iter_shared_sampled) {
-            // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE
-            // is in the low thousands today but could grow with platform
-            // scaling — without clamp, sizes above 32767 wrap to negatives
-            // and silently corrupt the snapshot.
-            constexpr size_t kMax = static_cast<size_t>(std::numeric_limits<int16_t>::max());
-            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                const size_t qsize = sched_->ready_queues[s].size();
-                iter_shared_snapshot[s] = static_cast<int16_t>(std::min(qsize, kMax));
-            }
-            iter_shared_sampled = true;
-        }
-        return iter_shared_snapshot;
-    };
-    auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES],
-                                 int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
-        capture_local_snapshot(local_out);
-        const int16_t *shared_cached = get_or_sample_shared();
-        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++)
-            shared_out[s] = shared_cached[s];
-    };
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        capture_phase_end(phase_start_local, phase_start_shared);
-    }
-#endif
-
-    // Wall-clock timestamp of the last completed task on this thread.
-    // Updated on made_progress; consulted to decide whether the wall-clock
-    // budget for declaring a scheduler hang has elapsed. Initialized to
-    // "now" so the first budget cycle starts when this thread does, not at
-    // an undefined value.
-    uint64_t last_progress_ts = get_sys_cnt_aicpu();
-    // Per-device override latched once at worker init by simpler_aicpu_init
-    // (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no
-    // override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES.
-    uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
-    const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms();
-    if (scheduler_timeout_ms_override > 0) {
-        scheduler_timeout_cycles =
-            static_cast<uint64_t>(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
-    }
-
-    while (true) {
-        if (completed_.load(std::memory_order_acquire)) {
-            break;
-        }
-        bool made_progress = false;
-#if PTO2_PROFILING
-        CYCLE_COUNT_START();
-        l2_swimlane.sched_loop_count++;
-        uint64_t _t0_phase = _t0;
-        // Release is the only "no Complete/Dispatch bar" attribution we keep —
-        // emitted with its own span in the idle branch below. Iterations that
-        // only scan/poll show as blank gaps; the per-loop Poll/Scan bars (PR
-        // #1079 debug overlay) were removed since "scheduler is polling when
-        // there's nothing to do" carries no actionable signal.
-        // Per-iter lazy shared-queue snapshot: first phase emit in this iter
-        // pays the atomic-load cost, subsequent emits in the same iter reuse
-        // the cached value. Reset here so we re-sample exactly once per iter
-        // (or skip entirely on iters with no phase emit).
-        iter_shared_sampled = false;
-#endif
-        int32_t task_count = 0;
-        if (!tracker.has_any_running_cores()) {
-            LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
-#if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-#endif
-
-        // Phase 1: Check running cores for completion
-        int32_t completed_this_turn = 0;
-
-        bool try_completed = tracker.has_any_running_cores();
-        if (try_completed) {
-            check_running_cores_for_completion(
-                thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress,
-                deferred_release_slot_states, deferred_release_count, local_bufs
-            );
-        }
-        if (completed_this_turn > 0) {
-#if PTO2_SCHED_PROFILING
-            sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed);
-#endif
-            int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
-            int32_t new_total = prev + completed_this_turn;
-            last_progress_count = new_total;
-            if (thread_idx == 0 && task_count > 0) {
-                if (new_total <= PROGRESS_VERBOSE_THRESHOLD ||
-                    new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) {
-                    LOG_INFO_V9(
-                        "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count,
-                        100.0 * new_total / task_count
-                    );
-                }
-            }
-        }
-
-        if (rt_ != nullptr && rt_->aicore_mailbox != nullptr &&
-            (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) {
-            AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(
-                rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count,
-                PTO2_DEFERRED_RELEASE_CAP
-#if PTO2_SCHED_PROFILING
-                ,
-                thread_idx
-#endif
-            );
-            if (poll_result.error_code != PTO2_ERROR_NONE) {
-                int32_t expected = PTO2_ERROR_NONE;
-                header->sched_error_code.compare_exchange_strong(
-                    expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire
-                );
-                completed_.store(true, std::memory_order_release);
-                break;
-            }
-            if (poll_result.completed > 0) {
-#if PTO2_SCHED_PROFILING
-                sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed);
-#endif
-                int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
-                int32_t new_total = prev + poll_result.completed;
-                last_progress_count = new_total;
-                made_progress = true;
-            }
-        }
-
-#if PTO2_PROFILING
-        if (!try_completed) {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-        } else {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
-            // Emit on any completion work this iteration — a finished slot OR
-            // sub-block retires that did not finish a slot. The latter makes the
-            // SPMD harvest tail visible (count field = blocks processed this
-            // iteration; on a pure-retire iteration phase_complete_count is 0).
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES &&
-                (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
-                // Local depth is cheap (this thread's own buffer counter).
-                // Shared depth is NOT sampled here: complete's release_fanin
-                // pushes to local_bufs in the fast path (try_push succeeds
-                // until cap=64). Shared only changes on dispatch's flush
-                // path. Carrying phase_start_shared forward as end_shared
-                // is the right answer 99% of the time AND skips three
-                // contended atomic loads per emit.
-                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                capture_local_snapshot(phase_end_local);
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count,
-                    l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count, /*pop_hit=*/0,
-                    /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
-                );
-                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                    phase_start_local[s] = phase_end_local[s];
-                    // phase_start_shared unchanged — carried forward
-                }
-                _t0_phase = _t1;
-                l2_swimlane.phase_complete_count = 0;
-                l2_swimlane.phase_subretire_count = 0;
-            }
-        }
-#endif
-
-        bool try_pushed = false;
-
-        // Phase 2 drain check
-        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
-            handle_drain_mode(thread_idx);
-            continue;
-        }
-
-        // Phase 3: Drain wiring queue (thread 0 only)
-        int wired = 0;
-        if (thread_idx == 0) {
-            wired = sched_->drain_wiring_queue(orchestrator_done_.load(std::memory_order_relaxed));
-            if (wired > 0) {
-                made_progress = true;
-#if PTO2_SCHED_PROFILING
-                l2_swimlane.phase_wiring_count += wired;
-#endif
-            }
-        }
-#if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
-        // Wire outer phase: emit one bar covering this iter's drain_wiring_queue
-        // pass when it wired any tasks. tasks_processed = wired count. Resolve
-        // does NOT nest under Wire — wiring only enqueues, the consumer release
-        // happens later in Complete/Dummy.
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && wired > 0) {
-            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            capture_local_snapshot(phase_end_local);
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::Wire, _t0_phase, _t1, l2_swimlane.sched_loop_count,
-                static_cast<uint32_t>(wired), /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local, phase_start_shared,
-                phase_end_local, phase_start_shared
-            );
-            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                phase_start_local[s] = phase_end_local[s];
-            }
-            _t0_phase = _t1;
-        }
-#endif
-
-        // Phase 3b: Drain dummy ready queue (thread 0 only).
-        //
-        // Dependency-only tasks bypass AICore dispatch: they go through the
-        // scheduler so fanin/fanout edges stay consistent, but completion is
-        // signalled inline here. Pinned to thread 0 to avoid cross-thread
-        // races and to keep cache hot near the wiring drain above.
-        if (thread_idx == 0) {
-            constexpr int DUMMY_DRAIN_BATCH = 16;
-            PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
-            int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
-#if PTO2_PROFILING
-            // Dummy outer phase: covers handling of all dummies popped this
-            // iter. Per-dummy DummyTask markers are emitted to a SEPARATE lane
-            // (Worker View AICPU_N) by the converter, so they do not nest
-            // under this bar. Resolve emits below DO land on the sched lane
-            // and nest under this Dummy outer by time containment.
-            uint64_t dummy_outer_t0 =
-                (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-            for (int di = 0; di < dummy_got; di++) {
-                PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
-
-                // ----- DummyTask phase: dummy "task" identity marker. --------
-                // The dummy has no AICore presence — start ≈ end (1 cycle
-                // wide, just "we identified it"). Converter renders this on
-                // Worker View's DUMMY_T{thread} lane so the DAG node is
-                // visually present. tasks_processed = task_token low 32 bits
-                // (= local_id within ring) so deps.json flow arrows can land.
-                // The Resolve work that follows is emitted separately below.
-#if PTO2_PROFILING
-                if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-                    uint64_t dummy_marker_t = get_sys_cnt_aicpu();
-                    uint32_t dummy_id_low32 = static_cast<uint32_t>(dummy_slot.task->task_id.raw & 0xFFFFFFFFu);
-                    l2_swimlane_aicpu_record_sched_phase(
-                        thread_idx, L2SwimlaneSchedPhaseKind::DummyTask, dummy_marker_t, dummy_marker_t,
-                        sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_id_low32
-                    );
-                }
-#endif
-
-                // ----- Resolve work: walk this dummy's consumer list. ------
-                // Same 1 µs filter as the main-path Resolve emit suppresses
-                // dummies whose consumer release runs sub-microsecond.
-#if PTO2_PROFILING
-                uint64_t dummy_resolve_t0 =
-                    (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-                // [[maybe_unused]] silences -Werror=unused-but-set-variable on
-                // the profiling-flags-smoke build path where PTO2_PROFILING is
-                // OFF and the Resolve emit below is excluded.
-                [[maybe_unused]] uint32_t dummy_consumers = 0;
-#if PTO2_SCHED_PROFILING
-                dummy_consumers = sched_->on_task_complete(dummy_slot, thread_idx, local_bufs).fanout_edges;
-#else
-                dummy_consumers = sched_->on_task_complete(dummy_slot, local_bufs);
-#endif
-#if PTO2_PROFILING
-                if (dummy_resolve_t0 != 0) {
-                    uint64_t dummy_resolve_t1 = get_sys_cnt_aicpu();
-                    constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
-                    if (dummy_resolve_t1 - dummy_resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
-                        l2_swimlane_aicpu_record_sched_phase(
-                            thread_idx, L2SwimlaneSchedPhaseKind::Resolve, dummy_resolve_t0, dummy_resolve_t1,
-                            sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_consumers
-                        );
-                    }
-                }
-#endif
-                // Dummy tasks have no subtasks to retire and no fanout pre-conditions
-                // beyond their own producers; release self-reference so the slot can
-                // reach CONSUMED once all consumers drain.
-                deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
-                if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
-                    while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                        (void)sched_->on_task_release(
-                            *deferred_release_slot_states[--deferred_release_count], thread_idx
-                        );
-#else
-                        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-                    }
-                }
-                int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
-                last_progress_count = prev + 1;
-                cur_thread_completed++;
-            }
-            if (dummy_got > 0) {
-                made_progress = true;
-            }
-#if PTO2_PROFILING
-            // Emit Dummy outer over the whole dummy_drain pass. Span starts at
-            // dummy_outer_t0 (captured before the pop_batch) and ends at "now".
-            // tasks_processed = dummy_got. Advancing _t0_phase here makes the
-            // following Dispatch / EarlyDispatch / second-Complete bars start
-            // at this end.
-            if (dummy_outer_t0 != 0) {
-                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                capture_local_snapshot(phase_end_local);
-                uint64_t dummy_outer_t1 = get_sys_cnt_aicpu();
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Dummy, dummy_outer_t0, dummy_outer_t1,
-                    l2_swimlane.sched_loop_count, static_cast<uint32_t>(dummy_got), /*pop_hit=*/0,
-                    /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
-                );
-                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                    phase_start_local[s] = phase_end_local[s];
-                }
-                _t0_phase = dummy_outer_t1;
-                // We do NOT re-sync _t0/_t1 — the dummy span will be absorbed
-                // into the next CYCLE_COUNT_LAP accumulator. The phase-model
-                // anchor (_t0_phase) is the authoritative source for bar spans
-                // on the swimlane; the cycle accumulators are coarse aggregates.
-            }
-#endif
-        }
-
-        // Phase 4: MIX-strict-priority dispatch with phase-split and
-        // cross-thread idle gating. See dispatch_ready_tasks for the policy.
-#if PTO2_PROFILING
-        uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-        dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
-#if PTO2_PROFILING
-        // Emit Dispatch IMMEDIATELY after dispatch_ready_tasks so its span
-        // covers the actual publish work — not the trailing second-poll /
-        // early-dispatch time. (Pre-redesign the Dispatch emit lived at iter
-        // end with span extending past the second poll, which made finish_time
-        // events from the second poll fall under the Dispatch bar rather than
-        // a Complete bar of their own — confusing for trace consumers.)
-        if (dispatch_t0 != 0 && try_pushed && l2_swimlane.phase_dispatch_count > 0) {
-            uint64_t dispatch_t1 = get_sys_cnt_aicpu();
-            uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
-            uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
-            debug_assert(pop_hit_delta < (1ULL << 32));
-            debug_assert(pop_miss_delta < (1ULL << 32));
-            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            capture_phase_end(phase_end_local, phase_end_shared);
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, dispatch_t1, l2_swimlane.sched_loop_count,
-                l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
-                static_cast<uint32_t>(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local,
-                phase_end_shared
-            );
-            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                phase_start_local[s] = phase_end_local[s];
-                phase_start_shared[s] = phase_end_shared[s];
-            }
-            _t0_phase = dispatch_t1;
-            l2_swimlane.phase_dispatch_count = 0;
-            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
-            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
-        }
-#endif
-
-        // Phase 4b: early-dispatch onto spare cores, but ONLY when this thread is
-        // otherwise idle — nothing was dispatched this iteration AND no ready work is
-        // queued for any shape. Early-dispatch competes with normal dispatch for
-        // pending slots, so gating on "no ready work" keeps it from delaying a real
-        // ready task; skipping the producer-fanout scan when busy also removes its
-        // per-iteration cost (the discovery walk only runs on genuinely idle passes).
-        bool any_ready_work = try_pushed;
-        for (int s = 0; !any_ready_work && s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-            if (sched_->ready_queues[s].size() > 0 || local_bufs[s].count > 0) any_ready_work = true;
-        }
-#if PTO2_PROFILING
-        bool early_dispatch_record = l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES;
-        uint64_t early_dispatch_t0 = early_dispatch_record ? get_sys_cnt_aicpu() : 0;
-#endif
-        // Skip speculative early-dispatch under PMU: dispatch_ready_tasks already
-        // withholds PENDING dispatch when pmu_active to preserve single-issue PMU
-        // windows, and staging gated work into idle/pending slots would perturb the
-        // same windows.
-        [[maybe_unused]] int32_t staged_count =
-            (pmu_active || any_ready_work) ? 0 : try_speculative_early_dispatch(thread_idx);
-#if PTO2_PROFILING
-        // Emit an EarlyDispatch bar so a staging-dominated iteration is attributed
-        // to early-dispatch rather than disappearing into a blank gap.
-        if (early_dispatch_record && staged_count > 0) {
-            uint64_t early_dispatch_t1 = get_sys_cnt_aicpu();
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::EarlyDispatch, early_dispatch_t0, early_dispatch_t1,
-                sched_l2_swimlane_[thread_idx].sched_loop_count, static_cast<uint32_t>(staged_count)
-            );
-            // prepare_block_for_dispatch bumped phase_dispatch_count while staging;
-            // those blocks belong to this EarlyDispatch bar, so clear the counter
-            // before it leaks into the next Dispatch bar.
-            sched_l2_swimlane_[thread_idx].phase_dispatch_count = 0;
-            // Advance _t0_phase so the following second-poll's Complete bar
-            // starts at the EarlyDispatch end, not before it (otherwise their
-            // spans overlap and the outer-phase mutual-exclusion breaks).
-            _t0_phase = early_dispatch_t1;
-        }
-#endif
-
-        // Second completion poll. dispatch_ready_tasks + try_speculative_early_dispatch
-        // above can take several us in a busy window; a producer block that FINs
-        // during them would otherwise wait for the NEXT iteration's top-of-loop
-        // Phase-1 poll (the ~7us detection latency that delays a flagged
-        // producer's doorbell). Re-polling here observes those FINs immediately,
-        // so the doorbell fires this iteration. Idempotent (the poll is a poll);
-        // we drain deferred releases eagerly to keep the buffer from growing.
-#if PTO2_PROFILING
-        uint64_t complete2_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-        if (tracker.has_any_running_cores()) {
-            int32_t completed_2nd = 0;
-            check_running_cores_for_completion(
-                thread_idx, hank, completed_2nd, cur_thread_completed, made_progress, deferred_release_slot_states,
-                deferred_release_count, local_bufs
-            );
-            if (completed_2nd > 0) {
-#if PTO2_SCHED_PROFILING
-                sched_->tasks_completed.fetch_add(completed_2nd, std::memory_order_relaxed);
-#endif
-                completed_tasks_.fetch_add(completed_2nd, std::memory_order_relaxed);
-                last_progress_count = completed_tasks_.load(std::memory_order_relaxed);
-            }
-            // Eager drain so the second poll can't push deferred_release toward
-            // its cap between idle iterations.
-            while (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP - 96) {
-#if PTO2_SCHED_PROFILING
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-        }
-#if PTO2_PROFILING
-        // Complete2 outer phase: covers second-poll FIN observation. Without
-        // this emit, FIN counts from the second poll would carry over into the
-        // next iter's first-Complete bar and be displayed with a span that
-        // doesn't actually include those FINs' timestamps (visible mismatch
-        // between Complete bar span and per-task finish_time in Worker /
-        // Scheduler View).
-        if (complete2_t0 != 0 && (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
-            uint64_t complete2_t1 = get_sys_cnt_aicpu();
-            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            capture_local_snapshot(phase_end_local);
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::Complete, complete2_t0, complete2_t1,
-                l2_swimlane.sched_loop_count, l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count,
-                /*pop_hit=*/0,
-                /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
-            );
-            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                phase_start_local[s] = phase_end_local[s];
-            }
-            _t0_phase = complete2_t1;
-            l2_swimlane.phase_complete_count = 0;
-            l2_swimlane.phase_subretire_count = 0;
-        }
-
-        // Cycle-counter LAP for the iter tail. Dispatch's emit moved earlier
-        // (see Phase 4 above) so this branch only routes the time accumulator.
-        if (!try_pushed) {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-        } else {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
-        }
-#endif
-
-#if !PTO2_PROFILING
-        (void)try_completed;
-        (void)try_pushed;
-#endif
-
-        if (made_progress) {
-            idle_iterations = 0;
-            last_progress_ts = get_sys_cnt_aicpu();
-        } else {
-#if PTO2_PROFILING
-            uint64_t rel_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && deferred_release_count > 0) ?
-                                  get_sys_cnt_aicpu() :
-                                  0;
-#endif
-            while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-#if PTO2_PROFILING
-            // Release is a distinct operation from the poll scan — emit it with
-            // its own span (Perfetto nests it inside the surrounding poll/idle
-            // run by time-containment) rather than competing with poll for one
-            // per-iteration label.
-            if (rel_t0 != 0) {
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Release, rel_t0, get_sys_cnt_aicpu(),
-                    l2_swimlane.sched_loop_count, /*tasks_processed=*/0
-                );
-            }
-#endif
-            idle_iterations++;
-
-            if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) {
-                LoopAction action = check_idle_fatal_error(thread_idx, header, runtime);
-                if (action == LoopAction::BREAK_LOOP) break;
-            }
-
-            if (idle_iterations % STALL_LOG_INTERVAL == 0) {
-                log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
-            }
-            // Wall-clock budget gate, with two fatal-latch branches:
-            //
-            // 1. Self owns a RUNNING task — first-hand evidence the
-            //    dispatch is stuck. Latch.
-            // 2. No thread anywhere owns a RUNNING task AND tasks remain
-            //    unfinished — the system is in a pre-dispatch / WAIT-only
-            //    deadlock (e.g. dependency cycle). Ownerless idle threads
-            //    are the only observers; let this one latch on the global
-            //    evidence (`completed_tasks_ < total_tasks_` and
-            //    `no_thread_owns_running_task()`).
-            //
-            // Otherwise: a sibling thread owns a RUNNING task but hasn't
-            // hit its own budget yet (typical distributed startup-skew
-            // case) — refresh last_progress_ts and keep spinning. The
-            // STALL diagnostic above still fires periodically so
-            // observability is preserved.
-            if (get_sys_cnt_aicpu() - last_progress_ts > scheduler_timeout_cycles) {
-                bool self_owns = self_owns_running_task(thread_idx);
-                bool global_stuck = !self_owns && total_tasks_ > 0 &&
-                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
-                                    no_thread_owns_running_task();
-                if (self_owns || global_stuck) {
-                    // Latch the error + emergency_shutdown, then break to the
-                    // shared end-of-loop cleanup so the diagnostic buffers get
-                    // flushed to the host. An early return here would strand the
-                    // stuck task's already-dumped inputs and every completed
-                    // task's in/out records in the unflushed per-thread dump
-                    // buffer — exactly the state we need to triage the hang.
-                    timeout_rc = handle_timeout_exit(
-                        thread_idx, header, runtime, idle_iterations, last_progress_count
-#if PTO2_PROFILING
-                        ,
-                        l2_swimlane.sched_start_ts
-#endif
-                    );
-                    break;
-                }
-                last_progress_ts = get_sys_cnt_aicpu();
-            }
-            SPIN_WAIT_HINT();
-#if PTO2_PROFILING
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-            // _t0_phase advances through idle laps so the next emitted
-            // COMPLETE/DISPATCH bar starts at the iter it actually ran in, not
-            // at the start of the preceding idle stretch. The idle/poll time
-            // itself is attributed by the activity-fill below — no blanks.
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-                _t0_phase = _t1;
-            }
-#endif
-        }
-    }
-
-    // Drain any entries left in the deferred-release batch. The in-loop flush
-    // only fires on idle iterations and on buffer-full; a loop exit while the
-    // last iteration made progress can leave entries un-released. Drop them
-    // here so every consumed producer slot completes its on_task_release
-    // regardless of which loop-exit path fired.
-    while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-    }
-
-#if PTO2_PROFILING
-    // Final-drain: emit any pop_hit / pop_miss accrued since the last
-    // dispatch emit (typically the trailing idle loops while waiting for
-    // orchestrator_done_) as a zero-duration synthetic dispatch record so
-    // sum(record.pop_*) reconciles with the run-cumulative counter.
-    // Gate on SCHED_PHASES — at lower levels the phase buffer is never
-    // flushed (see below), so writing this record would be wasted work.
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
-        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
-        debug_assert(final_pop_hit_delta < (1ULL << 32));
-        debug_assert(final_pop_miss_delta < (1ULL << 32));
-        if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
-            uint64_t t_now = get_sys_cnt_aicpu();
-            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            capture_phase_end(phase_end_local, phase_end_shared);
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0,
-                static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta),
-                phase_end_local, phase_end_shared, phase_end_local, phase_end_shared
-            );
-            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
-            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
-        }
-    }
-    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
-#endif
-
-#if PTO2_PROFILING
-    if (l2_swimlane.l2_swimlane_enabled) {
-        l2_swimlane_aicpu_flush(
-            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
-        );
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
-        }
-    }
-#endif
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        dump_args_flush(thread_idx);
-    }
-#endif
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_flush_buffers(
-            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
-        );
-    }
-#endif
-
-    return timeout_rc != 0 ? timeout_rc : cur_thread_completed;
-}
+// Polling redesign: completion / dispatch / cold-path logic is now inlined in
+// scheduler/scheduler_context.h and scheduler/pto_scheduler.h. This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
index 445c46a56..98aff8edb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -19,80 +19,65 @@
 #include "pto_runtime2_types.h"
 #include "spin_hint.h"
 
-// =============================================================================
-// Profiling macros (compile-time gated)
-// =============================================================================
-
-#if PTO2_PROFILING
-#include "aicpu/device_time.h"
-// Accumulated nanoseconds per sub-step
-#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#endif
-
-// =============================================================================
-// Scheduler constants
-// =============================================================================
-
 constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
 
-// Periodic cadence (in idle iterations) for emitting the per-thread STALL
-// diagnostic while no progress is being made. Purely an observability knob,
-// independent of the wall-clock timeout below: small enough to fire a few times
-// before the budget expires, large enough not to flood device_log.
+// PLATFORM_MAX_IDLE_ITERATIONS was removed upstream; this fixed cadence matches
+// a5's equivalent. It only paces per-thread diagnostic logging. The fatal
+// timeout path does not use it: that path is wall-clock based.
 constexpr int32_t STALL_LOG_INTERVAL = 480000;
 constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
 
-// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
-// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS
-// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread
-// diagnostic cadence.
-//
-// Using wall-clock here is load-bearing for distributed runs: with per-thread
-// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
-// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
-// same iteration count. The fast spinner racing ahead and latching fatal
-// kills the slower-but-correct poller mid-poll — see the distributed
-// startup-skew scenario in issue #897.
-//
-// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h).
-// Onboard keeps it below the STARS op-execute and host stream-sync budgets so
-// the AICPU can flush diagnostics before the host-visible timeout chain fires.
-// Sim has no STARS or ACL stream-sync timeout, but uses the same no-progress
-// watchdog shape. See spin_hint.h for the per-variant rationale.
 constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
-constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
-    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES = static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
 constexpr int32_t STALL_DUMP_READY_MAX = 8;
 constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
 constexpr int32_t STALL_DUMP_CORE_MAX = 8;
 constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
 constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
 
-// =============================================================================
-// Control flow signal from cold-path helpers back to the main dispatch loop.
-// =============================================================================
-
-enum class LoopAction : int8_t {
+enum class LoopAction : int8_t  // control-flow signal returned by cold-path helpers to the main loop
+{
     NONE,        // cold path did not trigger; proceed normally
     BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
 };
 
-// =============================================================================
-// Per-core state: one cache line per core to eliminate false sharing
-// and co-locate all hot-path fields for minimal cache misses.
-// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup).
-// =============================================================================
+// Per-thread phase profiling. Accumulates cumulative cycle counts and entry
+// counts for each phase of resolve_and_dispatch's main loop. Dumped once at
+// loop exit via LOG_INFO_V9 — the hot path only does cycle counter math.
+struct alignas(64) SchedulerThreadProfile
+{
+    uint64_t total_cycles{0};       // cumulative cycles for the loop as a whole (presumably all phases — confirm)
+    uint64_t completion_cycles{0};  // cumulative cycles in the completion phase
+    // Sub-phase of completion: time spent INSIDE complete_slot_task, and
+    // count of times it ran (one per subtask completion observed).
+    uint64_t complete_task_cycles{0};
+    uint64_t complete_task_calls{0};
+    // Sub-phase of completion: count of cores scanned per iter (proxy for
+    // cond_ptr read cost; aggregate / completion_iters = avg cores/iter).
+    uint64_t cores_scanned{0};
+    uint64_t async_wait_cycles{0};    // cumulative cycles in the async-wait phase
+    uint64_t drain_wiring_cycles{0};  // cumulative cycles in the drain_wiring phase
+    uint64_t spsc_drain_cycles{0};    // sub-phase of drain_wiring: SPSC → pending FIFO
+    uint64_t pending_poll_cycles{0};  // sub-phase of drain_wiring: pending FIFO → ready
+    uint64_t dummy_drain_cycles{0};   // cumulative cycles in the dummy-drain phase
+    uint64_t dispatch_cycles{0};      // cumulative cycles in the dispatch phase
+    uint64_t idle_spin_cycles{0};     // cumulative cycles in the idle-spin phase
+    uint64_t completion_iters{0};     // entry count: completion phase
+    uint64_t async_wait_iters{0};     // entry count: async-wait phase
+    uint64_t drain_wiring_iters{0};   // entry count: drain_wiring phase
+    uint64_t spsc_drain_iters{0};     // entry count: SPSC drain sub-phase
+    uint64_t pending_poll_iters{0};   // entry count: pending-poll sub-phase
+    uint64_t pending_poll_skipped{0};  // (a) gate hits: poll calls skipped due to no new completions
+    uint64_t dummy_drain_iters{0};  // entry count: dummy-drain phase
+    uint64_t dispatch_iters{0};     // entry count: dispatch phase
+    uint64_t idle_iters{0};         // entry count: idle-spin phase
+    uint64_t total_iters{0};        // presumably total main-loop iterations — confirm at the increment site
+
+    void reset() { *this = SchedulerThreadProfile{}; }  // restore every counter to its {0} default
+};
 
-struct alignas(64) CoreExecState {
+struct alignas(64) CoreExecState
+{
     // --- Hot fields (completion + dispatch, every iteration) ---
     uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
     PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
@@ -103,35 +88,17 @@ struct alignas(64) CoreExecState {
     PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
     PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
     uint8_t pad0_[2];                       // offset 38: alignment padding
-    // Precomputed COND register pointer; resolved once in handshake so the
-    // hot completion poll does a single volatile load instead of recomputing
-    // reg_base + reg_offset(COND) on every iteration.
-    volatile uint32_t *cond_ptr;  // offset 40: precomputed pointer to COND register
-#if PTO2_PROFILING
-    // --- Profiling fields (dispatch path, compile-time gated) ---
-    uint64_t running_dispatch_timestamp;  // offset 48: AICPU dispatch timestamp for running task
-    uint64_t pending_dispatch_timestamp;  // offset 56: AICPU dispatch timestamp for pending task
-#else
+    volatile uint32_t *cond_ptr;            // offset 40: precomputed pointer to COND register
     // --- Cold fields (init/diagnostics only, never in hot path) ---
     int32_t worker_id;          // offset 48: index in runtime.dev.workers[]
     uint32_t physical_core_id;  // offset 52: hardware physical core ID
     CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
     uint8_t pad2_[4];           // offset 60: pad to 64 bytes
-#endif
 };
 static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
 
-// =============================================================================
-// CoreTracker: cluster-based bitmask tracker for idle/running core state.
-//
-// core_states_ encodes per-cluster core idle/running in 3 bits per cluster:
-//   bit i*3   = AIC of cluster i   (1 = idle, 0 = running)
-//   bit i*3+1 = AIV0 of cluster i
-//   bit i*3+2 = AIV1 of cluster i
-// Max 21 clusters per tracker (63 bits in uint64_t).
-// =============================================================================
-
-class alignas(64) CoreTracker {
+class alignas(64) CoreTracker
+{
 public:
     static inline int32_t MAX_CORE_PER_THREAD = 63;
     static constexpr int32_t MAX_CLUSTERS = 63 / 3;
@@ -139,31 +106,69 @@ class alignas(64) CoreTracker {
 public:
     CoreTracker() = default;
 
-    class BitStates {
+    class BitStates
+    {
     public:
         BitStates() = default;
 
         explicit BitStates(uint64_t states) :
-            states_(states) {}
-        void init() { states_ = 0; }
-
-        BitStates operator~() const { return BitStates(~states_); }
-        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
-        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
-        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
-        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
-        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
-        void operator&=(const BitStates &other) { states_ &= other.states_; }
-        void operator|=(const BitStates &other) { states_ |= other.states_; }
-        void operator^=(const BitStates &other) { states_ ^= other.states_; }
-
-        bool has_value() const { return states_ > 0; }
-        int32_t count() const { return __builtin_popcountll(states_); }
-        void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); }
+            states_(states)
+        {}
+        void init()
+        {
+            states_ = 0;
+        }
+
+        BitStates operator~() const
+        {
+            return BitStates(~states_);
+        }
+        BitStates operator&(const BitStates &other) const
+        {
+            return BitStates(states_ & other.states_);
+        }
+        BitStates operator|(const BitStates &other) const
+        {
+            return BitStates(states_ | other.states_);
+        }
+        BitStates operator^(const BitStates &other) const
+        {
+            return BitStates(states_ ^ other.states_);
+        }
+        BitStates operator>>(int32_t offset) const
+        {
+            return BitStates(states_ >> offset);
+        }
+        BitStates operator<<(int32_t offset) const
+        {
+            return BitStates(states_ << offset);
+        }
+        void operator&=(const BitStates &other)
+        {
+            states_ &= other.states_;
+        }
+        void operator|=(const BitStates &other)
+        {
+            states_ |= other.states_;
+        }
+        void operator^=(const BitStates &other)
+        {
+            states_ ^= other.states_;
+        }
+
+        bool has_value() const
+        {
+            return states_ > 0;
+        }
+        int32_t count() const
+        {
+            return __builtin_popcountll(states_);
+        }
 
         // Extract the lowest set bit from mask, clear it, and return its position.
         // Returns -1 if mask is empty.
-        int32_t pop_first() {
+        int32_t pop_first()
+        {
             if (states_ == 0) return -1;
             int32_t pos = __builtin_ctzll(states_);
             states_ &= states_ - 1;
@@ -175,66 +180,73 @@ class alignas(64) CoreTracker {
     };
 
 public:
-    void init(int32_t cluster_count) {
+    void init(int32_t cluster_count)
+    {
         cluster_count_ = cluster_count;
         aic_mask_.init();
         aiv_mask_.init();
         pending_occupied_.init();
-        for (int32_t i = 0; i < cluster_count; i++) {
+        for (int32_t i = 0; i < cluster_count; i++)
+        {
             aic_mask_ |= BitStates(1ULL << (i * 3));
             aiv_mask_ |= BitStates(6ULL << (i * 3));
         }
         core_states_ = aic_mask_ | aiv_mask_;
     }
 
-    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid)
+    {
         core_id_map_[cluster_idx * 3] = aic_wid;
         core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
         core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
     }
 
-    int32_t get_cluster_count() const { return cluster_count_; }
+    int32_t get_cluster_count() const
+    {
+        return cluster_count_;
+    }
 
     // --- Running core queries ---
 
     template <CoreType CT>
-    bool has_running_cores() const {
-        if constexpr (CT == CoreType::AIC) {
-            return ((~core_states_) & aic_mask_).has_value();
-        } else {
-            return ((~core_states_) & aiv_mask_).has_value();
-        }
+    bool has_running_cores() const
+    {
+        if constexpr (CT == CoreType::AIC) return ((~core_states_) & aic_mask_).has_value();
+        else return ((~core_states_) & aiv_mask_).has_value();
     }
 
-    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
+    bool has_any_running_cores() const
+    {
+        return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value();
+    }
 
     template <CoreType CT>
-    int32_t get_running_count() const {
-        if constexpr (CT == CoreType::AIC) {
-            return ((~core_states_) & aic_mask_).count();
-        } else {
-            return ((~core_states_) & aiv_mask_).count();
-        }
+    int32_t get_running_count() const
+    {
+        if constexpr (CT == CoreType::AIC) return ((~core_states_) & aic_mask_).count();
+        else return ((~core_states_) & aiv_mask_).count();
     }
 
     // Return an opaque bitmask for iterating running cores of a given type.
     // Use pop_first() to extract core bit offsets one at a time.
     template <CoreType CT>
-    BitStates get_running_cores() const {
-        if constexpr (CT == CoreType::AIC) {
-            return (~core_states_) & aic_mask_;
-        } else {
-            return (~core_states_) & aiv_mask_;
-        }
+    BitStates get_running_cores() const
+    {
+        if constexpr (CT == CoreType::AIC) return (~core_states_) & aic_mask_;
+        else return (~core_states_) & aiv_mask_;
     }
 
-    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
-    BitStates get_cluster_offset_states() const { return aic_mask_; }
+    BitStates get_all_running_cores() const
+    {
+        return (~core_states_) & (aic_mask_ | aiv_mask_);
+    }
 
     // --- Cluster matching ---
 
-    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {
-        switch (shape) {
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const
+    {
+        switch (shape)
+        {
         case PTO2ResourceShape::AIC:
             return core_states_ & aic_mask_;
         case PTO2ResourceShape::AIV:
@@ -249,143 +261,116 @@ class alignas(64) CoreTracker {
         return BitStates(0ULL);
     }
 
-    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }
-    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
-    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
+    int32_t get_aic_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset];
+    }
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset + 1];
+    }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset + 2];
+    }
 
-    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
-    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
-    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
+    int32_t get_aic_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset;
+    }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset + 1;
+    }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset + 2;
+    }
 
-    bool is_aic_core_idle(int32_t cluster_offset) const {
+    bool is_aic_core_idle(int32_t cluster_offset) const
+    {
         return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
     }
-    bool is_aiv0_core_idle(int32_t cluster_offset) const {
+    bool is_aiv0_core_idle(int32_t cluster_offset) const
+    {
         return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
     }
-    bool is_aiv1_core_idle(int32_t cluster_offset) const {
+    bool is_aiv1_core_idle(int32_t cluster_offset) const
+    {
         return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
     }
 
     // --- State mutation ---
 
     // Toggle bit at the given bit offset (running <-> idle)
-    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
-
-    // --- Pending-occupied tracking ---
-    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
-    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
+    void change_core_state(int32_t bit_offset)
+    {
+        core_states_ ^= BitStates(1ULL << bit_offset);
+    }
 
-    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
-    void clear_pending_occupied(int32_t bit_offset) {
+    void set_pending_occupied(int32_t bit_offset)
+    {
+        pending_occupied_ |= BitStates(1ULL << bit_offset);
+    }
+    void clear_pending_occupied(int32_t bit_offset)
+    {
         pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
     }
 
     // --- Two-phase dispatch queries ---
 
-    // Idle dispatch: returns bit offsets of idle cores for the given shape.
-    // For AIC: 1 bit per cluster (core offset == cluster offset).
-    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
-    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
-    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
-    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
-    // would incorrectly block AIV idle dispatch on the same cluster.
-    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
-        if (shape == PTO2ResourceShape::AIC) {
-            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
-        }
-        if (shape == PTO2ResourceShape::AIV) {
-            return core_states_ & aiv_mask_;
-        }
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const
+    {
+        if (shape == PTO2ResourceShape::AIC) return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        if (shape == PTO2ResourceShape::AIV) return core_states_ & aiv_mask_;
         return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
     }
 
-    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
-    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
-    // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask.
-    enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT };
-
-    // A MIX block must place all cores named by active_mask the same way:
-    // all idle means running placement, all running means pending placement,
-    // and any mixed state is retried later.
-    MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const {
-        BitStates used(0ULL);
-        if (core_mask & PTO2_SUBTASK_MASK_AIC) {
-            used |= BitStates(1ULL << cluster_offset);
-        }
-        if (core_mask & PTO2_SUBTASK_MASK_AIV0) {
-            used |= BitStates(1ULL << (cluster_offset + 1));
-        }
-        if (core_mask & PTO2_SUBTASK_MASK_AIV1) {
-            used |= BitStates(1ULL << (cluster_offset + 2));
-        }
-        if (!used.has_value() || (pending_occupied_ & used).has_value()) {
-            return MixPlacement::REJECT;
-        }
-
-        BitStates idle = core_states_ & used;
-        if (idle.count() == used.count()) {
-            return MixPlacement::RUNNING;
-        }
-        if (!idle.has_value()) {
-            return MixPlacement::PENDING;
-        }
-        return MixPlacement::REJECT;
-    }
-
-    BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const {
-        BitStates result(0ULL);
-        BitStates candidates = get_cluster_offset_states();
-        while (candidates.has_value()) {
-            int32_t cluster_offset = candidates.pop_first();
-            if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) {
-                result |= BitStates(1ULL << cluster_offset);
-            }
-        }
-        return result;
-    }
-
-    int32_t count_mix_running_clusters(uint8_t core_mask) const {
-        return get_mix_running_cluster_offset_states(core_mask).count();
-    }
-
-    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
-        if (shape == PTO2ResourceShape::MIX) {
-            // Shape-level query kept conservative for legacy callers/tests.
-            // The real MIX dispatch path applies active_mask in classify_mix_cluster().
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const
+    {
+        if (shape == PTO2ResourceShape::MIX)
+        {
             // Any core without a pending payload can accept a dispatch (idle or running).
             BitStates available = ~pending_occupied_;
-            BitStates mix_available =
-                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
-            // Pending MIX can only reuse a fully-running cluster. Partially-running clusters
-            // could split one MIX block across immediate and pending placement.
+            BitStates mix_available = (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Exclude fully-idle clusters (handled by IDLE phase) to prevent double-dispatch. NOTE(review): partially-running clusters also qualify here — confirm intended.
             BitStates running = ~core_states_;
-            BitStates cluster_all_running =
-                (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_);
-            return mix_available & cluster_all_running;
-        }
-        if (shape == PTO2ResourceShape::AIC) {
-            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+            BitStates cluster_has_running = (running & aic_mask_) | ((running >> 1) & aic_mask_) | ((running >> 2) & aic_mask_);
+            return mix_available & cluster_has_running;
         }
+        if (shape == PTO2ResourceShape::AIC) return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
         // AIV
         return (~core_states_) & aiv_mask_ & ~pending_occupied_;
     }
 
     // --- Two-phase dispatch unified query ---
 
-    enum class DispatchPhase : uint8_t { IDLE, PENDING };
+    enum class DispatchPhase : uint8_t
+    {
+        IDLE,
+        PENDING
+    };
 
-    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
-        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
-                                                get_pending_core_offset_states(shape);
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const
+    {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) : get_pending_core_offset_states(shape);
     }
 
     // --- Bit offset <-> worker_id mapping ---
 
-    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
+    int32_t get_core_id_by_offset(int32_t offset) const
+    {
+        return core_id_map_[offset];
+    }
 
-    const int32_t *core_ids() const { return core_id_map_; }
-    int32_t core_num() const { return cluster_count_ * 3; }
+    const int32_t *core_ids() const
+    {
+        return core_id_map_;
+    }
+    int32_t core_num() const
+    {
+        return cluster_count_ * 3;
+    }
 
 private:
     int32_t cluster_count_;
@@ -396,12 +381,8 @@ class alignas(64) CoreTracker {
     int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
 };
 
-// =============================================================================
-// SlotTransition: pure event signals from a single register poll.
-// true = event occurred, false = no-op (maintain current state).
-// =============================================================================
-
-struct SlotTransition {
+struct SlotTransition
+{
     bool running_done = false;   // running task completed
     bool pending_done = false;   // pending task completed
     bool running_freed = false;  // running slot data should be released
@@ -409,55 +390,13 @@ struct SlotTransition {
     bool matched = false;        // some case was hit (otherwise skip apply)
 };
 
-// =============================================================================
-// Profiling counters (compile-time gated)
-// =============================================================================
-
-#if PTO2_PROFILING
-struct alignas(64) SchedL2SwimlaneCounters {
-    bool l2_swimlane_enabled{false};
-    uint64_t sched_start_ts{0};
-    uint64_t sched_complete_cycle{0};
-    uint64_t sched_dispatch_cycle{0};
-    uint64_t sched_wiring_cycle{0};
-    uint64_t sched_idle_cycle{0};
-    uint64_t sched_loop_count{0};
-    uint32_t phase_complete_count{0};
-    // Sub-block retires that did NOT finish a slot (SPMD blocks of a multi-block
-    // task retiring one at a time). Counted separately so the Complete-phase
-    // emit can fire on poll iterations that only retired sub-blocks — otherwise
-    // the serial-harvest tail of an SPMD slot is invisible (no slot completes
-    // until the last block, leaving the scheduler lane blank for that window).
-    uint32_t phase_subretire_count{0};
-    uint32_t phase_dispatch_count{0};
-    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
-    // l2_swimlane_level_ >= SCHED_PHASES.
-    uint64_t pop_hit{0};
-    uint64_t pop_miss{0};
-    uint64_t pop_hit_at_last_emit{0};
-    uint64_t pop_miss_at_last_emit{0};
-#if PTO2_SCHED_PROFILING
-    uint32_t phase_wiring_count{0};
-    uint64_t complete_probe_count{0};
-    uint64_t complete_hit_count{0};
-    uint64_t sched_complete_perf_cycle{0};
-    uint64_t sched_dispatch_pop_cycle{0};
-    uint64_t sched_dispatch_setup_cycle{0};
-#endif
-    void reset() { *this = SchedL2SwimlaneCounters{}; }
-};
-#endif
-
-// =============================================================================
-// sync_start drain coordination
-// =============================================================================
-
 // When sync_start_pending != 0, all scheduler threads skip dispatch
 // (only process completions) until the drain worker finishes launching all blocks.
-struct alignas(64) SyncStartDrainState {
-    std::atomic<int32_t> sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
-    std::atomic<int32_t> drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
-    std::atomic<uint32_t> drain_ack_mask{0};       // bit per thread; all-set = all threads reached ack barrier
+struct alignas(64) SyncStartDrainState
+{
+    std::atomic<int32_t> sync_start_pending{0};              // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic<int32_t> drain_worker_elected{0};            // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic<uint32_t> drain_ack_mask{0};                 // bit per thread; all-set = all threads reached ack barrier
     std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
     int32_t _pad[10];
 };
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
index 1561acc56..c0a126a39 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
@@ -8,604 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
- *
- * Lives under runtime/shared/ so it is included in both the host_runtime.so
- * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
- * build (AICPU runs wire_arena_pointers + reset_for_reuse after attach). The
- * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
- * (ops table, scope/submit/dispatch business logic, profiling) stay in their
- * original files and the aicpu build only.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include <limits>
-
-#include "pto_orchestrator.h"
-#include "pto_runtime2.h"
-#include "pto_ring_buffer.h"
-#include "pto_shared_memory.h"
-#include "pto_tensormap.h"
-#include "scheduler/pto_scheduler.h"
-
-static bool sum_ring_heap_sizes(const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], uint64_t *total) {
-    uint64_t sum = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (heap_sizes[r] > std::numeric_limits<uint64_t>::max() - sum) {
-            LOG_ERROR("Total ring heap size overflows uint64_t");
-            return false;
-        }
-        sum += heap_sizes[r];
-    }
-    *total = sum;
-    return true;
-}
-
-// =============================================================================
-// Ready queue
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    // Address the slots region for data writes without storing the pointer in
-    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
-    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        slots_arena[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
-    // ring stores the device address of the SM ring header — pure offset
-    // arithmetic, no SM load.
-    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-#if PTO2_PROFILING
-    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
-    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
-#endif
-
-    // Per-slot SM-side initialization (bind_ring + reset_for_reuse +
-    // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle::
-    // init_header_per_ring so the AICPU performs it during SM reset; host
-    // prebuilt-arena init skips SM access here.
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::reset_for_reuse(
-    void *sm_dev_base, int32_t ring_id, std::atomic<int32_t> *orch_err
-) {
-    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-    dep_deadlock_reported = false;
-    dep_pool.reset_for_reuse(orch_err);
-#if PTO2_PROFILING
-    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
-    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
-#endif
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        dep_pool_capacities[r] = dep_pool_capacity;
-    }
-    return reserve_layout(arena, dep_pool_capacities);
-}
-
-PTO2SchedulerLayout
-PTO2SchedulerState::reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
-    }
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    layout.off_early_dispatch_queue_slots = ready_queue_reserve_layout(arena, PTO2_EARLY_DISPATCH_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacities[r]) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_data_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
-            return false;
-        }
-    }
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_data_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_data_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-    if (!ready_queue_init_data_from_layout(
-            &sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots, PTO2_EARLY_DISPATCH_QUEUE_SIZE
-        )) {
-        return false;
-    }
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacities[r]) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacities[r], orch_err);
-    }
-
-    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::reset_for_reuse(const PTO2SchedulerLayout &layout, void *sm_dev_base) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].reset_for_reuse(sm_dev_base, r, orch_err);
-    }
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        sched->ready_queues[i].reset_for_reuse();
-    }
-    sched->dummy_ready_queue.reset_for_reuse();
-    sched->early_dispatch_queue.reset_for_reuse();
-
-    sched->wiring.queue.reset_for_reuse();
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-    sched->wiring.orch_needs_drain.store(false, std::memory_order_relaxed);
-    sched->wiring.producer_blocked.store(0, std::memory_order_relaxed);
-    sched->async_wait_list.reset_for_reuse();
-    (void)layout;
-}
-
-void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
-    PTO2SchedulerState *sched = this;
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
-    }
-    ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
-    ready_queue_wire_arena_pointers(&sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].dep_pool.base =
-            static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-    }
-    sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-    sched->wiring.queue.destroy();
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-    ready_queue_destroy(&sched->early_dispatch_queue);
-}
-
-// =============================================================================
-// Orchestrator
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        dep_pool_capacities[r] = dep_pool_capacity;
-    }
-    return reserve_layout(arena, task_window_sizes, dep_pool_capacities);
-}
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-    const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
-) {
-    PTO2OrchestratorLayout layout{};
-    // scope_tasks holds every task in the open scope across all rings, so its cap
-    // is the real in-flight budget = sum of the (runtime) per-ring windows. Using
-    // the compile-time PTO2_SCOPE_TASKS_CAP instead under-sized the buffer when
-    // ring_task_window was enlarged past the default (premature SCOPE_TASKS_OVERFLOW)
-    // and over-allocated it when shrunk. See issue #1188.
-    //
-    // Accumulate in int64: each window is validated <= INT32_MAX individually, but
-    // the sum of PTO2_MAX_RING_DEPTH windows can exceed it — a bare int32 sum would
-    // wrap to a negative/undersized cap. Bound the result before narrowing.
-    int64_t scope_tasks_cap = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        always_assert(task_window_sizes[r] > 0);
-        scope_tasks_cap += task_window_sizes[r];
-    }
-    always_assert(scope_tasks_cap <= std::numeric_limits<int32_t>::max());
-    layout.scope_tasks_cap = static_cast<int32_t>(scope_tasks_cap);
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
-    }
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-
-        always_assert(task_window_sizes[r] > 0 && (task_window_sizes[r] & (task_window_sizes[r] - 1)) == 0);
-        const size_t seen_epoch_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE);
-        layout.off_fanin_seen_epoch[r] = arena.reserve(seen_epoch_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks =
-        arena.reserve(static_cast<size_t>(layout.scope_tasks_cap) * sizeof(uintptr_t), alignof(PTO2TaskSlotState *));
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_data_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
-    uint64_t task_window_size
-) {
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        heap_sizes[r] = heap_size;
-        task_window_sizes[r] = task_window_size;
-    }
-    return init_data_from_layout(layout, arena, sm_dev_base, gm_heap, heap_sizes, task_window_sizes);
-}
-
-bool PTO2OrchestratorState::init_data_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
-    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-    orch->gm_heap_base = gm_heap;
-    uint64_t total_heap_size = 0;
-    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
-        return false;
-    }
-    orch->gm_heap_size = total_heap_size;
-    orch->fatal = false;
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    uint64_t heap_offset = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + heap_offset;
-        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
-        auto *slot_states_dev = pto2_sm_layout::ring_slot_states_addr(sm_dev_base, task_window_sizes, r);
-        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
-        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
-
-        orch->rings[r].task_allocator.init(
-            task_descs_dev, static_cast<int32_t>(task_window_sizes[r]), cur_idx_dev, last_alive_dev, ring_heap_base,
-            heap_sizes[r], orch_err, slot_states_dev, 0, static_cast<uint8_t>(r)
-        );
-        heap_offset += heap_sizes[r];
-
-        const size_t fanin_pool_bytes = PTO2_ALIGN_UP(
-            static_cast<size_t>(layout.dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE
-        );
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacities[r], orch_err);
-
-        const size_t seen_epoch_bytes = PTO2_ALIGN_UP(
-            static_cast<size_t>(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE
-        );
-        auto *seen_epoch = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
-        memset(seen_epoch, 0, seen_epoch_bytes);
-        orch->fanin_seen_epoch[r] = seen_epoch;
-    }
-
-    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-bool PTO2OrchestratorState::reset_for_reuse(
-    const PTO2OrchestratorLayout &layout, void *sm_dev_base, void *gm_heap,
-    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    auto *orch = this;
-    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-    orch->gm_heap_base = gm_heap;
-    uint64_t total_heap_size = 0;
-    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
-        return false;
-    }
-    orch->gm_heap_size = total_heap_size;
-    orch->fatal = false;
-    orch->inline_completed_tasks = 0;
-
-    uint32_t next_epoch = orch->fanin_seen_current_epoch + 1;
-    if (next_epoch == 0) {
-        next_epoch = 1;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            memset(
-                orch->fanin_seen_epoch[r], 0,
-                static_cast<size_t>(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t)
-            );
-        }
-    }
-    orch->fanin_seen_current_epoch = next_epoch;
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    uint64_t heap_offset = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + heap_offset;
-        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
-        auto *slot_states_dev = pto2_sm_layout::ring_slot_states_addr(sm_dev_base, task_window_sizes, r);
-        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
-        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
-
-        orch->rings[r].task_allocator.init(
-            task_descs_dev, static_cast<int32_t>(task_window_sizes[r]), cur_idx_dev, last_alive_dev, ring_heap_base,
-            heap_sizes[r], orch_err, slot_states_dev, 0, static_cast<uint8_t>(r)
-        );
-        heap_offset += heap_sizes[r];
-        orch->rings[r].fanin_pool.reset_for_reuse(orch_err);
-    }
-
-    orch->tensor_map.reset_for_reuse(layout.tensor_map);
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-    orch->total_cluster_count = 0;
-    orch->total_aiv_count = 0;
-#if PTO2_PROFILING
-    orch->tasks_submitted = 0;
-    orch->buffers_allocated = 0;
-    orch->bytes_allocated = 0;
-#endif
-    return true;
-}
-
-void PTO2OrchestratorState::wire_arena_pointers(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
-) {
-    auto *orch = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        orch->fanin_seen_epoch[r] = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
-    }
-    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scheduler = scheduler_arg;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-        orch->fanin_seen_epoch[r] = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
-// =============================================================================
-// Top-level runtime arena
-// =============================================================================
-
-PTO2RuntimeArenaLayout
-runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-        heap_sizes[r] = 0;
-        dep_pool_capacities[r] = dep_pool_capacity;
-    }
-    return runtime_reserve_layout(arena, task_window_sizes, heap_sizes, dep_pool_capacities);
-}
-
-PTO2RuntimeArenaLayout runtime_reserve_layout(
-    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
-) {
-    PTO2RuntimeArenaLayout layout{};
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.sizing.task_window_sizes[r] = task_window_sizes[r];
-        layout.sizing.heap_sizes[r] = heap_sizes[r];
-        layout.sizing.dep_pool_capacities[r] = dep_pool_capacities[r];
-    }
-
-    layout.offsets.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes_i32[r] = static_cast<int32_t>(task_window_sizes[r]);
-    }
-    layout.offsets.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities);
-    layout.offsets.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities);
-    layout.offsets.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    layout.offsets.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    layout.offsets.arena_size = arena.total_size();
-    return layout;
-}
-
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
-    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
-) {
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        heap_sizes[r] = heap_size;
-    }
-    return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, 0, gm_heap_dev_base, heap_sizes);
-}
-
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
-    uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.offsets.off_runtime));
-    memset(rt, 0, sizeof(*rt));
-
-    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.offsets.off_sm_handle));
-    memset(sm_wrap, 0, sizeof(*sm_wrap));
-
-    // rt->ops is filled by the AICPU at boot.
-    rt->mode = mode;
-    rt->gm_heap = gm_heap_dev_base;
-    uint64_t total_heap_size = 0;
-    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
-        return nullptr;
-    }
-    rt->gm_heap_size = total_heap_size;
-    rt->gm_heap_owned = false;
-    rt->total_cycles = 0;
-
-    if (!rt->orchestrator.init_data_from_layout(
-            layout.offsets.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.sizing.task_window_sizes
-        )) {
-        return nullptr;
-    }
-    if (!rt->scheduler.init_data_from_layout(layout.offsets.sched, arena, sm_dev_base)) {
-        return nullptr;
-    }
-
-    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.offsets.off_mailbox));
-    memset(mailbox, 0, sizeof(*mailbox));
-
-    return rt;
-}
-
-void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.offsets.off_sm_handle));
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.offsets.off_mailbox));
-    rt->orchestrator.wire_arena_pointers(layout.offsets.orch, arena, &rt->scheduler);
-    rt->scheduler.wire_arena_pointers(layout.offsets.sched, arena);
-}
-
-bool runtime_reset_for_reuse(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
-    (void)arena;
-    if (rt == nullptr) {
-        return false;
-    }
-
-    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
-    rt->total_cycles = 0;
-    rt->gm_heap_owned = false;
-
-    uint64_t total_heap_size = 0;
-    if (!sum_ring_heap_sizes(layout.sizing.heap_sizes, &total_heap_size)) {
-        return false;
-    }
-    rt->gm_heap_size = total_heap_size;
-
-    if (!rt->orchestrator.reset_for_reuse(
-            layout.offsets.orch, rt->sm_handle->sm_base, rt->gm_heap, layout.sizing.heap_sizes,
-            layout.sizing.task_window_sizes
-        )) {
-        return false;
-    }
-    rt->scheduler.reset_for_reuse(layout.offsets.sched, rt->sm_handle->sm_base);
-    return true;
-}
 
-void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
-    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
-    if (!rt) return;
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;
-    rt->sm_handle = nullptr;
-}
+// Polling redesign: init / shared-memory / tensormap / runtime helpers are now
+// header-only (declared inline in the runtime/ headers). This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
index 2ebeb42ed..c0a126a39 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
@@ -8,243 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Shared Memory Implementation
- *
- * Implements shared memory allocation, initialization, and management
- * for Orchestrator-Scheduler communication.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_shared_memory.h"
-#include <inttypes.h>
-#include <stdlib.h>
-#include <string.h>
-#include "common/unified_log.h"
-
-// =============================================================================
-// Size Calculation
-// =============================================================================
-
-uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-    }
-    return calculate_size_per_ring(task_window_sizes);
-}
-
-uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    // Total SM size = offset just past the last ring, from the single source of
-    // truth for the layout (pto2_sm_layout::ring_segment_offsets).
-    return pto2_sm_layout::ring_segment_offsets(task_window_sizes, PTO2_MAX_RING_DEPTH - 1).end;
-}
-
-// =============================================================================
-// Creation and Destruction
-// =============================================================================
-
-void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    char *base = (char *)sm_base;
-    header = (PTO2SharedMemoryHeader *)base;
-
-    // Per-ring descriptors / payloads / slot_states — offsets from the single
-    // source of truth (pto2_sm_layout::ring_segment_offsets), so this setup and
-    // the device-address helpers cannot drift.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto off = pto2_sm_layout::ring_segment_offsets(task_window_sizes, r);
-        auto &ring = header->rings[r];
-        ring.task_descriptors = (PTO2TaskDescriptor *)(base + off.descriptors);
-        ring.task_payloads = (PTO2TaskPayload *)(base + off.payloads);
-        ring.slot_states = (PTO2TaskSlotState *)(base + off.slot_states);
-    }
-}
-
-void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-    }
-    setup_pointers_per_ring(task_window_sizes);
-}
-
-bool PTO2SharedMemoryHandle::init(
-    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
-) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-        heap_sizes[r] = heap_size;
-    }
-    return init_per_ring(sm_base_arg, sm_size_arg, task_window_sizes, heap_sizes);
-}
-
-bool PTO2SharedMemoryHandle::init_per_ring(
-    void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    if (!sm_base_arg || sm_size_arg == 0) return false;
-    if (sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false;
-
-    sm_base = sm_base_arg;
-    sm_size = sm_size_arg;
-    is_owner = false;
-    setup_pointers_per_ring(task_window_sizes);
-    init_header_per_ring(task_window_sizes, heap_sizes);
-    return true;
-}
-
-PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) {
-    const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
-    const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
-    if (arena.commit() == nullptr) return nullptr;
-
-    auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
-    memset(handle, 0, sizeof(*handle));
-    void *buffer = arena.region_ptr(off_buffer);
-    memset(buffer, 0, static_cast<size_t>(buffer_size));
-    if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
-    return handle;
-}
-
-void PTO2SharedMemoryHandle::destroy() {
-    // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
-    // calling destroy on them is a no-op so existing callers stay safe.
-    if (is_owner && sm_base) {
-        free(sm_base);
-        free(this);
-    }
-}
-
-// =============================================================================
-// Initialization
-// =============================================================================
-//
-// no need init data in pool, init pool data when used
-void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-        heap_sizes[r] = heap_size;
-    }
-    init_header_per_ring(task_window_sizes, heap_sizes);
-}
-
-void PTO2SharedMemoryHandle::init_header_per_ring(
-    const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    // Per-ring flow control (start at 0)
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        header->rings[r].fc.init();
-    }
-
-    header->orchestrator_done.store(0, std::memory_order_relaxed);
-
-    // Per-ring layout info
-    uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        header->rings[r].task_window_size = task_window_sizes[r];
-        header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
-        header->rings[r].heap_size = heap_sizes[r];
-        header->rings[r].task_descriptors_offset = offset;
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-
-    header->total_size = sm_size;
-    header->graph_output_ptr.store(0, std::memory_order_relaxed);
-    header->graph_output_size.store(0, std::memory_order_relaxed);
-
-    // Error reporting
-    header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
-    header->sched_error_bitmap.store(0, std::memory_order_relaxed);
-    header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
-    header->sched_error_thread.store(-1, std::memory_order_relaxed);
-    header->sched_stall_detail.store(PTO2_STALL_DETAIL_NONE, std::memory_order_relaxed);
-    header->sched_stall_completed.store(0, std::memory_order_relaxed);
-    header->sched_stall_total.store(0, std::memory_order_relaxed);
-    header->sched_stall_cnt_running.store(0, std::memory_order_relaxed);
-    header->sched_stall_cnt_ready.store(0, std::memory_order_relaxed);
-    header->sched_stall_cnt_waiting.store(0, std::memory_order_relaxed);
-    header->sched_stall_orch_done.store(0, std::memory_order_relaxed);
-    header->sched_stall_task_id.store(-1, std::memory_order_relaxed);
-    header->sched_stall_core.store(-1, std::memory_order_relaxed);
-
-    // No per-slot loop: prepare_task resets each slot when it allocates it, and
-    // the scheduler only scans submitted task_ids [last_task_alive,
-    // current_task_index), so unsubmitted slots are never read. Per-boot reset
-    // is just the header fields above; per-slot state is set lazily at submit.
-}
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2SharedMemoryHandle::print_layout() {
-    if (!header) return;
-
-    PTO2SharedMemoryHeader *h = header;
-
-    LOG_INFO_V0("=== PTO2 Shared Memory Layout ===");
-    LOG_INFO_V0("Base address:       %p", sm_base);
-    LOG_INFO_V0("Total size:         %" PRIu64 " bytes", h->total_size);
-    LOG_INFO_V0("Ring depth:         %d", PTO2_MAX_RING_DEPTH);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        LOG_INFO_V0("Ring %d:", r);
-        LOG_INFO_V0("  task_window_size: %" PRIu64, h->rings[r].task_window_size);
-        LOG_INFO_V0("  heap_size:        %" PRIu64 " bytes", h->rings[r].heap_size);
-        LOG_INFO_V0(
-            "  descriptors_off:  %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset,
-            h->rings[r].task_descriptors_offset
-        );
-        LOG_INFO_V0("  current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire));
-        LOG_INFO_V0("  last_task_alive:  %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire));
-    }
-    LOG_INFO_V0("orchestrator_done:  %d", h->orchestrator_done.load(std::memory_order_acquire));
-    LOG_INFO_V0("Error state:");
-    LOG_INFO_V0("  orch_error_code:    %d", h->orch_error_code.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_code:   %d", h->sched_error_code.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed));
-    LOG_INFO_V0("================================");
-}
-
-bool PTO2SharedMemoryHandle::validate() {
-    if (!sm_base) return false;
-    if (!header) return false;
-
-    PTO2SharedMemoryHeader *h = header;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!h->rings[r].fc.validate(this, r)) return false;
-    }
-
-    return true;
-}
-
-bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const {
-    if (!handle) return false;
-    if (!handle->header) return false;
-    if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false;
-
-    const PTO2SharedMemoryHeader *h = handle->header;
-
-    // Check that offsets are within bounds
-    if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false;
-
-    // Check pointer alignment
-    if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false;
-
-    // Check flow control pointer sanity
-    int32_t current = current_task_index.load(std::memory_order_acquire);
-    int32_t last_alive = last_task_alive.load(std::memory_order_acquire);
-    if (current < 0) return false;
-    if (last_alive < 0) return false;
 
-    return true;
-}
+// Polling redesign: init / shared-memory / tensormap / runtime helpers are now
+// header-only (declared inline in the runtime/ headers). This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
index fb22bb4d2..c0a126a39 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
@@ -8,287 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - TensorMap Implementation
- *
- * Implements TensorMap with ring buffer pool, lazy invalidation,
- * and chain truncation optimization.
- *
- * Key features:
- * 1. O(1) insert at bucket head
- * 2. O(valid_entries) lookup with chain truncation
- * 3. Automatic stale entry cleanup during lookup
- * 4. Periodic explicit cleanup for long chains
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_tensormap.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "common.h"
-#include "common/unified_log.h"
-
-// =============================================================================
-// TensorMap Lookup Chain Length Statistics (compile-time toggle)
-// =============================================================================
-#if PTO2_TENSORMAP_PROFILING
-uint64_t g_lookup_chain_total = 0;
-uint64_t g_lookup_count = 0;
-int32_t g_lookup_chain_max = 0;
-uint64_t g_lookup_overlap_checks = 0;
-uint64_t g_lookup_overlap_hits = 0;
-uint64_t g_insert_count = 0;
-#endif
-
-// =============================================================================
-// Initialization and Destruction
-// =============================================================================
-
-PTO2TensorMapLayout PTO2TensorMap::reserve_layout(
-    DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size,
-    const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    // num_buckets must be a power of two for the hash truncation to work.
-    always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
-
-    PTO2TensorMapLayout layout{};
-    layout.num_buckets = new_num_buckets;
-    layout.pool_size = new_pool_size;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.task_window_sizes[r] = new_task_window_sizes[r];
-    }
-
-    layout.off_buckets = arena.reserve(
-        static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
-    );
-    layout.off_bucket_epochs =
-        arena.reserve(static_cast<size_t>(new_num_buckets) * sizeof(uint32_t), alignof(uint32_t));
-    layout.off_entry_pool =
-        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
-    layout.off_free_entry_list =
-        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.off_task_entry_heads[r] = arena.reserve(
-            static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
-        );
-        layout.off_task_entry_head_epochs[r] =
-            arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(uint32_t), alignof(uint32_t));
-    }
-    return layout;
-}
-
-PTO2TensorMapLayout
-PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
-}
-
-bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
-    num_buckets = layout.num_buckets;
-    pool_size = layout.pool_size;
-
-    // Address arena regions for data writes; do not store these in struct
-    // fields (wire_arena_pointers does that).
-    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    auto *bucket_epochs_arena = static_cast<uint32_t *>(arena.region_ptr(layout.off_bucket_epochs));
-    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-
-    // buckets[]: empty == nullptr.
-    for (int32_t i = 0; i < num_buckets; i++) {
-        buckets_arena[i] = nullptr;
-        bucket_epochs_arena[i] = 0;
-    }
-
-    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
-    // The pool's persistent invariant after init is "bucket_index == -1 means
-    // not linked", set explicitly below.
-    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
-    for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool_arena[i].bucket_index = -1;
-        entry_pool_arena[i].next_in_bucket = nullptr;
-        entry_pool_arena[i].prev_in_bucket = nullptr;
-        entry_pool_arena[i].next_in_task = nullptr;
-        entry_pool_arena[i].prev_in_task = nullptr;
-        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
-    }
-
-    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
-    // only after entries are freed back, so the body of the array stays as 0.
-    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
-
-    next_entry_idx = 0;
-    free_num = 0;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-        auto *head_epochs_arena = static_cast<uint32_t *>(arena.region_ptr(layout.off_task_entry_head_epochs[r]));
-        for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            heads_arena[i] = nullptr;
-            head_epochs_arena[i] = 0;
-        }
-        task_window_sizes[r] = layout.task_window_sizes[r];
-        last_task_alives[r] = 0;
-        last_cleanup[r] = 0;
-    }
-
-    return true;
-}
-
-void PTO2TensorMap::reset_for_reuse(const PTO2TensorMapLayout &layout) {
-    num_buckets = layout.num_buckets;
-    pool_size = layout.pool_size;
-    next_entry_idx = 0;
-    free_num = 0;
-    current_epoch++;
-    if (current_epoch == 0) {
-        current_epoch = 1;
-        memset(bucket_epochs, 0, static_cast<size_t>(layout.num_buckets) * sizeof(uint32_t));
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            memset(task_entry_head_epochs[r], 0, static_cast<size_t>(layout.task_window_sizes[r]) * sizeof(uint32_t));
-        }
-    }
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = layout.task_window_sizes[r];
-        last_task_alives[r] = 0;
-        last_cleanup[r] = 0;
-    }
-}
-
-void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    bucket_epochs = static_cast<uint32_t *>(arena.region_ptr(layout.off_bucket_epochs));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-        task_entry_head_epochs[r] = static_cast<uint32_t *>(arena.region_ptr(layout.off_task_entry_head_epochs[r]));
-    }
-}
-
-void PTO2TensorMap::destroy() {
-    // Arena owns the backing memory; here we only forget our pointers so any
-    // stray post-destroy access trips a nullptr dereference instead of reading
-    // a recycled allocation.
-    buckets = nullptr;
-    bucket_epochs = nullptr;
-    entry_pool = nullptr;
-    free_entry_list = nullptr;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = nullptr;
-        task_entry_head_epochs[r] = nullptr;
-    }
-}
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2TensorMap::print_stats() {
-    int32_t valid = 0;
-    int32_t stale = 0;
-    int32_t empty_buckets = 0;
-    int32_t max_chain = 0;
-    int64_t total_chain = 0;
-    int32_t non_empty_buckets = 0;
-
-    // Count entries
-    for (int32_t i = 0; i < pool_size; i++) {
-        if (entry_pool[i].bucket_index != -1) {
-            if (entry_valid(entry_pool[i])) {
-                valid++;
-            } else {
-                stale++;
-            }
-        }
-    }
-
-    // Count bucket stats
-    for (int32_t b = 0; b < num_buckets; b++) {
-        int32_t chain_len = 0;
-        auto cur_entry = buckets[b];
-
-        while (cur_entry != nullptr) {
-            chain_len++;
-            cur_entry = cur_entry->next_in_bucket;
-        }
-
-        if (chain_len == 0) {
-            empty_buckets++;
-        } else {
-            non_empty_buckets++;
-            total_chain += chain_len;
-            if (chain_len > max_chain) {
-                max_chain = chain_len;
-            }
-        }
-    }
-
-    LOG_INFO_V0("=== TensorMap Statistics ===");
-    LOG_INFO_V0("Pool size:           %d", pool_size);
-    LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx);
-    LOG_INFO_V0("Pool free_num:       %d", free_num);
-    LOG_INFO_V0("Num buckets:         %d", num_buckets);
-    LOG_INFO_V0("Valid entries:       %d", valid);
-    LOG_INFO_V0("Stale entries:       %d", stale);
-    LOG_INFO_V0("Empty buckets:       %d", empty_buckets);
-    LOG_INFO_V0("Max chain len:       %d", max_chain);
-    LOG_INFO_V0("Avg chain len:       %.2f", non_empty_buckets > 0 ? (float)total_chain / non_empty_buckets : 0);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]);
-    }
-    LOG_INFO_V0("============================");
-}
-
-int32_t PTO2TensorMap::valid_count() {
-    int32_t count = 0;
-
-    for (int32_t i = 0; i < pool_size; i++) {
-        if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) {
-            count++;
-        }
-    }
-
-    return count;
-}
-
-void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) {
-    auto ring_id = task_id.ring();
-    auto local_id = task_id.local();
-    sync_validity(ring_id, sm_last_task_alive);
-
-    // Only attempt cleanup when last_task_alive has actually advanced;
-    // otherwise cleanup_retired would empty-loop and we'd spin forever.
-    auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
-    if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) {
-        cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
-        last_cleanup[ring_id] = sm_last_task_alive;
-    }
-}
-
-// =============================================================================
-// TensorMap Lookup Profiling
-// =============================================================================
-#if PTO2_TENSORMAP_PROFILING
-PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
-    PTO2TensorMapProfilingData d;
-    d.lookup_chain_total = g_lookup_chain_total;
-    d.lookup_count = g_lookup_count;
-    d.lookup_chain_max = g_lookup_chain_max;
-    d.overlap_checks = g_lookup_overlap_checks;
-    d.overlap_hits = g_lookup_overlap_hits;
-    d.insert_count = g_insert_count;
 
-    // Reset
-    g_lookup_chain_total = 0;
-    g_lookup_count = 0;
-    g_lookup_chain_max = 0;
-    g_lookup_overlap_checks = 0;
-    g_lookup_overlap_hits = 0;
-    g_insert_count = 0;
-    return d;
-}
-#endif
+// Polling redesign: init / shared-memory / tensormap / runtime helpers are now
+// header-only (declared inline in the runtime/ headers). This translation
+// unit is kept empty to preserve the upstream/main file layout.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 08f86f814..c0a126a39 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -8,109 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Runtime Class - Implementation
- *
- * Device execution and handshake control.
- * Task graph construction is handled by PTO2Runtime.
- */
-
-#include "runtime.h"
-
-#include "common/unified_log.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-
-// =============================================================================
-// Constructor
-// =============================================================================
-
-Runtime::Runtime() {
-    // NOTE: host_api is initialized in InitRuntime() (host-only code)
-    // because the CApi functions don't exist when compiled for device.
-
-    // Initialize the device-copied descriptor (`dev`).
-    memset(dev.workers, 0, sizeof(dev.workers));
-    dev.worker_count = 0;
-    dev.aicpu_thread_num = 1;
-    dev.ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
-    memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus));
-    dev.aicpu_allowed_cpu_count = 0;
-    dev.aicpu_launch_count = 0;
-    dev.serial_orch_sched = false;
-    dev.gm_sm_ptr_ = nullptr;
-    dev.orch_args_storage_.clear();
-    dev.prebuilt_arena_base_ = nullptr;
-    dev.prebuilt_runtime_offset_ = 0;
-    dev.active_callable_id_ = -1;
-    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
-        dev.func_id_to_addr_[i] = 0;
-    }
-
-    // Initialize host-only tail.
-    registered_kernel_count_ = 0;
-}
-
-// =============================================================================
-// Device orchestration
-// =============================================================================
-
-void *Runtime::get_gm_sm_ptr() const { return dev.gm_sm_ptr_; }
-const ChipStorageTaskArgs &Runtime::get_orch_args() const { return dev.orch_args_storage_; }
-void Runtime::set_gm_sm_ptr(void *p) { dev.gm_sm_ptr_ = p; }
-void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { dev.orch_args_storage_ = args; }
-
-void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
-    dev.prebuilt_arena_base_ = arena_base;
-    dev.prebuilt_runtime_offset_ = runtime_off;
-}
-void *Runtime::get_prebuilt_arena_base() const { return dev.prebuilt_arena_base_; }
-size_t Runtime::get_prebuilt_runtime_offset() const { return dev.prebuilt_runtime_offset_; }
-
-void Runtime::set_active_callable_id(int32_t callable_id) { dev.active_callable_id_ = callable_id; }
-
-int32_t Runtime::get_active_callable_id() const { return dev.active_callable_id_; }
-
-uint64_t Runtime::get_function_bin_addr(int func_id) const {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
-    return dev.func_id_to_addr_[func_id];
-}
-
-void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-        return;
-    }
-    if (addr != 0 && dev.func_id_to_addr_[func_id] == 0) {
-        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
-            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
-        } else {
-            LOG_ERROR(
-                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
-                func_id
-            );
-        }
-    }
-    dev.func_id_to_addr_[func_id] = addr;
-}
-
-void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-        return;
-    }
-    dev.func_id_to_addr_[func_id] = addr;
-}
-
-int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
-
-int Runtime::get_registered_kernel_func_id(int index) const {
-    if (index < 0 || index >= registered_kernel_count_) return -1;
-    return registered_kernel_func_ids_[index];
-}
-
-void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
 
-// trb's device image is just the `dev` descriptor (the rest of Runtime is
-// host-only). Mirrors the host_build_graph definition (= sizeof(Runtime)).
-size_t runtime_device_copy_size(const Runtime &) { return sizeof(DeviceRuntimeLaunchDesc); }
+// Polling redesign: init / shared-memory / tensormap / runtime helpers are now
+// header-only (declared inline in the runtime/ headers). This translation
+// unit is kept empty to preserve the upstream/main file layout.

From 66506330ce498eedf08408716a82da9c674db8e3 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 25 Jun 2026 13:13:55 +0200
Subject: [PATCH 02/14] Fix arg order in on_orchestration_done upstream
 overload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream aicpu_executor.cpp calls
  sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks)
but the adapter overload declared its last two parameters in the opposite
order, (total_tasks, thread_idx). As a result total_tasks_ was set to
thread_idx (0/1/2 instead of the real task count), so the scheduler
considered the run complete before the orchestrator had finished. The
test nevertheless still hung and ended in the 507018 timeout.

The fix declares thread_idx and total_tasks in the same positions as
upstream.

The hang persists after this fix: the runtime stalls before
on_orchestration_done is ever reached, and the polling kernel produces no
LOG_INFO_V0 output at all (even with --log-level v9).
Working theory: macro wiring drift between the polling-side
scheduler_context.h and upstream's unified_log + orchestration_api log
entry points. To diagnose further, verify which log_info_v function the
macro resolves to in the built libaicpu_kernel.so.
---
 .../runtime/scheduler/scheduler_context.h                  | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 91e779e02..1e172a109 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -397,9 +397,10 @@ class SchedulerContext
         return rc;
     }
 
-    // Upstream-compatible overload: accepts thread_idx (ignored — polling
-    // scheduler's bookkeeping is thread-agnostic at this point).
-    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t total_tasks, int32_t /*thread_idx*/)
+    // Upstream-compatible overload: signature is (runtime, rt, thread_idx, total_tasks).
+    // thread_idx is ignored — polling scheduler's bookkeeping is thread-agnostic at
+    // this point.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t /*thread_idx*/, int32_t total_tasks)
     {
         on_orchestration_done(runtime, rt, total_tasks);
     }

From b7ddee784ac4fece3bc00bd43380eec52ef72f6b Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 25 Jun 2026 13:21:02 +0200
Subject: [PATCH 03/14] Use per-ring setup_pointers in init_per_ring adapter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

setup_pointers(task_window_sizes[0]) broadcasts the first ring size to all
rings, which is fine only when all rings have the same window. Use the
canonical per-ring setup_pointers_per_ring to handle the general case.

(Test workload happens to use uniform 16384 across all 4 rings, so this
fix is correctness-improving but not the cause of the current hang.)

Hang location now identified: orchestrator thread 3 hangs inside the
(*p_func)(orch_args_cached_) call into the loaded orch SO, i.e. inside
the user-graph submit loop. In aicpu_executor.cpp this lies between the
Ring sizes printout (line 487) and the Orchestrator completed printout
(line 685). Most likely candidates: the prepare_task allocator wait, the
tensormap insert, or submit_task_common's last_consumer_local_id update
path. Next step: add LOG_INFO_V0 calls inside submit_task_common to
bracket which call hangs.
---
 .../tensormap_and_ringbuffer/runtime/pto_shared_memory.h        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index aa8539909..faf5164a2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -223,7 +223,7 @@ struct PTO2SharedMemoryHandle
         sm_base = sm_base_arg;
         sm_size = sm_size_arg;
         is_owner = false;
-        setup_pointers(task_window_sizes[0]);
+        setup_pointers_per_ring(task_window_sizes);
         init_header_per_ring(task_window_sizes, heap_sizes);
         return true;
     }

From 22b2e22c760582b6cae589a2cd002216e0ae8a9f Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 25 Jun 2026 14:25:32 +0200
Subject: [PATCH 04/14] ABI fix: align PTO2RuntimeOps with
 pto_orchestration_api.h

ROOT CAUSE of the runtime hang at vnl-main reconciliation.

The squash-merge took polling-pr-minimal's version of pto_runtime2.h (had
only `log_info_v` in PTO2RuntimeOps) but kept upstream's pto_orchestration_api.h
(declares `log_error`, `log_warn`, `log_debug`, `log_info_v` in that order).

The dlopen'd orchestration SO (the user-graph code) calls
rt->ops->log_info_v(...). It was compiled against the orch-side layout, so
it reads log_info_v from the slot after 6 fn ptrs + 3 logging fn ptrs
(log_error, log_warn, log_debug). The runtime, however, had initialized
rt->ops to s_runtime_ops using the polling-side layout, where log_info_v
comes right after report_fatal. The orch SO therefore followed the wrong
function pointer (a get_tensor_data/set_tensor_data slot) and called it
with logging arguments; that call ran into corrupted state and silently
hung the entire AICPU thread.

Symptom: aicpu_executor.cpp reached "DIAG pre-p_func" then (*p_func) never
returned. orch_diag_step on shared memory stayed at 30. AICore stream
timed out at 507018.

Fix: restore log_error / log_warn / log_debug fields (and their rt_log_error
/ rt_log_warn / rt_log_debug dispatcher implementations, populated in
s_runtime_ops) before log_info_v. ABI now matches between both sides.

Result: paged_attention Case1 PASSED on a 5-round run, dev 7.
Avg Host 208 ms, Avg Device 31 ms.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../runtime/pto_runtime2.h                    | 45 +++++++++++++++++--
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 46b77398d..7eecb777a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -55,7 +55,14 @@ struct PTO2RuntimeOps
     bool (*is_fatal)(PTO2Runtime *rt);
     void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
 
-    // Logging (populated by runtime, called by orchestration)
+    // Logging (populated by runtime, called by orchestration).
+    // ABI-aligned with pto_orchestration_api.h's PTO2RuntimeOps: log_error,
+    // log_warn, log_debug, log_info_v in this exact order. Mismatched layout
+    // here causes the orch SO to call wrong function pointers via rt->ops,
+    // which manifests as silent hangs in the dlopen'd orchestration code.
+    void (*log_error)(const char *func, const char *fmt, ...);
+    void (*log_warn)(const char *func, const char *fmt, ...);
+    void (*log_debug)(const char *func, const char *fmt, ...);
     // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
     void (*log_info_v)(const char *func, int v, const char *fmt, ...);
 
@@ -288,9 +295,36 @@ inline void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *fun
     va_end(args);
 }
 
-// Orchestration-side logging dispatcher: orchestration .so calls
-// LOG_INFO_V<n>(fmt, ...) which routes through this op into the unified log.
-// The verbosity gate lives inside unified_log_info_v.
+// Orchestration-side logging dispatchers: orchestration .so calls
+// LOG_*(fmt, ...) which routes through these ops into the unified log.
+// Verbosity gates live inside the unified_log_* primitives.
+inline void rt_log_error(const char *func, const char *fmt, ...)  // formats fmt + varargs, then forwards to unified_log_error
+{
+    va_list args;
+    va_start(args, fmt);
+    char message[1024];  // fixed-size stack buffer for the formatted text
+    vsnprintf(message, sizeof(message), fmt, args);  // bounded by sizeof(message); longer output is truncated
+    va_end(args);
+    unified_log_error(func, "%s", message);  // text is passed as a "%s" argument, not as the format string
+}
+inline void rt_log_warn(const char *func, const char *fmt, ...)  // formats fmt + varargs, then forwards to unified_log_warn
+{
+    va_list args;
+    va_start(args, fmt);
+    char message[1024];  // fixed-size stack buffer for the formatted text
+    vsnprintf(message, sizeof(message), fmt, args);  // bounded by sizeof(message); longer output is truncated
+    va_end(args);
+    unified_log_warn(func, "%s", message);  // text is passed as a "%s" argument, not as the format string
+}
+inline void rt_log_debug(const char *func, const char *fmt, ...)  // formats fmt + varargs, then forwards to unified_log_debug
+{
+    va_list args;
+    va_start(args, fmt);
+    char message[1024];  // fixed-size stack buffer for the formatted text
+    vsnprintf(message, sizeof(message), fmt, args);  // bounded by sizeof(message); longer output is truncated
+    va_end(args);
+    unified_log_debug(func, "%s", message);  // text is passed as a "%s" argument, not as the format string
+}
 inline void rt_log_info_v(const char *func, int v, const char *fmt, ...)
 {
     va_list args;
@@ -464,6 +498,9 @@ inline const PTO2RuntimeOps s_runtime_ops = {
     .orchestration_done = rt_orchestration_done,
     .is_fatal = is_fatal_impl,
     .report_fatal = rt_report_fatal,
+    .log_error = rt_log_error,
+    .log_warn = rt_log_warn,
+    .log_debug = rt_log_debug,
     .log_info_v = rt_log_info_v,
     .get_tensor_data = get_tensor_data,
     .set_tensor_data = set_tensor_data,

From 3870c6635b6e029e4d2bfa2f4c54437ca885c20e Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 25 Jun 2026 16:49:30 +0200
Subject: [PATCH 05/14] Drop redundant pending FIFO from wiring-queue drain
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The wake-list-only redesign made classify_fanin_state's decision
terminal — each task is either routed to a ready queue (all fanins
met) or registered on a producer's wake_list (first unmet), with no
"leave it for next iter" outcome. With DRAIN_BATCH (30) below the old
POLL_MAX_PER_ITER cap (128), the intermediate FIFO emptied within the
same iter that filled it, paying push_back + pop_front overhead per
task for no carry-over benefit.

drain_wiring_queue now classifies + routes each drained task in-line.
Removes pending_buf/cap/mask/head_idx/tail_idx state, pending_push_back
/pending_pop_front/pending_count/pending_empty helpers, off_pending_buffer
+ pending_capacity layout fields, POLL_MAX_PER_ITER, and the per-iter
PTO2_TASK_WINDOW_SIZE pointer array arena reservation.

Net diff: −81 / +36. Smoke-tested paged_attention Case1 100 rounds
PASS on dev 6; targeted A/B vs HEAD shows consistent small device
improvement (−0.5% on paged_attention C1, −1.5% on
paged_attention_manual_scope C1, −2.6% on alternating_matmul_add C1).
Host time is too noisy on this shared box to claim a host win, but is
neutral-or-better across the three samples.
---
 .../runtime/scheduler/pto_scheduler.h         | 111 ++++++------------
 .../runtime/scheduler/scheduler_context.h     |   2 +-
 .../runtime/scheduler/scheduler_types.h       |   4 +-
 3 files changed, 36 insertions(+), 81 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 684fcdd07..2422344d8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -359,10 +359,8 @@ struct PTO2SchedulerLayout
     size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
     size_t off_dummy_ready_queue_slots;
     size_t off_pending_spsc_buffer;
-    size_t off_pending_buffer;
     uint64_t ready_queue_capacity;
     uint64_t spsc_capacity;
-    uint64_t pending_capacity;
 };
 
 struct PTO2SchedulerState
@@ -422,23 +420,21 @@ struct PTO2SchedulerState
     // the dispatch loop and completed inline -- never goes to AICore.
     PTO2ReadyQueue dummy_ready_queue;
 
-    // Thread 0 exclusive: circular FIFO of tasks awaiting fanin readiness.
-    // SPSC queue receives slot_states from the orchestrator; thread 0 drains
-    // them into the pending ring and polls fanin readiness. Storing the FIFO
-    // out of band (instead of intrusively in PTO2TaskSlotState) keeps the
-    // task struct free of scheduler-private state.
+    // Thread 0 exclusive: bounded SPSC drain → classify → route. The
+    // orchestrator pushes slot_states into the SPSC queue; thread 0 drains
+    // a batch per scheduler iter, classifies each task's fanin state, and
+    // routes terminally — either to a ready queue (all fanins met) or onto
+    // a producer's wake_list (first unmet). No intermediate FIFO: each
+    // drained task is classified once, never re-queued. The wake-list-only
+    // redesign made classify_fanin_state's decision terminal, so the
+    // previously-needed pending FIFO became dead weight on the critical
+    // path.
     struct alignas(64) PendingState
     {
         static constexpr int BACKOFF_LIMIT = 32;
         static constexpr int DRAIN_BATCH = 30;
-        static constexpr int POLL_MAX_PER_ITER = 128;
 
         // --- Thread 0 exclusive ---
-        PTO2TaskSlotState **pending_buf{nullptr};  // capacity slots, arena-owned
-        uint32_t pending_cap{0};
-        uint32_t pending_mask{0};
-        uint32_t pending_head_idx{0};  // next pop
-        uint32_t pending_tail_idx{0};  // next push
         int backoff_counter{0};
         PTO2TaskSlotState *drain_buf[DRAIN_BATCH];
 
@@ -447,9 +443,6 @@ struct PTO2SchedulerState
 
         // --- Orchestrator write, thread 0 read ---
         alignas(64) std::atomic<bool> orch_needs_drain{false};
-
-        uint32_t pending_count() const { return pending_tail_idx - pending_head_idx; }
-        bool pending_empty() const { return pending_tail_idx == pending_head_idx; }
     } wiring;
 
     alignas(64) AsyncWaitList async_wait_list;
@@ -461,22 +454,6 @@ struct PTO2SchedulerState
         else ready_queues[static_cast<int32_t>(shape)].push(slot_state);
     }
 
-    // Append slot to the tail of the pending FIFO.
-    void pending_push_back(PTO2TaskSlotState *s)
-    {
-        wiring.pending_buf[wiring.pending_tail_idx & wiring.pending_mask] = s;
-        wiring.pending_tail_idx++;
-    }
-
-    // Pop the head of the pending FIFO (or nullptr).
-    PTO2TaskSlotState *pending_pop_front()
-    {
-        if (wiring.pending_empty()) return nullptr;
-        PTO2TaskSlotState *s = wiring.pending_buf[wiring.pending_head_idx & wiring.pending_mask];
-        wiring.pending_head_idx++;
-        return s;
-    }
-
     bool fanin_satisfied(PTO2TaskSlotState *s) const
     {
         const PTO2TaskPayload &p = *s->payload;
@@ -488,16 +465,13 @@ struct PTO2SchedulerState
         return true;
     }
 
-    // First-unmet classification used by the pending poll and wake_list
-    // drain. Returns:
+    // First-unmet classification used by the wiring-queue drain and the
+    // wake_list rescan. Returns:
     //   -1: all fanins met (route directly to ready)
     //   ≥0: index of the first unmet fanin (register on its producer's
-    //       wake list). The polling-only path used to distinguish
-    //       "exactly-1 unmet" from "2+ unmet" so the 2+ case could be
-    //       re-queued for the next polling cycle; the wake-list-only
-    //       redesign instead always registers on the first unmet (rescan
-    //       on wake via on_mixed_task_complete), eliminating the
-    //       O(pending × fanin) per-iteration polling cost.
+    //       wake list). Decision is terminal — tasks are never re-queued
+    //       for polling; rescans happen lazily on producer completion via
+    //       on_mixed_task_complete's wake_list drain.
     int classify_fanin_state(PTO2TaskSlotState *s) const
     {
         const PTO2TaskPayload &p = *s->payload;
@@ -536,22 +510,21 @@ struct PTO2SchedulerState
         }
     }
 
-    // Thread 0 entry point: drain SPSC into pending list, then poll pending
-    // for newly-ready tasks. Not-ready tasks rotate to the tail.
-    // Returns >0 if anything moved (SPSC drained OR tasks routed to ready);
-    // 0 signals no productive work.
+    // Thread 0 entry point: drain a bounded batch from the orchestrator's
+    // SPSC queue, then classify+route each drained task terminally. Returns
+    // the count of routed tasks (also the drained count — each drained task
+    // is classified once and never re-queued).
     //
     // Sub-phase timing pointers (optional). If non-null, cumulative cycle/
-    // iteration counters for Stage 1 (SPSC drain) and Stage 2 (pending poll)
+    // iteration counters for Stage 1 (SPSC drain) and Stage 2 (classify+route)
     // are accumulated into them.
     int drain_wiring_queue(bool force_drain = false,
                            uint64_t *spsc_cyc_out = nullptr, uint64_t *spsc_iters_out = nullptr,
                            uint64_t *poll_cyc_out = nullptr, uint64_t *poll_iters_out = nullptr)
     {
-        // Stage 1: drain SPSC → pending FIFO tail
+        // Stage 1: drain SPSC → drain_buf
         uint64_t t0 = spsc_cyc_out ? get_sys_cnt_aicpu() : 0;
         int drained = wiring.queue.pop_batch(wiring.drain_buf, PendingState::DRAIN_BATCH);
-        for (int i = 0; i < drained; i++) pending_push_back(wiring.drain_buf[i]);
         if (spsc_cyc_out)
         {
             *spsc_cyc_out += get_sys_cnt_aicpu() - t0;
@@ -559,7 +532,7 @@ struct PTO2SchedulerState
         }
 
         // Backoff when nothing to do and orchestrator isn't pressing
-        if (drained == 0 && wiring.pending_empty())
+        if (drained == 0)
         {
             if (!force_drain && !wiring.orch_needs_drain.load(std::memory_order_acquire) && wiring.backoff_counter < PendingState::BACKOFF_LIMIT)
             {
@@ -569,21 +542,15 @@ struct PTO2SchedulerState
         }
         wiring.backoff_counter = 0;
 
-        // Stage 2: drain pending FIFO. Each task gets scanned exactly once
-        // here — its state is either "all met → ready_queue" or "register
-        // on the first unmet producer's wake_list and leave". Tasks never
-        // re-enter pending FIFO; re-scans happen lazily on wake via
-        // on_mixed_task_complete's wake_list drain (see below). This
-        // eliminates the O(pending × fanin) per-iteration polling cost
-        // that hurt host time under chains of multi-fanin tasks.
+        // Stage 2: classify + route each drained task in-line. Each task's
+        // state is "all met → ready_queue" or "first unmet → register on that
+        // producer's wake_list". Tasks are scanned exactly once here;
+        // re-scans on producer completion happen via on_mixed_task_complete's
+        // wake_list drain.
         uint64_t t1 = poll_cyc_out ? get_sys_cnt_aicpu() : 0;
-        int routed = 0;
-        int to_visit = static_cast<int>(wiring.pending_count());
-        if (to_visit > PendingState::POLL_MAX_PER_ITER) to_visit = PendingState::POLL_MAX_PER_ITER;
-        for (int i = 0; i < to_visit; i++)
+        for (int i = 0; i < drained; i++)
         {
-            PTO2TaskSlotState *s = pending_pop_front();
-            if (s == nullptr) break;
+            PTO2TaskSlotState *s = wiring.drain_buf[i];
             int state = classify_fanin_state(s);
             if (state < 0)
             {
@@ -591,11 +558,10 @@ struct PTO2SchedulerState
             }
             else
             {
-                // First unmet at index `state`; register on that producer
-                // and leave the FIFO. Producer is in fanin_ring_ids[state]
-                // (may differ from the consumer's ring under multi-ring
-                // fanin). When the producer completes its wake_list drain
-                // will rescan and either push to ready or re-register on
+                // Producer is in fanin_ring_ids[state] (may differ from
+                // the consumer's ring under multi-ring fanin). When the
+                // producer completes, its wake_list drain rescans this
+                // consumer and either pushes to ready or re-registers on
                 // the next unmet producer.
                 int32_t prod_local = s->payload->fanin_local_ids[state];
                 uint8_t prod_ring = s->payload->fanin_ring_ids[state];
@@ -603,7 +569,6 @@ struct PTO2SchedulerState
                 PTO2TaskSlotState *producer = &ring.get_slot_state_by_task_id(prod_local);
                 register_wake(producer, s);
             }
-            routed++;
         }
         if (poll_cyc_out)
         {
@@ -611,7 +576,7 @@ struct PTO2SchedulerState
             if (poll_iters_out) (*poll_iters_out)++;
         }
 
-        return drained + routed;
+        return drained;
     }
 
     int get_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
@@ -735,12 +700,10 @@ struct PTO2SchedulerState
         PTO2SchedulerLayout layout{};
         layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
         layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-        layout.pending_capacity = PTO2_TASK_WINDOW_SIZE;  // bounded by per-ring slot window
 
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
         layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
         layout.off_pending_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-        layout.off_pending_buffer = arena.reserve(layout.pending_capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
         return layout;
     }
 
@@ -758,12 +721,6 @@ struct PTO2SchedulerState
 
         if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_pending_spsc_buffer, layout.spsc_capacity)) return false;
 
-        if (layout.pending_capacity == 0 || (layout.pending_capacity & (layout.pending_capacity - 1)) != 0) return false;
-        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
-        sched->wiring.pending_cap = static_cast<uint32_t>(layout.pending_capacity);
-        sched->wiring.pending_mask = sched->wiring.pending_cap - 1;
-        sched->wiring.pending_head_idx = 0;
-        sched->wiring.pending_tail_idx = 0;
         sched->wiring.backoff_counter = 0;
 
         return true;
@@ -775,7 +732,6 @@ struct PTO2SchedulerState
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
         ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
         sched->wiring.queue.wire_arena_pointers(arena, layout.off_pending_spsc_buffer);
-        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
     }
 
     // Forget per-region pointers; arena owns the backing memory.
@@ -784,7 +740,6 @@ struct PTO2SchedulerState
         PTO2SchedulerState *sched = this;
         for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) sched->ring_sched_states[r].destroy();
         sched->wiring.queue.destroy();
-        sched->wiring.pending_buf = nullptr;
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
         ready_queue_destroy(&sched->dummy_ready_queue);
     }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 1e172a109..0b1907895 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -283,7 +283,7 @@ class SchedulerContext
             }
 
             // Phase 3: Drain wiring queue (thread 0 only). Pass cumulative
-            // sub-phase counters (SPSC drain stage 1 / pending-FIFO poll
+            // sub-phase counters (SPSC drain stage 1 / classify+route
             // stage 2) so drain_wiring_queue accumulates into them.
             if (thread_idx == 0)
             {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
index 98aff8edb..dd3d0ffc4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -57,8 +57,8 @@ struct alignas(64) SchedulerThreadProfile
     uint64_t cores_scanned{0};
     uint64_t async_wait_cycles{0};
     uint64_t drain_wiring_cycles{0};
-    uint64_t spsc_drain_cycles{0};    // sub-phase of drain_wiring: SPSC → pending FIFO
-    uint64_t pending_poll_cycles{0};  // sub-phase of drain_wiring: pending FIFO → ready
+    uint64_t spsc_drain_cycles{0};    // sub-phase of drain_wiring: SPSC pop_batch into drain_buf
+    uint64_t pending_poll_cycles{0};  // sub-phase of drain_wiring: classify+route each drained task
     uint64_t dummy_drain_cycles{0};
     uint64_t dispatch_cycles{0};
     uint64_t idle_spin_cycles{0};

From f46b11df36c15c1d8b50ee6531902553c85c7485 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 25 Jun 2026 16:59:27 +0200
Subject: [PATCH 06/14] Merge fanin_builder loops in submit_task_common
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two consecutive loops over fanin_builder ran back-to-back per task:
the first updated each same-ring producer's last_consumer_local_id
high-water-mark, the second copied (local_id, ring_id) into the
payload's flat arrays. Fold into one loop.

Side benefit: read ring_id from the cache-warm fanin_builder.ring_ids
SOA slice (already populated by append_fanin_or_fail) instead of
dereferencing slot_state->ring_id. Cross-ring fanin iters now skip
the slot dereference entirely; only same-ring iters touch the
producer's slot_state cache line.

A/B on dev 6, 100 rounds trimmed-80:
  alternating_matmul_add C1 — Host 165.2 → 148.7 ms (-10.0%),
                              Device 1.43 → 1.43 ms (flat).
  paged_attention C1       — Host noisy across samples,
                              Device 31.03 → 30.85 ms (-0.6%).

Smaller tests see most of the host benefit (per-task host overhead
dominates their wall time); large device-bound tests see negligible
delta as expected.
---
 .../runtime/pto_orchestrator.h                | 40 +++++++++++--------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index aa8602443..d24242c8f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -610,28 +610,34 @@ inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const L
     task.packed_buffer_base = prepared.alloc_result.packed_base;
     task.packed_buffer_end = prepared.alloc_result.packed_end;
 
-    // Push this consumer's local_id into each producer's last_consumer high-
-    // water-mark, replacing the per-completion fanout_refcount notification.
-    // Reclamation gates on the per-ring completed_watermark reaching this
-    // value. Only update for same-ring fanin: cross-ring consumers live in a
-    // different local_id space, so their id is meaningless to the producer's
-    // ring's watermark. Cross-ring producer slots reclaim on scope_end /
-    // ring wrap instead — acceptable since cross-ring fanin (e.g.
-    // alloc_tensors output) is sparse.
+    // Single pass over fanin_builder:
+    //   - Copy local_id/ring_id into payload so the scheduler can index the
+    //     producer's ring's completion_flags from the consumer side.
+    //   - Push this consumer's local_id into each same-ring producer's
+    //     last_consumer high-water-mark, replacing the per-completion
+    //     fanout_refcount notification. Reclamation gates on the per-ring
+    //     completed_watermark reaching this value. Only update for same-ring
+    //     fanin: cross-ring consumers live in a different local_id space,
+    //     so their id is meaningless to the producer's ring's watermark.
+    //     Cross-ring producer slots reclaim on scope_end / ring wrap instead
+    //     — acceptable since cross-ring fanin (e.g. alloc_tensors output)
+    //     is sparse.
+    // Use fanin_builder.ring_ids[i] (cache-warm SOA slice) for the same-ring
+    // check so cross-ring iters skip the slot_state dereference entirely.
     const uint8_t self_ring = task_id.ring();
     const int32_t self_local = static_cast<int32_t>(task_id.local());
-    for (int32_t i = 0; i < fanin_builder.count; i++)
-    {
-        PTO2TaskSlotState *prod = fanin_builder.slots[i];
-        if (prod->ring_id != self_ring) continue;
-        if (self_local > prod->last_consumer_local_id) prod->last_consumer_local_id = self_local;
-    }
-
     payload.fanin_count = fanin_builder.count;
     for (int32_t i = 0; i < fanin_builder.count; i++)
     {
-        payload.fanin_local_ids[i] = fanin_builder.local_ids[i];
-        payload.fanin_ring_ids[i] = fanin_builder.ring_ids[i];
+        const int32_t local = fanin_builder.local_ids[i];
+        const uint8_t ring = fanin_builder.ring_ids[i];
+        payload.fanin_local_ids[i] = local;
+        payload.fanin_ring_ids[i] = ring;
+        if (ring == self_ring)
+        {
+            PTO2TaskSlotState *prod = fanin_builder.slots[i];
+            if (self_local > prod->last_consumer_local_id) prod->last_consumer_local_id = self_local;
+        }
     }
 
     payload.init(args, result, prepared.alloc_result, layout);

From 9a7836b4a0cf4ec6bb959d85e4a7f2e54b6bfabf Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Fri, 26 Jun 2026 14:02:04 +0200
Subject: [PATCH 07/14] Drop dead defensive store in wake-list drain

In on_mixed_task_complete's wake-list walk, after reading next, the
code was assigning waiter->next_in_wake_list = nullptr. The store has
no observable effect: register_wake() unconditionally overwrites the
field on every re-registration (before the CAS that publishes the
consumer onto a producer's wake list), and reset_for_reuse() clears
it on slot reuse. No reader exists between this point and the next
overwrite/reset.

Saves one store per waiter across every producer completion. Tiny
absolute win (paged_attention ~5K-10K wake-list iters/round) but
removes misleading code: a reader could conclude the
nullptr clear was load-bearing for an ordering or visibility
invariant when it isn't.

Smoke-tested paged_attention Case1 (5 rounds) on dev 6: PASS.
---
 .../runtime/scheduler/pto_scheduler.h                        | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 2422344d8..e38d20128 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -627,7 +627,10 @@ struct PTO2SchedulerState
         while (waiter != nullptr && waiter != WAKE_LIST_SENTINEL)
         {
             PTO2TaskSlotState *next = waiter->next_in_wake_list;
-            waiter->next_in_wake_list = nullptr;
+            // next_in_wake_list left as-is: every re-registration via
+            // register_wake() overwrites the field before the CAS publishes
+            // the consumer, and reset_for_reuse() clears it on slot reuse.
+            // No reader between here and the next overwrite/reset.
             // Fast path: single-fanin waiters were waiting on *us* (the only
             // possible fanin). No rescan needed — push straight to ready.
             // Saves one classify_fanin_state call (a byte read in

From 1916230119c9f528e90861911ebdebb834744cc9 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Mon, 29 Jun 2026 10:47:23 +0200
Subject: [PATCH 08/14] Stub wait_for_orchestration_done_before_dispatch for
 polling design
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream's 8ac5ee80 (feat(runtime): add serial orch sched gate #1176)
introduced a pre-dispatch barrier that aicpu_executor.cpp calls on the
SchedulerContext when runtime->serial_orch_sched is true:

    if (serial_orch_sched_) {
        sched_ctx_.wait_for_orchestration_done_before_dispatch(runtime, thread_idx);
    }

The polling SchedulerContext rewrite (873d83a8) doesn't have the method,
so the build fails:
  aicpu_executor.cpp: error: 'class SchedulerContext' has no member
  named 'wait_for_orchestration_done_before_dispatch'

Add a polling-side stub that matches the upstream semantics: spin until
orchestrator_done_ is set, and on thread 0 drain the wiring SPSC in the
meantime so the orchestrator's per-task pushes don't back-pressure the
bounded wiring queue. Other threads just idle on the flag. The
existing `volatile bool orchestrator_done_` is the right gate — the same
flag the rest of the polling design polls.

Surfaced during the upstream-main → vnl-main rebase onto b1e4bd23.
Same shape as the other rebase-trap fixes (ABI alignment, arg order,
per-ring setup, ctor zero-init) that bit earlier rebases.
---
 .../runtime/scheduler/scheduler_context.h        | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 0b1907895..d9761a62e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -432,6 +432,22 @@ class SchedulerContext
         sched_ = &rt->scheduler;
     }
 
+    // Serial orch->sched mode pre-dispatch gate. Spin until the orchestrator
+    // marks itself done; thread 0 may drain the wiring SPSC in the meantime
+    // so the orchestrator's submit_task pushes don't back-pressure. Other
+    // threads idle on the orchestrator_done_ flag.
+    void wait_for_orchestration_done_before_dispatch(Runtime * /*runtime*/, int32_t thread_idx)
+    {
+        while (!orchestrator_done_)
+        {
+            if (thread_idx == 0 && sched_ != nullptr)
+            {
+                sched_->drain_wiring_queue(false);
+            }
+            SPIN_WAIT_HINT();
+        }
+    }
+
     int32_t aic_count() const
     {
         return aic_count_;

From ebf6d23c5a9d1b60fc706d7d38a205a28b9be50a Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Mon, 29 Jun 2026 15:52:12 +0200
Subject: [PATCH 09/14] Relax on_subtask_complete fetch_add to relaxed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

completed_subtasks is a pure counter — its only readers are this fetch_add
itself (per-subtask completion) and reset_for_reuse's relaxed init. No
other state piggybacks ordering through this atomic, so the acq_rel
ordering was defensive rather than load-bearing.

Producer→consumer publication actually happens downstream in
on_mixed_task_complete via completion_flag.store(release) and
wake_list_head.exchange(acq_rel) — those are the AICPU↔AICPU sync
edges that gate consumer dispatch. The producer→consumer GM data
ordering is handled by AICore-side cache coherence independent of
this counter's memory ordering.

On aarch64 this lowers LDADDAL to LDADD (~1–2 cycles saved per call).
on_subtask_complete runs once per AICore subtask completion — paged_
attention C1 makes ~200K calls per round, so saved cycles aggregate to
sub-ms territory per round, below host trial-to-trial noise but
non-negative on device.

Smoke-tested on dev 0 (5/3/3/3/3/3 rounds, --skip-golden):
  paged_attention C1, alternating_matmul_add C1,
  paged_attention_manual_scope C1, spmd_multiblock_mix C1,
  batch_paged_attention C1, paged_attention_unroll C1 — all PASS.
---
 .../runtime/scheduler/pto_scheduler.h                 | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index e38d20128..d69505c3c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -590,7 +590,16 @@ struct PTO2SchedulerState
 
     bool on_subtask_complete(PTO2TaskSlotState &slot_state)
     {
-        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        // Relaxed fetch_add: completed_subtasks is a pure counter with no
+        // other observers piggybacking state through it. The only readers
+        // are this fetch_add itself (per-subtask) and reset_for_reuse's
+        // relaxed init. Real publication of the producer's completion to
+        // consumer threads happens downstream in on_mixed_task_complete via
+        // completion_flag.store(release) + wake_list_head.exchange(acq_rel)
+        // — those are the AICPU↔AICPU sync edges. The producer→consumer
+        // GM data ordering is handled by AICore-side cache coherence
+        // independent of this counter's ordering.
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_relaxed);
         return (prev + 1) == slot_state.total_required_subtasks;
     }
 

From cc357b8a6c6f590182526fc6ed584520e8740ef4 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Tue, 30 Jun 2026 09:46:14 +0200
Subject: [PATCH 10/14] Fix --enable-l2-swimlane deadlock: missing
 l2_swimlane_aicpu_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The polling-design squash dropped the l2_swimlane_aicpu_init() call in
SchedulerContext::init. Without it, --enable-l2-swimlane 1 runs hit
AICore-side memory corruption that surfaces on the AICPU side as either
orch_error_code=3 (PTO2_ERROR_FLOW_CONTROL_DEADLOCK) on paged_attention
Case1 or sched_error_code=100 (PTO2_ERROR_SCHEDULER_TIMEOUT) on
multi_round_paged_attention Case1 — the failure mode depends on which
AICore op first touches the uninitialized rotation-table slot.

Root cause:
  - Host's init_l2_swimlane allocates device memory, fills the rotation
    table pointer in KernelArgs.l2_swimlane_aicore_rotation_table, and
    sets PROFILING_FLAG_L2_SWIMLANE.
  - AICore kernel.cpp:118-128 stashes &rotation_table[block_idx] at
    entry (the slot pointer, before contents).
  - The contract (aicore_executor.cpp:105-110): AICPU must call
    l2_swimlane_aicpu_init() to populate slot CONTENTS before
    handshake_all_cores() sets aicpu_ready=1.
  - aicore_executor.cpp:110 dereferences the slot once handshake is
    past Phase 1, expecting the buffer pointer to be live.
  - Polling design's SchedulerContext::init calls
    handshake_all_cores() without ever calling l2_swimlane_aicpu_init,
    so the slot stays uninitialized. AICore then writes records to
    garbage GM addresses → AICore stops making progress → AICPU
    eventually times out.

Mirrors the existing host_build_graph runtime
(host_build_graph/aicpu/aicpu_executor.cpp:341-343) which does call
l2_swimlane_aicpu_init before handshake_all_cores.

The init is gated on is_l2_swimlane_enabled() (set per launch from the
PROFILING_FLAG_L2_SWIMLANE bit in KernelArgs), so non-swimlane runs
pay nothing.

This only restores level-1 AICORE_TIMING. Higher levels
(AICPU_TIMING / SCHED_PHASES / ORCH_PHASES) need additional missing
calls (l2_swimlane_aicpu_init_phase, _init_core_assignments, and the
emit calls themselves in dispatch/complete paths) — not added here
since the polling runtime emits no records anyway, so the higher
levels would still produce empty buffers.

Repro before fix:
  python3 examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py \
    --case Case1 --rounds 1 --skip-golden --enable-l2-swimlane 1 \
    --manual include --platform a2a3 --device <N>
  → orch_error_code=3 + AICore 507018 within ~1s.

After fix: PASS. Also re-verified with multi_round_paged_attention
Case1 (was sched_error_code=100) and the non-swimlane smoke set
(paged_attention C1, alternating_matmul_add C1, paged_attention_
manual_scope C1) on dev 0.
---
 .../runtime/scheduler/scheduler_context.h     | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index d9761a62e..b3cffce33 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -12,6 +12,7 @@
 #define SCHEDULER_CONTEXT_H
 
 #include "aicpu/platform_regs.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "common/l2_swimlane_profiling.h"
 #include "scheduler/scheduler_types.h"
 
@@ -96,6 +97,24 @@ class SchedulerContext
         sched_thread_num_ = sched_thread_num;
         regs_ = regs_base;
 
+        // Initialize l2-swimlane buffers BEFORE handshake_all_cores so the
+        // AICore-side rotation table slots are populated when AICore reads
+        // them post-handshake. AICore stashes &rotation_table[block_idx] at
+        // entry; the slot CONTENTS (the actual record buffer pointer it later
+        // dereferences) are written here. handshake_all_cores sets
+        // aicpu_ready=1 per core, which is AICore's signal to proceed past
+        // Phase 1 — once it has the green light, it expects the slot to be
+        // initialized. See the contract comment in
+        // aicore/aicore_executor.cpp:105-110 and the parallel call in
+        // host_build_graph/aicpu/aicpu_executor.cpp:341. Without this call,
+        // --enable-l2-swimlane runs hit AICore-side memory corruption that
+        // surfaces as orch FLOW_CONTROL_DEADLOCK (paged_attention C1) or
+        // sched SCHEDULER_TIMEOUT (multi_round_paged_attention C1) depending
+        // on which AICore op first touches the uninitialized slot.
+        if (is_l2_swimlane_enabled()) {
+            l2_swimlane_aicpu_init(runtime->worker_count);
+        }
+
         // Discover cores and assign to scheduler threads.
         int32_t rc = handshake_all_cores(runtime);
         if (rc != 0) return rc;

From eea72e94fd990f7e21ab638e22e278dffc73c966 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Tue, 30 Jun 2026 10:39:07 +0200
Subject: [PATCH 11/14] Adopt #1199 deferred-init: reset slot_state at submit,
 drop per-boot loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

init_header_per_ring ran an O(sum(task_window_sizes)) loop on every run,
calling bind_ring + reset_for_reuse and resetting active_mask on every slot.
These are slow AICPU writes into device SM; on alternating_matmul_add
Case1 (small workload, default window 65536) it measured ~420 us per
round of Device-wall overhead — visible as a +22.7% Device regression
vs upstream/main, even though Effective (the orch∪sched window) was
17% faster on polling design.

The loop is redundant. The scheduler only ever scans submitted task_ids
[last_task_alive, current_task_index); slots that have not been through
prepare_task are never read. Move the reset into prepare_task on the
slot it just allocated:

- prepare_task() now calls bind_ring(ring_id) + reset_for_reuse() on the
  freshly-allocated slot, right after the slot_state lookup and before
  any other per-submit field assignment. Race-free: the polling
  allocator only returns a slot whose previous incarnation is fully
  consumed (alloc spins until completed_watermark passes its
  last_consumer_local_id), and the slot is not published to any
  scheduler thread until the wiring.queue.push at the end of
  submit_task_common.
- This does NOT rely on the scheduler's eager reset-after-CONSUMED
  (the pto_scheduler.h:401 loop that resets [old_last_alive, last_alive)
  as the watermark advances). That loop only covers contiguous tail
  reclaim within a single run; cross-run, slots a prior run left at
  WAKE_LIST_SENTINEL etc. would otherwise carry stale state into the
  first reuse. Doing the reset at submit time makes every reused slot
  self-clean regardless of prior history.
- init_header_per_ring drops the per-slot loop entirely; it now only
  resets per-boot header fields (flow control, layout, error reporting).
  Per-slot state is established lazily at submit.
- active_mask is already overwritten per-submit at the existing
  prepare_task assignment, so the loop's explicit `active_mask = {}`
  is subsumed.

Cost moves from O(window) every run to O(tasks actually submitted) —
and stays on the device (no host DMA). Mirrors upstream
commit 59bb1ec7 (#1199) for polling design.

Measured (a2a3 onboard dev 2, alternating_matmul_add Case1, 100 rounds
trimmed-80):
- Before (vnl-main HEAD): Device 1462.2 us, Effective 644.0 us
- After:                   Device 1042.1 us, Effective 628.6 us
- Delta:                          -420 us (-28.7%)     -15 us (-2.4%)

Vs upstream/main (59bb1ec7) baseline:
- Device:    1191.8 → 1042.1   (-149.7 us, -12.6% faster than upstream)
- Effective:  776.6 →  628.6   (-148.0 us, -19.1% faster than upstream)

Polling design is now both faster on Device wall and faster on the
orch∪sched window than upstream/main.

Testing:
- paged_attention C1 (5 rounds + --enable-l2-swimlane 1): PASS
- multi_round_paged_attention C1 (5 rounds): PASS — exercises slot
  reuse across multiple runs, which is the worst case for the deferred
  reset (relies on prepare_task touching every reused slot).
- alternating_matmul_add C1 (5 rounds): PASS
---
 .../runtime/pto_orchestrator.h                   | 13 +++++++++++++
 .../runtime/pto_shared_memory.h                  | 16 ++++++----------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index d24242c8f..f22064567 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -507,6 +507,19 @@ inline bool prepare_task(PTO2OrchestratorState *orch, const L0TaskArgs &args, in
 
     prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
 
+    // Reset the fanout/wake-list/subtask bookkeeping for this reuse. The allocator
+    // only returns a slot whose previous incarnation is fully consumed (alloc spins
+    // until completed_watermark passes its last_consumer_local_id), and the slot is
+    // not published to any scheduler thread until the wiring.queue.push at the end
+    // of submit_task_common — so this reset is race-free. Doing it here (not relying
+    // on the scheduler's eager reset-after-CONSUMED, which only covers the
+    // contiguously-reclaimed tail within a single run) makes every reused slot
+    // self-clean across runs, which lets the per-boot SM init skip its O(window)
+    // per-slot loop. bind_ring is slot-invariant but cheap to re-assert on the
+    // already-dirtied cache line. Mirrors upstream #1199.
+    out->slot_state->bind_ring(ring_id);
+    out->slot_state->reset_for_reuse();
+
     out->slot_state->bind_buffers(out->payload, out->task);
 
     // Clear the polling-fast completion byte for the newly-allocated slot.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index faf5164a2..836c731aa 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -306,16 +306,12 @@ struct PTO2SharedMemoryHandle
         header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
         header->sched_error_thread.store(-1, std::memory_order_relaxed);
 
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
-        {
-            auto &ring = header->rings[r];
-            for (uint64_t i = 0; i < task_window_sizes[r]; i++)
-            {
-                ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
-                ring.slot_states[i].reset_for_reuse();
-                ring.slot_states[i].active_mask = ActiveMask{};
-            }
-        }
+        // No per-slot loop: prepare_task() resets each slot when the allocator
+        // hands it out (bind_ring + reset_for_reuse + per-submit fields). The
+        // scheduler only scans submitted task_ids [last_task_alive,
+        // current_task_index), so unsubmitted slots are never read. Cost moves
+        // from O(sum(task_window_sizes)) every run to O(tasks actually
+        // submitted) — and stays on the device. Mirrors upstream #1199.
     }
     void setup_pointers(uint64_t task_window_size)
     {

From 1ca976e37e0a596def85d0fc1a72bb94a14f87ae Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 1 Jul 2026 10:22:11 +0200
Subject: [PATCH 12/14] Update runtime field access for dev.* split

Upstream #1216 split the Runtime class into an offset-0 DeviceRuntimeLaunchDesc
`dev` member + host-only tail; fields that were `runtime->X` are now
`runtime->dev.X`. The polling squash needed one such access in the
l2_swimlane_aicpu_init call.

Rebase-time fix, small enough to land next to the swimlane commit rather
than amending it and rewriting later commit SHAs.
---
 .../runtime/scheduler/scheduler_context.h                       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index b3cffce33..cd3ef0bbf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -112,7 +112,7 @@ class SchedulerContext
         // sched SCHEDULER_TIMEOUT (multi_round_paged_attention C1) depending
         // on which AICore op first touches the uninitialized slot.
         if (is_l2_swimlane_enabled()) {
-            l2_swimlane_aicpu_init(runtime->worker_count);
+            l2_swimlane_aicpu_init(runtime->dev.worker_count);
         }
 
         // Discover cores and assign to scheduler threads.

From 8585efcb6fd8fe04cde5dfd1b39db0f69a1e209b Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 1 Jul 2026 10:52:04 +0200
Subject: [PATCH 13/14] Restore upstream shared/runtime.cpp for out-of-line
 Runtime methods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream #1216 moved the Runtime ctor + accessors out of the header into
shared/runtime.cpp. The polling squash previously stubbed this file empty
(all logic was header-inlined), but with the auto-merged runtime.h now
declaring these methods as prototypes, the stub loses the definitions and
the linker fails with undefined symbols like Runtime::set_orch_args.

Restore the file from upstream so the definitions exist. Since polling
also adopts upstream's dev.* split via runtime.h, the upstream .cpp is
a straight fit — no polling-specific bodies to preserve.
---
 .../runtime/shared/runtime.cpp                | 108 +++++++++++++++++-
 1 file changed, 105 insertions(+), 3 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index c0a126a39..08f86f814 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -8,7 +8,109 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
+/**
+ * Runtime Class - Implementation
+ *
+ * Device execution and handshake control.
+ * Task graph construction is handled by PTO2Runtime.
+ */
+
+#include "runtime.h"
+
+#include "common/unified_log.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// =============================================================================
+// Constructor
+// =============================================================================
+
+Runtime::Runtime() {
+    // NOTE: host_api is initialized in InitRuntime() (host-only code)
+    // because the CApi functions don't exist when compiled for device.
+
+    // Initialize the device-copied descriptor (`dev`).
+    memset(dev.workers, 0, sizeof(dev.workers));
+    dev.worker_count = 0;
+    dev.aicpu_thread_num = 1;
+    dev.ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+    memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus));
+    dev.aicpu_allowed_cpu_count = 0;
+    dev.aicpu_launch_count = 0;
+    dev.serial_orch_sched = false;
+    dev.gm_sm_ptr_ = nullptr;
+    dev.orch_args_storage_.clear();
+    dev.prebuilt_arena_base_ = nullptr;
+    dev.prebuilt_runtime_offset_ = 0;
+    dev.active_callable_id_ = -1;
+    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
+        dev.func_id_to_addr_[i] = 0;
+    }
+
+    // Initialize host-only tail.
+    registered_kernel_count_ = 0;
+}
+
+// =============================================================================
+// Device orchestration
+// =============================================================================
+
+void *Runtime::get_gm_sm_ptr() const { return dev.gm_sm_ptr_; }
+const ChipStorageTaskArgs &Runtime::get_orch_args() const { return dev.orch_args_storage_; }
+void Runtime::set_gm_sm_ptr(void *p) { dev.gm_sm_ptr_ = p; }
+void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { dev.orch_args_storage_ = args; }
+
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    dev.prebuilt_arena_base_ = arena_base;
+    dev.prebuilt_runtime_offset_ = runtime_off;
+}
+void *Runtime::get_prebuilt_arena_base() const { return dev.prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return dev.prebuilt_runtime_offset_; }
+
+void Runtime::set_active_callable_id(int32_t callable_id) { dev.active_callable_id_ = callable_id; }
+
+int32_t Runtime::get_active_callable_id() const { return dev.active_callable_id_; }
+
+uint64_t Runtime::get_function_bin_addr(int func_id) const {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+    return dev.func_id_to_addr_[func_id];
+}
+
+void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    if (addr != 0 && dev.func_id_to_addr_[func_id] == 0) {
+        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
+            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
+        } else {
+            LOG_ERROR(
+                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
+                func_id
+            );
+        }
+    }
+    dev.func_id_to_addr_[func_id] = addr;
+}
+
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    dev.func_id_to_addr_[func_id] = addr;
+}
+
+int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
+
+int Runtime::get_registered_kernel_func_id(int index) const {
+    if (index < 0 || index >= registered_kernel_count_) return -1;
+    return registered_kernel_func_ids_[index];
+}
+
+void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
 
-// Polling redesign: init / shared-memory / tensormap / runtime helpers are now
-// header-only (declared inline in the runtime/ headers). This translation
-// unit is kept empty to preserve the upstream/main file layout.
+// trb's device image is just the `dev` descriptor (the rest of Runtime is
+// host-only). Mirrors the host_build_graph definition (= sizeof(Runtime)).
+size_t runtime_device_copy_size(const Runtime &) { return sizeof(DeviceRuntimeLaunchDesc); }

From e3503171f9d06cf434e4d22ab70037ebb4123045 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 1 Jul 2026 11:36:38 +0200
Subject: [PATCH 14/14] Adopt #1234 arena reuse: allocate epoch arrays +
 implement reset_for_reuse

Two changes to make polling design compatible with upstream #1234
(Support: reuse resident prebuilt runtime arenas):

1. Allocate the epoch-tracking arrays in PTO2TensorMap::reserve_layout,
   zero them in init_data_from_layout, and wire them in
   wire_arena_pointers. The polling squash inherited the
   off_bucket_epochs / off_task_entry_head_epochs layout fields plus
   the bucket_epochs / task_entry_head_epochs pointer fields (which
   are read at lookup/insert), but the allocation/wiring logic that
   populates them was missing. Before this fix bucket_epochs pointed
   at arena offset 0 (default-init'd), producing arbitrary writes to
   the arena base and the fast AICPU crash bisected to #1234 on
   single-round tests.

2. Replace the runtime_reset_for_reuse no-op stub with a real body
   that clears the per-run scalars on PTO2Runtime and then calls the
   new surgical reset_for_reuse methods on the orchestrator (task
   allocators, tensor_map epoch bump, scope/fatal state) and on the
   scheduler (SM header and ring pointers re-derived from sm_base,
   wiring and async-wait state). It does not re-run
   init_data_from_layout, so the arena-internal pointers set by
   wire_arena_pointers are left untouched.
   Upstream #1234 skips the H2D re-upload on bind cache hits and
   relies on this call to scrub the prior run's SM state; without
   it multi-round tests hit stale orchestrator/scheduler state on
   run #2+ and fail with 507018.

Smoke tests passing on a2a3 dev 0:
- paged_attention C1 (1 round + --enable-l2-swimlane 1): PASS
- paged_attention C1 (5 rounds): PASS  # exercises reset_for_reuse
- multi_round_paged_attention C1 (5 rounds): PASS
- alternating_matmul_add C1 (5 rounds): PASS
---
 .../runtime/pto_orchestrator.h                | 23 ++++++++
 .../runtime/pto_ring_buffer.h                 | 12 ++++
 .../runtime/pto_runtime2.h                    | 34 ++++++++++--
 .../runtime/pto_tensormap.h                   | 55 +++++++++++++++++--
 .../runtime/scheduler/pto_scheduler.h         | 27 +++++++++
 5 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index f22064567..fce256ef6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -207,6 +207,29 @@ struct PTO2OrchestratorState
         orch->scheduler = scheduler_arg;
     }
 
+    // Surgical reset for the arena-reuse path (#1234). Only touches state that
+    // mutates across runs — leaves the arena-internal pointers wired by
+    // wire_arena_pointers alone, and skips the O(pool_size + num_buckets)
+    // tensor_map re-init in favour of an epoch bump (bucket_epochs and
+    // task_entry_head_epochs are compared against current_epoch on every
+    // lookup; a bump invalidates all stale entries in O(1)).
+    void reset_for_reuse()
+    {
+        auto *orch = this;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            orch->rings[r].task_allocator.reset_for_reuse();  // per-ring allocator: per-run counters only
+        }
+        orch->tensor_map.reset_for_reuse();  // epoch bump instead of a full re-init
+        orch->scope_tasks_size = 0;
+        orch->scope_stack_top = -1;  // presumably "no open scope" — TODO confirm against the init path
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+        orch->fatal = false;
+        orch->inline_completed_tasks = 0;
+        orch->fanin_seen_current_epoch++;
+        if (orch->fanin_seen_current_epoch == 0) orch->fanin_seen_current_epoch = 1;  // skip 0 on wrap; NOTE(review): no stored markers are cleared here on wrap — verify stale markers cannot alias the new epoch
+    }
+
     // Forget pointers; arena owns the backing buffers.
     void destroy()
     {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 3faef6b4c..2854867f1 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -53,6 +53,18 @@ class PTO2TaskAllocator
         last_alive_seen_ = 0;
     }
 
+    // Surgical reset for arena reuse: just the per-run counters. The
+    // arena-internal pointers (descriptors_, current_index_ptr_, etc.) are
+    // still valid, since wire_arena_pointers was called before this on the
+    // AICPU side.
+    void reset_for_reuse()
+    {
+        local_task_id_ = 0;
+        heap_top_ = 0;
+        heap_tail_ = 0;
+        last_alive_seen_ = 0;
+    }
+
     PTO2TaskAllocResult alloc(int32_t output_size)
     {
         uint64_t aligned_size = output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 7eecb777a..f7d7ccdb0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -246,14 +246,36 @@ inline void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/)
     runtime_destroy(rt);
 }
 
-// Stub for the upstream arena-reuse path (#1234). The polling design has not
-// adopted arena caching / reset_for_reuse machinery; the AICPU reuse path in
-// aicpu_executor still references this symbol, so provide a no-op that
-// succeeds. The init_per_ring call immediately above this in
-// aicpu_executor already resets the SM header for the next run.
+// Upstream arena-reuse path (#1234). On cache hits the host skips the
+// arena re-upload, so the AICPU-side reset here is the only thing that
+// scrubs the previous run's orchestrator/scheduler state. The reset is
+// surgical: it clears the per-run scalars on PTO2Runtime and then calls
+// PTO2OrchestratorState::reset_for_reuse and
+// PTO2SchedulerState::reset_for_reuse. It does NOT re-run
+// init_data_from_layout or wire_arena_pointers, so the previously wired
+// arena-internal pointers are kept as-is. Returns false when rt is null
+// or when the shared-memory base cannot be read from rt->sm_handle.
+// NOTE(review): an earlier revision of this comment reported that the
+// surgical path (with PTO2ReadyQueue::reset_for_reuse being a no-op)
+// tripped a scheduler stall on the second run — confirm that is resolved.
+// The runtime_wire_arena_pointers declaration below is not used here.
+inline void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
 inline bool runtime_reset_for_reuse(DeviceArena & /*arena*/, const PTO2RuntimeArenaLayout & /*layout*/, PTO2Runtime *rt)
 {
-    return rt != nullptr;
+    if (rt == nullptr) return false;  // nothing to reset
+
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;  // per-run scalars on the runtime object itself
+    rt->total_cycles = 0;
+    rt->gm_heap_owned = false;
+
+    void *sm_dev_base = rt->sm_handle ? rt->sm_handle->sm_base : nullptr;  // null handle is treated like a null base
+    if (sm_dev_base == nullptr) return false;  // NOTE: the three fields above have already been reset at this point
+
+    rt->orchestrator.reset_for_reuse();
+    rt->scheduler.reset_for_reuse(sm_dev_base);  // scheduler re-derives its SM header/ring pointers from this base
+
+    return true;
 }
 
 inline void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 366f05666..d3f6601ee 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -364,9 +364,14 @@ struct PTO2TensorMap
         for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.task_window_sizes[r] = new_task_window_sizes[r];
 
         layout.off_buckets = arena.reserve(static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        layout.off_bucket_epochs = arena.reserve(static_cast<size_t>(new_num_buckets) * sizeof(uint32_t), alignof(uint32_t));
         layout.off_entry_pool = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
         layout.off_free_entry_list = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.off_task_entry_heads[r] = arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            layout.off_task_entry_heads[r] = arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+            layout.off_task_entry_head_epochs[r] = arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(uint32_t), alignof(uint32_t));
+        }
         return layout;
     }
 
@@ -383,11 +388,16 @@ struct PTO2TensorMap
         // Address arena regions for data writes; do not store these in struct
         // fields (wire_arena_pointers does that).
         auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        auto *bucket_epochs_arena = static_cast<uint32_t *>(arena.region_ptr(layout.off_bucket_epochs));
         auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
         auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
 
         // buckets[]: empty == nullptr.
-        for (int32_t i = 0; i < num_buckets; i++) buckets_arena[i] = nullptr;
+        for (int32_t i = 0; i < num_buckets; i++)
+        {
+            buckets_arena[i] = nullptr;
+            bucket_epochs_arena[i] = 0;
+        }
 
         memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
         for (int32_t i = 0; i < pool_size; i++)
@@ -410,7 +420,12 @@ struct PTO2TensorMap
         for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
         {
             auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-            for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) heads_arena[i] = nullptr;
+            auto *head_epochs_arena = static_cast<uint32_t *>(arena.region_ptr(layout.off_task_entry_head_epochs[r]));
+            for (int32_t i = 0; i < layout.task_window_sizes[r]; i++)
+            {
+                heads_arena[i] = nullptr;
+                head_epochs_arena[i] = 0;
+            }
             task_window_sizes[r] = layout.task_window_sizes[r];
             last_task_alives[r] = 0;
             last_cleanup[r] = 0;
@@ -422,9 +437,41 @@ struct PTO2TensorMap
     void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena)
     {
         buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        bucket_epochs = static_cast<uint32_t *>(arena.region_ptr(layout.off_bucket_epochs));
         entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
         free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+            task_entry_head_epochs[r] = static_cast<uint32_t *>(arena.region_ptr(layout.off_task_entry_head_epochs[r]));
+        }
+    }
+
+    // Surgical reset for arena reuse (#1234): O(1) epoch bump replaces the
+    // O(num_buckets + pool_size + Σ task_window_sizes) re-init of
+    // init_data_from_layout. bucket_epochs[i] and task_entry_head_epochs[r][i]
+    // are compared against current_epoch on every lookup/insert; bumping
+    // current_epoch invalidates all previous entries logically. Only on the
+    // rare wrap to 0 do we pay the O(num_buckets + Σ window) reset.
+    void reset_for_reuse()
+    {
+        next_entry_idx = 0;  // NOTE(review): entry_pool / free_entry_list contents are not touched here — confirm stale entries are never reachable
+        free_num = 0;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            last_task_alives[r] = 0;
+            last_cleanup[r] = 0;
+        }
+        current_epoch++;  // new epoch for this run
+        if (current_epoch == 0)  // counter wrapped around
+        {
+            current_epoch = 1;  // skip 0, the value the epoch arrays are zero-filled with
+            for (int32_t i = 0; i < num_buckets; i++) bucket_epochs[i] = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                for (int32_t i = 0; i < task_window_sizes[r]; i++) task_entry_head_epochs[r][i] = 0;
+            }
+        }
     }
 
     void destroy()
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index d69505c3c..09f1ab1c0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -83,6 +83,10 @@ struct alignas(64) PTO2ReadyQueue
         return (e >= d) ? (e - d) : 0;
     }
 
+    // No-op: the sequence-based Vyukov MPMC queue is self-consistent across
+    // runs — every slot's sequence at end of run 1 equals the enqueue_pos
+    // where run 2's first push at that slot will land, so pushes/pops resume
+    // seamlessly without any reset.
     void reset_for_reuse() {}
 
     bool push(PTO2TaskSlotState *slot_state)
@@ -755,6 +759,29 @@ struct PTO2SchedulerState
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
         ready_queue_destroy(&sched->dummy_ready_queue);
     }
+
+    // Surgical reset for arena reuse (#1234): resets per-run mutable state
+    // without redoing the O(ready_queue_capacity) buffer-zeroing that
+    // init_data_from_layout does. Ring pointer is re-set from sm_dev_base
+    // since we can't rely on the previous run's value being valid across
+    // arena reuse.
+    void reset_for_reuse(void *sm_dev_base)
+    {
+        PTO2SchedulerState *sched = this;
+        sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);  // header is read at the start of the SM region
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            sched->ring_sched_states[r].ring = pto2_sm_layout::ring_header_addr(sm_dev_base, r);  // re-derive from the SM base
+            sched->ring_sched_states[r].last_task_alive = 0;
+            sched->ring_sched_states[r].advance_lock.store(0, std::memory_order_relaxed);  // NOTE(review): relaxed store assumes no scheduler thread runs during reset — confirm
+        }
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) sched->ready_queues[i].reset_for_reuse();  // a no-op if these are PTO2ReadyQueue
+        sched->dummy_ready_queue.reset_for_reuse();
+        sched->wiring.queue.reset_for_reuse();
+        sched->wiring.backoff_counter = 0;
+        sched->wiring.orch_needs_drain.store(false, std::memory_order_relaxed);
+        sched->async_wait_list.reset_for_reuse();
+    }
 };
 
 // Scheduler cold-path API is declared as PTO2SchedulerState member functions.