hw-native-sys · ChaoWao · Jul 1, 2026 · Jun 30, 2026
diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md
@@ -224,11 +224,11 @@ SchedulerContext owns its own teardown:
 - `SchedulerContext::deinit()` resets every scheduler-owned field —
   per-core states, payloads, sync-start drain coordination
   (`sync_start_pending` / `drain_worker_elected` / `drain_ack_mask` /
-  `pending_task`), task counters, transition flags, worker-id lists,
+  `pending_task`), task counters, worker-id lists,
   core trackers, `cores_total_num_` / `aic_count_` / `aiv_count_`,
   `regs_`, `sched_`, `func_id_to_addr_`, and the `pto2_init_*` flags.
 - `AicpuExecutor::deinit()` calls `sched_ctx_.deinit()` first, then resets
-  only its own fields: `thread_num_`, `sched_thread_num_`, `orch_to_sched_`,
+  only its own fields: `thread_num_`, `sched_thread_num_`,
   `orch_func_`, `orch_args_cached_`, `orch_so_handle_`, `orch_so_path_`,
   `runtime_init_ready_`, and the lifecycle atomics
   (`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`,

diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co
  *                                 to the L2Swimlane base
  * @param num_sched_phase_threads  Number of sched-phase pools to prime
  * @param num_orch_phase_threads   Number of orch-phase pools to prime
- *                                 (typically 1; in orch_to_sched mode =
- *                                 num_aicpu_threads)
+ *                                 (typically 1)
  */
 void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads);
 
@@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase(
  * Must be called once from the orchestrator thread before any
  * l2_swimlane_aicpu_record_orch_phase() calls.
  *
- * @param thread_idx Thread index for the orchestrator (typically num_sched_threads;
- *                   in orch_to_sched mode each scheduler thread sets its own)
+ * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
  */
 void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx);
 

diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE
 static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
 static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
-// Per-thread orch-phase pool/buffer caches (typically one orch thread; in
-// orch_to_sched mode all aicpu threads can write here).
+// Per-thread orch-phase pool/buffer caches (typically one orch thread).
 static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
 static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -114,7 +114,6 @@ struct OrchSoEntry {
 
 struct AicpuExecutor {
     int32_t sched_thread_num_;
-    bool orch_to_sched_{false};
     bool serial_orch_sched_{false};
 
     // ===== Thread management state =====
@@ -206,7 +205,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
     aicpu_thread_num_ = runtime->dev.aicpu_thread_num;
     if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
     sched_thread_num_ = aicpu_thread_num_ - 1;
-    orch_to_sched_ = runtime->dev.orch_to_sched;
     serial_orch_sched_ = runtime->dev.serial_orch_sched;
 
     if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
@@ -215,7 +213,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
         return -1;
     }
 
-    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
@@ -728,8 +726,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
     }
 
-    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+    // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration)
+    if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) {
         // Device orchestration: wait for the primary orchestrator to initialize the SM header
         while (!runtime_init_ready_.load(std::memory_order_acquire)) {
             SPIN_WAIT_HINT();
@@ -802,7 +800,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
 
     aicpu_thread_num_ = 0;
     sched_thread_num_ = 0;
-    orch_to_sched_ = false;
     serial_orch_sched_ = false;
 
     orch_args_cached_.reset();

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -544,19 +544,19 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):
 
 | Method | Phase | Purpose |
 | ------ | ----- | ------- |
-| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
+| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
 | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` |
 | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop |
 | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores; PMU finalize when enabled |
-| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) |
+| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) |
 | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
 | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` |
 
 Private internals are split across three .cpp files by responsibility:
 
 - `scheduler_completion.cpp` — completion polling, drain protocol
 - `scheduler_dispatch.cpp` — task dispatch loop and helpers
-- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
+- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done`
 
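-`AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
+`AicpuExecutor` calls none of `handshake_*`, `assign_*`, or `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.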
 

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
@@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp
 
 Design must preserve the current main runtime architecture:
 
-1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
+1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion.
 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).
 
 ## 5. Terminology
@@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged.
 ### 9.2 Cluster Ownership
 
 1. One cluster must be owned by one scheduler domain/thread at a time.
-2. No split-cluster ownership in either:
-   - initial `assign_cores_to_threads()`
-   - post-orchestrator `reassign_cores_for_all_threads()`
-3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
+2. No split-cluster ownership in `assign_cores_to_threads()`.
+3. Lane occupancy bookkeeping must remain consistent with ownership.
 
 ## 10. Functional Requirements
 

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
 - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
-- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
 - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
 - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
 - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
@@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
 - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
-- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
 - `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
 - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
 - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
 
 **LOG_INFO_V9 count (normal run):**
 
-- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
-- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
+- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
 
 > See the table at the end for concrete counts based on the `paged_attention` example.
 
-**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
+**Example log output** (from `paged_attention`, device 10):
 
 ```text
 Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
@@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000
 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
 ```
 
-**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):
-
-```text
-Thread 3: orch_stage_end=48236915058307
-Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
-Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
-PTO2 total submitted tasks = 13, already executed 13 tasks
-Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
-Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
-Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
-Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
-```
-
-> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).
-
 **Note:**
 
 - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
 - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
-- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.
 
 ---
 
@@ -420,13 +401,13 @@ definitions to runtime headers.
 
 > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).
 
-| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
-| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
-| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
-| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
-| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
-| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
-| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
+| Level | Macro Settings | LOG_INFO_V9 Count | Description |
+| ----- | -------------- | ----------------- | ----------- |
+| 0 | `PTO2_PROFILING=0` | 0 | No timing output |
+| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary |
+| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown |
+| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats |
 
 ---
 

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -477,10 +477,6 @@ static bool stage_device_args(
 // runtime. Behavior-only env reads (no new gates); kept here so the args and
 // image steps stay free of unrelated state.
 static void apply_orch_sched_env_flags(Runtime *runtime) {
-    const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED");
-    runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T');
-    LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
-
     const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED");
     runtime->dev.serial_orch_sched =
         serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T');

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -207,7 +207,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
     // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
     // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
-    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
     int aicpu_thread_num;
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
@@ -223,12 +222,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
-    // Orchestrator-to-scheduler transition control
-    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
-    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
-    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
-    bool orch_to_sched;
-
     // Serial orchestrator -> scheduler start control.
     // When true, scheduler threads wait until orchestration has fully built the
     // task graph before entering resolve_and_dispatch().