diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md
index 88252d2dc..a8fa2e045 100644
--- a/docs/dynamic-linking.md
+++ b/docs/dynamic-linking.md
@@ -224,11 +224,11 @@ SchedulerContext owns its own teardown:
 - `SchedulerContext::deinit()` resets every scheduler-owned field —
   per-core states, payloads, sync-start drain coordination
   (`sync_start_pending` / `drain_worker_elected` / `drain_ack_mask` /
-  `pending_task`), task counters, transition flags, worker-id lists,
+  `pending_task`), task counters, worker-id lists,
   core trackers, `cores_total_num_` / `aic_count_` / `aiv_count_`,
   `regs_`, `sched_`, `func_id_to_addr_`, and the `pto2_init_*` flags.
 - `AicpuExecutor::deinit()` calls `sched_ctx_.deinit()` first, then resets
-  only its own fields: `thread_num_`, `sched_thread_num_`, `orch_to_sched_`,
+  only its own fields: `aicpu_thread_num_`, `sched_thread_num_`, `serial_orch_sched_`,
   `orch_func_`, `orch_args_cached_`, `orch_so_handle_`, `orch_so_path_`,
   `runtime_init_ready_`, and the lifecycle atomics
   (`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`,
diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
index 80da54476..3182eb1c1 100644
--- a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
+++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co
  *                                 to the L2Swimlane base
  * @param num_sched_phase_threads  Number of sched-phase pools to prime
  * @param num_orch_phase_threads   Number of orch-phase pools to prime
- *                                 (typically 1; in orch_to_sched mode =
- *                                 num_aicpu_threads)
+ *                                 (typically 1)
  */
 void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads);
 
@@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase(
  * Must be called once from the orchestrator thread before any
  * l2_swimlane_aicpu_record_orch_phase() calls.
  *
- * @param thread_idx Thread index for the orchestrator (typically num_sched_threads;
- *                   in orch_to_sched mode each scheduler thread sets its own)
+ * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
  */
 void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx);
 
diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
index 5ed92cd61..aafffa8ee 100644
--- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
+++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE
 static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
 static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
-// Per-thread orch-phase pool/buffer caches (typically one orch thread; in
-// orch_to_sched mode all aicpu threads can write here).
+// Per-thread orch-phase pool/buffer caches (only the single orch thread writes here).
 static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
 static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 2d5613ba8..0832c3c62 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -114,7 +114,6 @@ struct OrchSoEntry {
 
 struct AicpuExecutor {
     int32_t sched_thread_num_;
-    bool orch_to_sched_{false};
     bool serial_orch_sched_{false};
 
     // ===== Thread management state =====
@@ -206,7 +205,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
     aicpu_thread_num_ = runtime->dev.aicpu_thread_num;
     if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
     sched_thread_num_ = aicpu_thread_num_ - 1;
-    orch_to_sched_ = runtime->dev.orch_to_sched;
     serial_orch_sched_ = runtime->dev.serial_orch_sched;
 
     if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
@@ -215,7 +213,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
         return -1;
     }
 
-    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
@@ -728,8 +726,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
     }
 
-    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+    // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration)
+    if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) {
         // Device orchestration: wait for the primary orchestrator to initialize the SM header
         while (!runtime_init_ready_.load(std::memory_order_acquire)) {
             SPIN_WAIT_HINT();
@@ -802,7 +800,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
 
     aicpu_thread_num_ = 0;
     sched_thread_num_ = 0;
-    orch_to_sched_ = false;
     serial_orch_sched_ = false;
 
     orch_args_cached_.reset();
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index ef01e3231..ef59d2e98 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -544,11 +544,11 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):
 
 | Method | Phase | Purpose |
 | ------ | ----- | ------- |
-| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
+| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
 | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` |
 | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop |
 | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores; PMU finalize when enabled |
-| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) |
+| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) |
 | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
 | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` |
 
@@ -556,7 +556,7 @@ Private internals are split across three .cpp files by responsibility:
 
 - `scheduler_completion.cpp` — completion polling, drain protocol
 - `scheduler_dispatch.cpp` — task dispatch loop and helpers
-- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
+- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done`
 
 `AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
index 8cba7e90c..50f734fee 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
@@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp
 
 Design must preserve the current main runtime architecture:
 
-1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
+1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion.
 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).
 
 ## 5. Terminology
@@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged.
 ### 9.2 Cluster Ownership
 
 1. One cluster must be owned by one scheduler domain/thread at a time.
-2. No split-cluster ownership in either:
-   - initial `assign_cores_to_threads()`
-   - post-orchestrator `reassign_cores_for_all_threads()`
-3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
+2. No split-cluster ownership in `assign_cores_to_threads()`.
+3. Lane occupancy bookkeeping must remain consistent with ownership.
 
 ## 10. Functional Requirements
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index bd669f365..4467cd7b2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
 - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
-- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
 - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
 - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
 - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
@@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
 - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
-- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
 - `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
 - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
 - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
 
 **LOG_INFO_V9 count (normal run):**
 
-- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
-- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
+- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
 
 > See the table at the end for concrete counts based on the `paged_attention` example.
 
-**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
+**Example log output** (from `paged_attention`, device 10):
 
 ```text
 Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
@@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000
 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
 ```
 
-**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):
-
-```text
-Thread 3: orch_stage_end=48236915058307
-Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
-Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
-PTO2 total submitted tasks = 13, already executed 13 tasks
-Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
-Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
-Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
-Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
-```
-
-> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).
-
 **Note:**
 
 - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
 - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
-- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.
 
 ---
 
@@ -420,13 +401,13 @@ definitions to runtime headers.
 
 > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).
 
-| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
-| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
-| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
-| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
-| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
-| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
-| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
+| Level | Macro Settings | LOG_INFO_V9 Count | Description |
+| ----- | -------------- | ----------------- | ----------- |
+| 0 | `PTO2_PROFILING=0` | 0 | No timing output |
+| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary |
+| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown |
+| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats |
 
 ---
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index dc0ddefb8..7952045ce 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -477,10 +477,6 @@ static bool stage_device_args(
 // runtime. Behavior-only env reads (no new gates); kept here so the args and
 // image steps stay free of unrelated state.
 static void apply_orch_sched_env_flags(Runtime *runtime) {
-    const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED");
-    runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T');
-    LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
-
     const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED");
     runtime->dev.serial_orch_sched =
         serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T');
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index ae6a2446e..5f3109a08 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -207,7 +207,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
     // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
     // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
-    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
     int aicpu_thread_num;
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
@@ -223,12 +222,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
-    // Orchestrator-to-scheduler transition control
-    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
-    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
-    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
-    bool orch_to_sched;
-
     // Serial orchestrator -> scheduler start control.
     // When true, scheduler threads wait until orchestration has fully built the
     // task graph before entering resolve_and_dispatch().
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index cae114427..5bac8297a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -94,21 +94,6 @@ LoopAction SchedulerContext::handle_orchestrator_exit(
     return LoopAction::NONE;
 }
 
-LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
-    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
-    if (!reassigned_.load(std::memory_order_acquire)) {
-        wait_reassign_.fetch_add(1, std::memory_order_release);
-        while (!reassigned_.load(std::memory_order_acquire)) {
-            if (completed_.load(std::memory_order_acquire)) {
-                return LoopAction::BREAK_LOOP;
-            }
-            SPIN_WAIT_HINT();
-        }
-    }
-    cores_released = true;
-    return LoopAction::NONE;
-}
-
 LoopAction
 SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
     if (completed_.load(std::memory_order_acquire)) {
@@ -330,7 +315,7 @@ void SchedulerContext::log_stall_diagnostics(
 
     // CLUSTER lines: one per cluster this thread owns.
     // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
-    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
+    // round-robin assignment in assign_cores_to_threads.
     int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
     for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
         int32_t offset = cli * 3;
@@ -848,76 +833,6 @@ bool SchedulerContext::assign_cores_to_threads() {
     return true;
 }
 
-// =============================================================================
-// Reassign all cores across all threads (sched + orchestrator) after orchestration.
-// =============================================================================
-void SchedulerContext::reassign_cores_for_all_threads() {
-    LOG_INFO_V0(
-        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
-    );
-
-    // Collect running worker_ids from all current trackers
-    bool running_cores[RUNTIME_MAX_WORKER] = {};
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        auto all_running = core_trackers_[i].get_all_running_cores();
-        int32_t bp;
-        while ((bp = all_running.pop_first()) >= 0) {
-            running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
-        }
-    }
-
-    // Count clusters per thread (round-robin across all threads)
-    int32_t cluster_count = aic_count_;
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % aicpu_thread_num_]++;
-    }
-
-    // Re-init all trackers and reset core counts
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    // Assign clusters round-robin and restore running state
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % aicpu_thread_num_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        int32_t cl_idx = cluster_idx_per_thread[t]++;
-        core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
-
-        // init() marks all idle; toggle cores that were running and restore pending_occupied
-        if (running_cores[aic_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3);
-        }
-        if (running_cores[aiv0_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 1);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
-        }
-        if (running_cores[aiv1_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 2);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
-        }
-    }
-
-    // Log final distribution
-    LOG_INFO_V0("Core reassignment complete:");
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        int32_t aic_running = core_trackers_[t].get_running_count<CoreType::AIC>();
-        int32_t aiv_running = core_trackers_[t].get_running_count<CoreType::AIV>();
-        LOG_INFO_V0(
-            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count(), aic_running, aiv_running
-        );
-    }
-    active_sched_threads_ = aicpu_thread_num_;
-}
-
 // =============================================================================
 // Emergency shutdown: broadcast exit signal to every handshake'd core and
 // deinit their AICore register blocks. Idempotent.
@@ -945,9 +860,8 @@ void SchedulerContext::emergency_shutdown(Runtime *runtime) {
 // =============================================================================
 // Lifecycle: init / deinit
 // =============================================================================
-int32_t SchedulerContext::init(
-    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
-) {
+int32_t
+SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) {
     always_assert(runtime != nullptr);
 
     // Zero all per-core execution state before handshake
@@ -956,7 +870,6 @@ int32_t SchedulerContext::init(
     // Wire thread/transition configuration that handshake/assign need to read.
     aicpu_thread_num_ = aicpu_thread_num;
     sched_thread_num_ = sched_thread_num;
-    orch_to_sched_ = orch_to_sched;
     regs_ = regs_base;
 
 #if PTO2_PROFILING
@@ -977,10 +890,9 @@ int32_t SchedulerContext::init(
             // threads as scheduler threads" (see assign_cores_to_threads'
             // active_sched_threads_). Without it, init_phase would prime zero
             // sched pools and all sched_phase emits would silently drop.
-            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
+            const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
             // Orchestration is always single-threaded, so orch-phase is one pool
-            // (ordinal 0) in both modes — see record_orch_phase.
+            // (ordinal 0) — see record_orch_phase.
             const int orch_phase_threads = 1;
             l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads);
         }
@@ -1007,7 +919,7 @@ int32_t SchedulerContext::init(
     // orchestrator thread (see aicpu_executor.cpp).
 #if PTO2_PROFILING
     if (is_dump_args_enabled()) {
-        dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : active_sched_threads_);
+        dump_args_init(active_sched_threads_);
     }
     if (is_pmu_enabled()) {
         pmu_aicpu_init(physical_core_ids_, cores_total_num_);
@@ -1101,11 +1013,6 @@ void SchedulerContext::deinit() {
     completed_tasks_.store(0, std::memory_order_release);
     total_tasks_ = 0;
     orchestrator_done_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
     completed_.store(false, std::memory_order_release);
 
     // Reset core discovery and assignment state
@@ -1114,7 +1021,6 @@ void SchedulerContext::deinit() {
     cores_total_num_ = 0;
     aicpu_thread_num_ = 0;
     sched_thread_num_ = 0;
-    orch_to_sched_ = false;
     active_sched_threads_ = 0;
     for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
         core_trackers_[t] = CoreTracker{};
@@ -1156,7 +1062,7 @@ void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runt
 // and drives the orchestrator → scheduler core transition (or fatal shutdown).
 // =============================================================================
 void SchedulerContext::on_orchestration_done(
-    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
+    Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks
 ) {
 #if PTO2_PROFILING
     if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
@@ -1190,32 +1096,9 @@ void SchedulerContext::on_orchestration_done(
         }
     }
 
-    // Skip core transition on fatal error — cores already shut down above.
-    if (completed_.load(std::memory_order_acquire)) {
-        // Signal transition to unblock scheduler threads waiting at core transition
-        transition_requested_.store(true, std::memory_order_release);
-        reassigned_.store(true, std::memory_order_release);
-    } else if (orch_to_sched_) {
-        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
-        transition_requested_.store(true, std::memory_order_release);
-
-        // Wait for scheduler threads to acknowledge transition request
-        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
-            if (completed_.load(std::memory_order_acquire)) {
-                break;
-            }
-            SPIN_WAIT_HINT();
-        }
-        if (!completed_.load(std::memory_order_acquire)) {
-            reassign_cores_for_all_threads();
-            reassigned_.store(true, std::memory_order_release);
-        }
-    }
-
 #if PTO2_PROFILING
-    // Write core-to-thread mapping AFTER reassignment so the profiling data
-    // reflects the final distribution (all active_sched_threads_, including
-    // former orchestrator threads when orch_to_sched_ is enabled).
+    // Write the core-to-thread mapping so the profiling data reflects the
+    // scheduler threads' final core distribution.
     if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
         l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
         for (int32_t t = 0; t < active_sched_threads_; t++) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index d8669d42b..02962864d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -61,8 +61,7 @@ class SchedulerContext {
     // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
     // - Captures AICore-register base (consumed by handshake_all_cores())
     // Returns 0 on success, negative on failure (handshake / assignment error).
-    int32_t
-    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
+    int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base);
 
     // Reset all SchedulerContext-owned state to its post-construction defaults.
     // Called by AicpuExecutor::deinit() during per-run teardown.
@@ -152,15 +151,9 @@ class SchedulerContext {
     std::atomic<bool> completed_{false};
     uint64_t *func_id_to_addr_{nullptr};
 
-    // --- Core-transition coordination ---
-    std::atomic<bool> transition_requested_{false};
-    std::atomic<int32_t> wait_reassign_{0};
-    std::atomic<bool> reassigned_{false};
-
     // --- Thread/core configuration ---
     int32_t active_sched_threads_{0};
     int32_t sched_thread_num_{0};
-    bool orch_to_sched_{false};
     int32_t aicpu_thread_num_{0};
     int32_t cores_total_num_{0};
 
@@ -190,9 +183,6 @@ class SchedulerContext {
     // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
     bool assign_cores_to_threads();
 
-    // Re-distribute all cores across all threads after orchestration completes.
-    void reassign_cores_for_all_threads();
-
     // Emergency shutdown: broadcast exit signal to every handshake'd core and
     // deinit their AICore register blocks. Idempotent.
     void emergency_shutdown(Runtime *runtime);
@@ -359,8 +349,6 @@ class SchedulerContext {
     __attribute__((noinline, cold)) LoopAction
     handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
 
-    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
-
     __attribute__((noinline, cold)) LoopAction
     check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index a3e58c8d6..4b94d0ae0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -803,8 +803,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
     int32_t deferred_release_count = 0;
 
-    bool cores_released = false;
-
     // PMU runs require single-issue dispatch — overlapping in-flight tasks
     // pollute per-task PMU counters, so skip the PENDING pre-load phase.
     // Cached at function scope: is_pmu_enabled() is extern "C" and the
@@ -916,11 +914,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             if (action == LoopAction::BREAK_LOOP) break;
         }
 
-        if (!cores_released && orch_to_sched_) {
-            LoopAction action = handle_core_transition(cores_released);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
 #if PTO2_PROFILING
         CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
 #endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 8b28e620e..5edaa438b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -37,7 +37,6 @@ Runtime::Runtime() {
     memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus));
     dev.aicpu_allowed_cpu_count = 0;
     dev.aicpu_launch_count = 0;
-    dev.orch_to_sched = false;
     dev.serial_orch_sched = false;
     dev.gm_sm_ptr_ = nullptr;
     dev.slot_states_ptr_ = nullptr;
diff --git a/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h
index 80da54476..3182eb1c1 100644
--- a/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h
+++ b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co
  *                                 to the L2Swimlane base
  * @param num_sched_phase_threads  Number of sched-phase pools to prime
  * @param num_orch_phase_threads   Number of orch-phase pools to prime
- *                                 (typically 1; in orch_to_sched mode =
- *                                 num_aicpu_threads)
+ *                                 (typically 1)
  */
 void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads);
 
@@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase(
  * Must be called once from the orchestrator thread before any
  * l2_swimlane_aicpu_record_orch_phase() calls.
  *
- * @param thread_idx Thread index for the orchestrator (typically num_sched_threads;
- *                   in orch_to_sched mode each scheduler thread sets its own)
+ * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
  */
 void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx);
 
diff --git a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
index 5ed92cd61..aafffa8ee 100644
--- a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
+++ b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE
 static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
 static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
-// Per-thread orch-phase pool/buffer caches (typically one orch thread; in
-// orch_to_sched mode all aicpu threads can write here).
+// Per-thread orch-phase pool/buffer caches (one orch thread).
 static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
 static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 41a97ace8..91488097a 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -114,7 +114,6 @@ struct OrchSoEntry {
 
 struct AicpuExecutor {
     int32_t sched_thread_num_;
-    bool orch_to_sched_{false};
     bool serial_orch_sched_{false};
 
     // ===== Thread management state =====
@@ -208,7 +207,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
     aicpu_thread_num_ = runtime->dev.aicpu_thread_num;
     if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
     sched_thread_num_ = aicpu_thread_num_ - 1;
-    orch_to_sched_ = runtime->dev.orch_to_sched;
     serial_orch_sched_ = runtime->dev.serial_orch_sched;
 
     if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
@@ -217,7 +215,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
         return -1;
     }
 
-    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
@@ -723,8 +721,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
     }
 
-    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+    // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration)
+    if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) {
         // Device orchestration: wait for the primary orchestrator to initialize the SM header
         while (!runtime_init_ready_.load(std::memory_order_acquire)) {
             SPIN_WAIT_HINT();
@@ -797,7 +795,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
 
     aicpu_thread_num_ = 0;
     sched_thread_num_ = 0;
-    orch_to_sched_ = false;
     serial_orch_sched_ = false;
 
     orch_args_cached_.reset();
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index 83ea5c270..a339b178e 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -544,11 +544,11 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):
 
 | Method | Phase | Purpose |
 | ------ | ----- | ------- |
-| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
+| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
 | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` |
 | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop |
 | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores |
-| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) |
+| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) |
 | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
 | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` |
 
@@ -556,7 +556,7 @@ Private internals are split across three .cpp files by responsibility:
 
 - `scheduler_completion.cpp` — completion polling, drain protocol
 - `scheduler_dispatch.cpp` — task dispatch loop and helpers
-- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
+- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done`
 
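-`AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
+`AicpuExecutor` calls neither `handshake_*`, `assign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.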
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
index 8cba7e90c..50f734fee 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
+++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md
@@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp
 
 Design must preserve the current main runtime architecture:
 
-1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
+1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion.
 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).
 
 ## 5. Terminology
@@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged.
 ### 9.2 Cluster Ownership
 
 1. One cluster must be owned by one scheduler domain/thread at a time.
-2. No split-cluster ownership in either:
-   - initial `assign_cores_to_threads()`
-   - post-orchestrator `reassign_cores_for_all_threads()`
-3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
+2. No split-cluster ownership in `assign_cores_to_threads()`.
+3. Lane occupancy bookkeeping must remain consistent with ownership.
 
 ## 10. Functional Requirements
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index 2ef6c1b6a..62a38766e 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
 - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
-- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
 - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
 - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
 - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
@@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
 - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
-- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
 - `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
 - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
 - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
 
 **LOG_INFO_V9 count (normal run):**
 
-- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
-- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
+- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
 
 > See the table at the end for concrete counts based on the `paged_attention` example.
 
-**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
+**Example log output** (from `paged_attention`, device 10):
 
 ```text
 Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
@@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000
 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
 ```
 
-**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):
-
-```text
-Thread 3: orch_stage_end=48236915058307
-Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
-Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
-PTO2 total submitted tasks = 13, already executed 13 tasks
-Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
-Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
-Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
-Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
-```
-
-> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).
-
 **Note:**
 
 - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
 - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
-- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.
 
 ---
 
@@ -390,13 +371,13 @@ definitions to runtime headers.
 
 > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).
 
-| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
-| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
-| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
-| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
-| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
-| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
-| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
+| Level | Macro Settings | LOG_INFO_V9 Count | Description |
+| ----- | -------------- | ----------------- | ----------- |
+| 0 | `PTO2_PROFILING=0` | 0 | No timing output |
+| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary |
+| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown |
+| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats |
 
 ---
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index dc0ddefb8..7952045ce 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -477,10 +477,6 @@ static bool stage_device_args(
 // runtime. Behavior-only env reads (no new gates); kept here so the args and
 // image steps stay free of unrelated state.
 static void apply_orch_sched_env_flags(Runtime *runtime) {
-    const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED");
-    runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T');
-    LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
-
     const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED");
     runtime->dev.serial_orch_sched =
         serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T');
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 6d2bc08a0..22d965231 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -215,7 +215,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
     // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
     // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
-    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
     int aicpu_thread_num;
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
@@ -237,12 +236,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
-    // Orchestrator-to-scheduler transition control
-    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
-    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
-    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
-    bool orch_to_sched;
-
     // Serial orchestrator -> scheduler start control.
     // When true, scheduler threads wait until orchestration has fully built the
     // task graph before entering resolve_and_dispatch().
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 517a40c8b..20587aeaf 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -94,21 +94,6 @@ LoopAction SchedulerContext::handle_orchestrator_exit(
     return LoopAction::NONE;
 }
 
-LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
-    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
-    if (!reassigned_.load(std::memory_order_acquire)) {
-        wait_reassign_.fetch_add(1, std::memory_order_release);
-        while (!reassigned_.load(std::memory_order_acquire)) {
-            if (completed_.load(std::memory_order_acquire)) {
-                return LoopAction::BREAK_LOOP;
-            }
-            SPIN_WAIT_HINT();
-        }
-    }
-    cores_released = true;
-    return LoopAction::NONE;
-}
-
 LoopAction
 SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
     if (completed_.load(std::memory_order_acquire)) {
@@ -327,7 +312,7 @@ void SchedulerContext::log_stall_diagnostics(
 
     // CLUSTER lines: one per cluster this thread owns.
     // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
-    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
+    // round-robin assignment in assign_cores_to_threads.
     int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
     for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
         int32_t offset = cli * 3;
@@ -851,76 +836,6 @@ bool SchedulerContext::assign_cores_to_threads() {
     return true;
 }
 
-// =============================================================================
-// Reassign all cores across all threads (sched + orchestrator) after orchestration.
-// =============================================================================
-void SchedulerContext::reassign_cores_for_all_threads() {
-    LOG_INFO_V0(
-        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
-    );
-
-    // Collect running worker_ids from all current trackers
-    bool running_cores[RUNTIME_MAX_WORKER] = {};
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        auto all_running = core_trackers_[i].get_all_running_cores();
-        int32_t bp;
-        while ((bp = all_running.pop_first()) >= 0) {
-            running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
-        }
-    }
-
-    // Count clusters per thread (round-robin across all threads)
-    int32_t cluster_count = aic_count_;
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % aicpu_thread_num_]++;
-    }
-
-    // Re-init all trackers and reset core counts
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    // Assign clusters round-robin and restore running state
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % aicpu_thread_num_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        int32_t cl_idx = cluster_idx_per_thread[t]++;
-        core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
-
-        // init() marks all idle; toggle cores that were running and restore pending_occupied
-        if (running_cores[aic_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3);
-        }
-        if (running_cores[aiv0_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 1);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
-        }
-        if (running_cores[aiv1_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 2);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
-        }
-    }
-
-    // Log final distribution
-    LOG_INFO_V0("Core reassignment complete:");
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        int32_t aic_running = core_trackers_[t].get_running_count<CoreType::AIC>();
-        int32_t aiv_running = core_trackers_[t].get_running_count<CoreType::AIV>();
-        LOG_INFO_V0(
-            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count(), aic_running, aiv_running
-        );
-    }
-    active_sched_threads_ = aicpu_thread_num_;
-}
-
 // =============================================================================
 // Emergency shutdown: broadcast exit signal to every handshake'd core and
 // deinit their AICore register blocks. Idempotent.
@@ -948,9 +863,8 @@ void SchedulerContext::emergency_shutdown(Runtime *runtime) {
 // =============================================================================
 // Lifecycle: init / deinit
 // =============================================================================
-int32_t SchedulerContext::init(
-    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
-) {
+int32_t
+SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) {
     always_assert(runtime != nullptr);
 
     // Zero all per-core execution state before handshake
@@ -959,7 +873,6 @@ int32_t SchedulerContext::init(
-    // Wire thread/transition configuration that handshake/assign need to read.
+    // Wire thread configuration that handshake/assign need to read.
     aicpu_thread_num_ = aicpu_thread_num;
     sched_thread_num_ = sched_thread_num;
-    orch_to_sched_ = orch_to_sched;
     regs_ = regs_base;
 
 #if PTO2_PROFILING
@@ -976,10 +889,6 @@ int32_t SchedulerContext::init(
         l2_swimlane_aicpu_init(runtime->dev.worker_count);
         l2_swimlane_level_ = get_l2_swimlane_level();
         if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            // When orchestrator phases merge into scheduler threads
-            // (PTO2_ORCH_TO_SCHED=1), phase records flow through
-            // aicpu_thread_num_ pools — matches the same branch in the
-            // dump_args_init call below.
             // Sched phase pool count = number of scheduler threads.
             // This block runs before assign_cores_to_threads, so the
             // active_sched_threads_ member isn't set yet — recompute the same
@@ -988,10 +897,9 @@ int32_t SchedulerContext::init(
             // assign_cores_to_threads' active_sched_threads_). Without this
             // normalization here, init_phase would prime zero sched pools
             // and all sched_phase emits would silently drop.
-            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
+            const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
             // Orch phase is a single instance (PR #971 design), so the orch
-            // pool count is always 1 regardless of orch_to_sched mode.
+            // pool count is always 1.
             const int orch_phase_threads = 1;
             l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads);
         }
@@ -1018,7 +926,7 @@ int32_t SchedulerContext::init(
     // orchestrator thread (see aicpu_executor.cpp).
 #if PTO2_PROFILING
     if (is_dump_args_enabled()) {
-        dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : active_sched_threads_);
+        dump_args_init(active_sched_threads_);
     }
     if (is_pmu_enabled()) {
         pmu_aicpu_init(physical_core_ids_, cores_total_num_);
@@ -1112,11 +1020,6 @@ void SchedulerContext::deinit() {
     completed_tasks_.store(0, std::memory_order_release);
     total_tasks_ = 0;
     orchestrator_done_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
     completed_.store(false, std::memory_order_release);
 
     // Reset core discovery and assignment state
@@ -1125,7 +1028,6 @@ void SchedulerContext::deinit() {
     cores_total_num_ = 0;
     aicpu_thread_num_ = 0;
     sched_thread_num_ = 0;
-    orch_to_sched_ = false;
     active_sched_threads_ = 0;
     for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
         core_trackers_[t] = CoreTracker{};
@@ -1167,7 +1069,7 @@ void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runt
-// and drives the orchestrator → scheduler core transition (or fatal shutdown).
+// and triggers emergency shutdown on a fatal error.
 // =============================================================================
 void SchedulerContext::on_orchestration_done(
-    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
+    Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks
 ) {
 #if PTO2_PROFILING
     if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
@@ -1199,32 +1101,9 @@ void SchedulerContext::on_orchestration_done(
         }
     }
 
-    // Skip core transition on fatal error — cores already shut down above.
-    if (completed_.load(std::memory_order_acquire)) {
-        // Signal transition to unblock scheduler threads waiting at core transition
-        transition_requested_.store(true, std::memory_order_release);
-        reassigned_.store(true, std::memory_order_release);
-    } else if (orch_to_sched_) {
-        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
-        transition_requested_.store(true, std::memory_order_release);
-
-        // Wait for scheduler threads to acknowledge transition request
-        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
-            if (completed_.load(std::memory_order_acquire)) {
-                break;
-            }
-            SPIN_WAIT_HINT();
-        }
-        if (!completed_.load(std::memory_order_acquire)) {
-            reassign_cores_for_all_threads();
-            reassigned_.store(true, std::memory_order_release);
-        }
-    }
-
 #if PTO2_PROFILING
-    // Write core-to-thread mapping AFTER reassignment so the profiling data
-    // reflects the final distribution (all active_sched_threads_, including
-    // former orchestrator threads when orch_to_sched_ is enabled).
+    // Write the core-to-thread mapping so the profiling data reflects the
+    // scheduler threads' final core distribution.
     if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
         l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
         for (int32_t t = 0; t < active_sched_threads_; t++) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 5cfc08563..a6c0bdfa7 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -59,8 +59,7 @@ class SchedulerContext {
     // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
     // - Captures AICore-register base (consumed by handshake_all_cores())
     // Returns 0 on success, negative on failure (handshake / assignment error).
-    int32_t
-    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
+    int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base);
 
     // Reset all SchedulerContext-owned state to its post-construction defaults.
     // Called by AicpuExecutor::deinit() during per-run teardown.
@@ -150,15 +149,9 @@ class SchedulerContext {
     std::atomic<bool> completed_{false};
     uint64_t *func_id_to_addr_{nullptr};
 
-    // --- Core-transition coordination ---
-    std::atomic<bool> transition_requested_{false};
-    std::atomic<int32_t> wait_reassign_{0};
-    std::atomic<bool> reassigned_{false};
-
     // --- Thread/core configuration ---
     int32_t active_sched_threads_{0};
     int32_t sched_thread_num_{0};
-    bool orch_to_sched_{false};
     int32_t aicpu_thread_num_{0};
     int32_t cores_total_num_{0};
 
@@ -190,9 +183,6 @@ class SchedulerContext {
     // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
     bool assign_cores_to_threads();
 
-    // Re-distribute all cores across all threads after orchestration completes.
-    void reassign_cores_for_all_threads();
-
     // Emergency shutdown: broadcast exit signal to every handshake'd core and
     // deinit their AICore register blocks. Idempotent.
     void emergency_shutdown(Runtime *runtime);
@@ -323,8 +313,6 @@ class SchedulerContext {
     __attribute__((noinline, cold)) LoopAction
     handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
 
-    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
-
     __attribute__((noinline, cold)) LoopAction
     check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index af17954ed..399cf70b2 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -520,8 +520,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
     int32_t deferred_release_count = 0;
 
-    bool cores_released = false;
-
     // PMU runs require single-issue dispatch — overlapping in-flight tasks
     // pollute per-task PMU counters. Cached at function scope (parity with
     // a2a3): is_pmu_enabled() is extern "C" and the compiler cannot hoist it
@@ -629,11 +627,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             if (action == LoopAction::BREAK_LOOP) break;
         }
 
-        if (!cores_released && orch_to_sched_) {
-            LoopAction action = handle_core_transition(cores_released);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
 #if PTO2_PROFILING
         CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
 #endif
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 8b28e620e..5edaa438b 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -37,7 +37,6 @@ Runtime::Runtime() {
     memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus));
     dev.aicpu_allowed_cpu_count = 0;
     dev.aicpu_launch_count = 0;
-    dev.orch_to_sched = false;
     dev.serial_orch_sched = false;
     dev.gm_sm_ptr_ = nullptr;
     dev.slot_states_ptr_ = nullptr;