From 02ac7e1f249f43f71c550c71fb0c33df8170d6d7 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Tue, 30 Jun 2026 20:46:55 +0800 Subject: [PATCH] Refactor: remove PTO2_ORCH_TO_SCHED feature and dead transition branches The default-off PTO2_ORCH_TO_SCHED env flag made orchestrator AICPU threads transition into scheduler threads after the task graph was built, via a core-reassignment handshake. The flag was exercised by no test, CI job, or build script, so the whole transition path was permanent dead weight under the default. Removed across both a2a3 and a5 tensormap_and_ringbuffer runtimes: - Flag plumbing: the getenv("PTO2_ORCH_TO_SCHED") read in runtime_maker.cpp, the Runtime::orch_to_sched member, its init reset, and the orch_to_sched_ executor member + dispatch gate. - Core-transition machinery, reachable only when the flag was set (now dead): handle_core_transition(), reassign_cores_for_all_threads(), the transition_requested_/wait_reassign_/reassigned_ atomics, the cores_released dispatch-loop branch, and the transition block in on_orchestration_done(). The two orch_to_sched_ ternaries collapse to their default arms. - Stale doc/comment references (profiling_levels, RUNTIME_LOGIC, SUBMIT_BY_CLUSTER, dynamic-linking, swimlane collector comments). on_orchestration_done's thread_idx is now used only inside PTO2_PROFILING-guarded code (the removed core-transition log line was its only unconditional use), so it is marked [[maybe_unused]] to keep the PTO2_PROFILING=0 build warning-clean under -Werror=unused-parameter. The separate serial_orch_sched (PTO2_SERIAL_ORCH_SCHED) flag is left untouched. Builds clean on a2a3 and a5 across all profiling-flag combos; tensormap sim tests pass on a2a3sim and a5sim. 
--- docs/dynamic-linking.md | 4 +- .../aicpu/l2_swimlane_collector_aicpu.h | 6 +- .../aicpu/l2_swimlane_collector_aicpu.cpp | 3 +- .../aicpu/aicpu_executor.cpp | 9 +- .../docs/RUNTIME_LOGIC.md | 6 +- .../docs/SUBMIT_BY_CLUSTER.md | 8 +- .../docs/profiling_levels.md | 37 ++--- .../host/runtime_maker.cpp | 4 - .../runtime/runtime.h | 7 - .../runtime/scheduler/scheduler_cold_path.cpp | 135 ++--------------- .../runtime/scheduler/scheduler_context.h | 14 +- .../runtime/scheduler/scheduler_dispatch.cpp | 7 - .../runtime/shared/runtime.cpp | 1 - .../aicpu/l2_swimlane_collector_aicpu.h | 6 +- .../aicpu/l2_swimlane_collector_aicpu.cpp | 3 +- .../aicpu/aicpu_executor.cpp | 9 +- .../docs/RUNTIME_LOGIC.md | 6 +- .../docs/SUBMIT_BY_CLUSTER.md | 8 +- .../docs/profiling_levels.md | 37 ++--- .../host/runtime_maker.cpp | 4 - .../runtime/runtime.h | 7 - .../runtime/scheduler/scheduler_cold_path.cpp | 139 ++---------------- .../runtime/scheduler/scheduler_context.h | 14 +- .../runtime/scheduler/scheduler_dispatch.cpp | 7 - .../runtime/shared/runtime.cpp | 1 - 25 files changed, 64 insertions(+), 418 deletions(-) diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 88252d2dc..a8fa2e045 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -224,11 +224,11 @@ SchedulerContext owns its own teardown: - `SchedulerContext::deinit()` resets every scheduler-owned field — per-core states, payloads, sync-start drain coordination (`sync_start_pending` / `drain_worker_elected` / `drain_ack_mask` / - `pending_task`), task counters, transition flags, worker-id lists, + `pending_task`), task counters, worker-id lists, core trackers, `cores_total_num_` / `aic_count_` / `aiv_count_`, `regs_`, `sched_`, `func_id_to_addr_`, and the `pto2_init_*` flags. 
- `AicpuExecutor::deinit()` calls `sched_ctx_.deinit()` first, then resets - only its own fields: `thread_num_`, `sched_thread_num_`, `orch_to_sched_`, + only its own fields: `thread_num_`, `sched_thread_num_`, `orch_func_`, `orch_args_cached_`, `orch_so_handle_`, `orch_so_path_`, `runtime_init_ready_`, and the lifecycle atomics (`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`, diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h index 80da54476..3182eb1c1 100644 --- a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co * to the L2Swimlane base * @param num_sched_phase_threads Number of sched-phase pools to prime * @param num_orch_phase_threads Number of orch-phase pools to prime - * (typically 1; in orch_to_sched mode = - * num_aicpu_threads) + * (typically 1) */ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads); @@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase( * Must be called once from the orchestrator thread before any * l2_swimlane_aicpu_record_orch_phase() calls. 
* - * @param thread_idx Thread index for the orchestrator (typically num_sched_threads; - * in orch_to_sched mode each scheduler thread sets its own) + * @param thread_idx Thread index for the orchestrator (typically num_sched_threads) */ void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx); diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index 5ed92cd61..aafffa8ee 100644 --- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; -// Per-thread orch-phase pool/buffer caches (typically one orch thread; in -// orch_to_sched mode all aicpu threads can write here). +// Per-thread orch-phase pool/buffer caches (one orch thread). 
static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 2d5613ba8..0832c3c62 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -114,7 +114,6 @@ struct OrchSoEntry { struct AicpuExecutor { int32_t sched_thread_num_; - bool orch_to_sched_{false}; bool serial_orch_sched_{false}; // ===== Thread management state ===== @@ -206,7 +205,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) { aicpu_thread_num_ = runtime->dev.aicpu_thread_num; if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1; sched_thread_num_ = aicpu_thread_num_ - 1; - orch_to_sched_ = runtime->dev.orch_to_sched; serial_orch_sched_ = runtime->dev.serial_orch_sched; if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) { @@ -215,7 +213,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return -1; } - if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) { + if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) { init_failed_.store(true, std::memory_order_release); return -1; } @@ -728,8 +726,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); } - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration) + if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) { // Device orchestration: wait for the primary orchestrator to initialize the 
SM header while (!runtime_init_ready_.load(std::memory_order_acquire)) { SPIN_WAIT_HINT(); @@ -802,7 +800,6 @@ void AicpuExecutor::deinit(Runtime *runtime) { aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; serial_orch_sched_ = false; orch_args_cached_.reset(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index ef01e3231..ef59d2e98 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -544,11 +544,11 @@ Public surface (called from `AicpuExecutor::init/run/deinit`): | Method | Phase | Purpose | | ------ | ----- | ------- | -| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | +| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` | | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop | | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores; PMU finalize when enabled | -| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) | +| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) | | `deinit()` | once per run | Reset every scheduler-owned field to its 
post-construction default | | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` | @@ -556,7 +556,7 @@ Private internals are split across three .cpp files by responsibility: - `scheduler_completion.cpp` — completion polling, drain protocol - `scheduler_dispatch.cpp` — task dispatch loop and helpers -- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done` +- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done` `AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`. diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md index 8cba7e90c..50f734fee 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp Design must preserve the current main runtime architecture: -1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion. 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). ## 5. 
Terminology @@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged. ### 9.2 Cluster Ownership 1. One cluster must be owned by one scheduler domain/thread at a time. -2. No split-cluster ownership in either: - - initial `assign_cores_to_threads()` - - post-orchestrator `reassign_cores_for_all_threads()` -3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. +2. No split-cluster ownership in `assign_cores_to_threads()`. +3. Lane occupancy bookkeeping must remain consistent with ownership. ## 10. Functional Requirements diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index bd669f365..4467cd7b2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) -- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true) - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) @@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line -- `Thread 
%d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true` - `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) **LOG_INFO_V9 count (normal run):** -- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) -- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`) +- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) > See the table at the end for concrete counts based on the `paged_attention` example. -**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10): +**Example log output** (from `paged_attention`, device 10): ```text Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us @@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 ``` -**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11): - -```text -Thread 3: orch_stage_end=48236915058307 -Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us -Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us -PTO2 total submitted tasks = 13, already executed 13 tasks -Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us -Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4 -Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us -Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9 -``` - -> 
With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time). - **Note:** - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. -- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`. --- @@ -420,13 +401,13 @@ definitions to runtime headers. > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout). -| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description | -| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- | -| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output | -| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary | -| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown | -| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown | -| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats | +| Level | Macro Settings | LOG_INFO_V9 Count | Description | +| ----- | -------------- | ----------------- | ----------- | +| 0 | `PTO2_PROFILING=0` | 0 | No timing output | +| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary | +| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown | +| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats | --- diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc0ddefb8..7952045ce 100644 --- 
a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -477,10 +477,6 @@ static bool stage_device_args( // runtime. Behavior-only env reads (no new gates); kept here so the args and // image steps stay free of unrelated state. static void apply_orch_sched_env_flags(Runtime *runtime) { - const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED"); - runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T'); - LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled"); - const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED"); runtime->dev.serial_orch_sched = serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T'); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index ae6a2446e..5f3109a08 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -207,7 +207,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // (= orch + schedulers). AicpuExecutor splits this into one orchestrator // thread (highest idx, runs aicpu_orchestration_entry) and the remaining // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore. - // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set. int aicpu_thread_num; int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) @@ -223,12 +222,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // PTO2 integration: kernel_id -> GM function_bin_addr mapping uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; - // Orchestrator-to-scheduler transition control - // When true, orchestrator threads convert to scheduler threads after orchestration completes. 
- // When false (default), orchestrator threads exit after orchestration without dispatching tasks. - // Controlled via PTO2_ORCH_TO_SCHED environment variable. - bool orch_to_sched; - // Serial orchestrator -> scheduler start control. // When true, scheduler threads wait until orchestration has fully built the // task graph before entering resolve_and_dispatch(). diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index cae114427..5bac8297a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -94,21 +94,6 @@ LoopAction SchedulerContext::handle_orchestrator_exit( return LoopAction::NONE; } -LoopAction SchedulerContext::handle_core_transition(bool &cores_released) { - if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE; - if (!reassigned_.load(std::memory_order_acquire)) { - wait_reassign_.fetch_add(1, std::memory_order_release); - while (!reassigned_.load(std::memory_order_acquire)) { - if (completed_.load(std::memory_order_acquire)) { - return LoopAction::BREAK_LOOP; - } - SPIN_WAIT_HINT(); - } - } - cores_released = true; - return LoopAction::NONE; -} - LoopAction SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) { if (completed_.load(std::memory_order_acquire)) { @@ -330,7 +315,7 @@ void SchedulerContext::log_stall_diagnostics( // CLUSTER lines: one per cluster this thread owns. // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the - // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads. + // round-robin assignment in assign_cores_to_threads. int32_t ast = active_sched_threads_ > 0 ? 
active_sched_threads_ : aicpu_thread_num_; for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) { int32_t offset = cli * 3; @@ -848,76 +833,6 @@ bool SchedulerContext::assign_cores_to_threads() { return true; } -// ============================================================================= -// Reassign all cores across all threads (sched + orchestrator) after orchestration. -// ============================================================================= -void SchedulerContext::reassign_cores_for_all_threads() { - LOG_INFO_V0( - "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_ - ); - - // Collect running worker_ids from all current trackers - bool running_cores[RUNTIME_MAX_WORKER] = {}; - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - auto all_running = core_trackers_[i].get_all_running_cores(); - int32_t bp; - while ((bp = all_running.pop_first()) >= 0) { - running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true; - } - } - - // Count clusters per thread (round-robin across all threads) - int32_t cluster_count = aic_count_; - int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - clusters_per_thread[ci % aicpu_thread_num_]++; - } - - // Re-init all trackers and reset core counts - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - core_trackers_[i].init(clusters_per_thread[i]); - } - - // Assign clusters round-robin and restore running state - int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - int32_t t = ci % aicpu_thread_num_; - - int32_t aic_wid = aic_worker_ids_[ci]; - int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; - int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; - - int32_t cl_idx = cluster_idx_per_thread[t]++; - core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid); - - // init() marks all idle; toggle cores that were 
running and restore pending_occupied - if (running_cores[aic_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3); - core_trackers_[t].set_pending_occupied(cl_idx * 3); - } - if (running_cores[aiv0_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 1); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1); - } - if (running_cores[aiv1_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 2); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2); - } - } - - // Log final distribution - LOG_INFO_V0("Core reassignment complete:"); - for (int32_t t = 0; t < aicpu_thread_num_; t++) { - int32_t aic_running = core_trackers_[t].get_running_count(); - int32_t aiv_running = core_trackers_[t].get_running_count(); - LOG_INFO_V0( - " Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(), - core_trackers_[t].get_cluster_count(), aic_running, aiv_running - ); - } - active_sched_threads_ = aicpu_thread_num_; -} - // ============================================================================= // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. @@ -945,9 +860,8 @@ void SchedulerContext::emergency_shutdown(Runtime *runtime) { // ============================================================================= // Lifecycle: init / deinit // ============================================================================= -int32_t SchedulerContext::init( - Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base -) { +int32_t +SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) { always_assert(runtime != nullptr); // Zero all per-core execution state before handshake @@ -956,7 +870,6 @@ int32_t SchedulerContext::init( // Wire thread/transition configuration that handshake/assign need to read. 
aicpu_thread_num_ = aicpu_thread_num; sched_thread_num_ = sched_thread_num; - orch_to_sched_ = orch_to_sched; regs_ = regs_base; #if PTO2_PROFILING @@ -977,10 +890,9 @@ int32_t SchedulerContext::init( // threads as scheduler threads" (see assign_cores_to_threads' // active_sched_threads_). Without it, init_phase would prime zero // sched pools and all sched_phase emits would silently drop. - const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; - const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched; + const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; // Orchestration is always single-threaded, so orch-phase is one pool - // (ordinal 0) in both modes — see record_orch_phase. + // (ordinal 0) — see record_orch_phase. const int orch_phase_threads = 1; l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads); } @@ -1007,7 +919,7 @@ int32_t SchedulerContext::init( // orchestrator thread (see aicpu_executor.cpp). #if PTO2_PROFILING if (is_dump_args_enabled()) { - dump_args_init(orch_to_sched_ ? 
aicpu_thread_num_ : active_sched_threads_); + dump_args_init(active_sched_threads_); } if (is_pmu_enabled()) { pmu_aicpu_init(physical_core_ids_, cores_total_num_); @@ -1101,11 +1013,6 @@ void SchedulerContext::deinit() { completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; orchestrator_done_.store(false, std::memory_order_release); - - // Reset core transition state - transition_requested_.store(false, std::memory_order_release); - wait_reassign_.store(0, std::memory_order_release); - reassigned_.store(false, std::memory_order_release); completed_.store(false, std::memory_order_release); // Reset core discovery and assignment state @@ -1114,7 +1021,6 @@ void SchedulerContext::deinit() { cores_total_num_ = 0; aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; active_sched_threads_ = 0; for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { core_trackers_[t] = CoreTracker{}; @@ -1156,7 +1062,7 @@ void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runt // and drives the orchestrator → scheduler core transition (or fatal shutdown). // ============================================================================= void SchedulerContext::on_orchestration_done( - Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks + Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { @@ -1190,32 +1096,9 @@ void SchedulerContext::on_orchestration_done( } } - // Skip core transition on fatal error — cores already shut down above. 
- if (completed_.load(std::memory_order_acquire)) { - // Signal transition to unblock scheduler threads waiting at core transition - transition_requested_.store(true, std::memory_order_release); - reassigned_.store(true, std::memory_order_release); - } else if (orch_to_sched_) { - LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); - transition_requested_.store(true, std::memory_order_release); - - // Wait for scheduler threads to acknowledge transition request - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); - } - } - #if PTO2_PROFILING - // Write core-to-thread mapping AFTER reassignment so the profiling data - // reflects the final distribution (all active_sched_threads_, including - // former orchestrator threads when orch_to_sched_ is enabled). + // Write the core-to-thread mapping so the profiling data reflects the + // scheduler threads' final core distribution. 
if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { l2_swimlane_aicpu_init_core_assignments(cores_total_num_); for (int32_t t = 0; t < active_sched_threads_; t++) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index d8669d42b..02962864d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -61,8 +61,7 @@ class SchedulerContext { // - Binds func_id_to_addr_ / initial sched_ (if rt is already known) // - Captures AICore-register base (consumed by handshake_all_cores()) // Returns 0 on success, negative on failure (handshake / assignment error). - int32_t - init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); + int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base); // Reset all SchedulerContext-owned state to its post-construction defaults. // Called by AicpuExecutor::deinit() during per-run teardown. @@ -152,15 +151,9 @@ class SchedulerContext { std::atomic completed_{false}; uint64_t *func_id_to_addr_{nullptr}; - // --- Core-transition coordination --- - std::atomic transition_requested_{false}; - std::atomic wait_reassign_{0}; - std::atomic reassigned_{false}; - // --- Thread/core configuration --- int32_t active_sched_threads_{0}; int32_t sched_thread_num_{0}; - bool orch_to_sched_{false}; int32_t aicpu_thread_num_{0}; int32_t cores_total_num_{0}; @@ -190,9 +183,6 @@ class SchedulerContext { // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads. bool assign_cores_to_threads(); - // Re-distribute all cores across all threads after orchestration completes. 
- void reassign_cores_for_all_threads(); - // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. void emergency_shutdown(Runtime *runtime); @@ -359,8 +349,6 @@ class SchedulerContext { __attribute__((noinline, cold)) LoopAction handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count); - __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released); - __attribute__((noinline, cold)) LoopAction check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index a3e58c8d6..4b94d0ae0 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -803,8 +803,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP]; int32_t deferred_release_count = 0; - bool cores_released = false; - // PMU runs require single-issue dispatch — overlapping in-flight tasks // pollute per-task PMU counters, so skip the PENDING pre-load phase. 
// Cached at function scope: is_pmu_enabled() is extern "C" and the @@ -916,11 +914,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ if (action == LoopAction::BREAK_LOOP) break; } - if (!cores_released && orch_to_sched_) { - LoopAction action = handle_core_transition(cores_released); - if (action == LoopAction::BREAK_LOOP) break; - } - #if PTO2_PROFILING CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8b28e620e..5edaa438b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -37,7 +37,6 @@ Runtime::Runtime() { memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus)); dev.aicpu_allowed_cpu_count = 0; dev.aicpu_launch_count = 0; - dev.orch_to_sched = false; dev.serial_orch_sched = false; dev.gm_sm_ptr_ = nullptr; dev.slot_states_ptr_ = nullptr; diff --git a/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h index 80da54476..3182eb1c1 100644 --- a/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h +++ b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co * to the L2Swimlane base * @param num_sched_phase_threads Number of sched-phase pools to prime * @param num_orch_phase_threads Number of orch-phase pools to prime - * (typically 1; in orch_to_sched mode = - * num_aicpu_threads) + * (typically 1) */ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads); @@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase( * Must be called once from the orchestrator thread before any * l2_swimlane_aicpu_record_orch_phase() 
calls. * - * @param thread_idx Thread index for the orchestrator (typically num_sched_threads; - * in orch_to_sched mode each scheduler thread sets its own) + * @param thread_idx Thread index for the orchestrator (typically num_sched_threads) */ void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx); diff --git a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index 5ed92cd61..aafffa8ee 100644 --- a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; -// Per-thread orch-phase pool/buffer caches (typically one orch thread; in -// orch_to_sched mode all aicpu threads can write here). +// Per-thread orch-phase pool/buffer caches (one orch thread). 
static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 41a97ace8..91488097a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -114,7 +114,6 @@ struct OrchSoEntry { struct AicpuExecutor { int32_t sched_thread_num_; - bool orch_to_sched_{false}; bool serial_orch_sched_{false}; // ===== Thread management state ===== @@ -208,7 +207,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) { aicpu_thread_num_ = runtime->dev.aicpu_thread_num; if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1; sched_thread_num_ = aicpu_thread_num_ - 1; - orch_to_sched_ = runtime->dev.orch_to_sched; serial_orch_sched_ = runtime->dev.serial_orch_sched; if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) { @@ -217,7 +215,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return -1; } - if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) { + if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) { init_failed_.store(true, std::memory_order_release); return -1; } @@ -723,8 +721,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); } - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration) + if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) { // Device orchestration: wait for the primary orchestrator to initialize the SM 
header while (!runtime_init_ready_.load(std::memory_order_acquire)) { SPIN_WAIT_HINT(); @@ -797,7 +795,6 @@ void AicpuExecutor::deinit(Runtime *runtime) { aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; serial_orch_sched_ = false; orch_args_cached_.reset(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 83ea5c270..a339b178e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -544,11 +544,11 @@ Public surface (called from `AicpuExecutor::init/run/deinit`): | Method | Phase | Purpose | | ------ | ----- | ------- | -| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | +| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` | | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop | | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores | -| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) | +| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) | | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default | | Read-only 
accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` | @@ -556,7 +556,7 @@ Private internals are split across three .cpp files by responsibility: - `scheduler_completion.cpp` — completion polling, drain protocol - `scheduler_dispatch.cpp` — task dispatch loop and helpers -- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done` +- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done` -`AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`. +`AicpuExecutor` calls neither `handshake_*`, `assign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`. diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md index 8cba7e90c..50f734fee 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp Design must preserve the current main runtime architecture: -1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion. 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). ## 5. Terminology @@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged.
### 9.2 Cluster Ownership 1. One cluster must be owned by one scheduler domain/thread at a time. -2. No split-cluster ownership in either: - - initial `assign_cores_to_threads()` - - post-orchestrator `reassign_cores_for_all_threads()` -3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. +2. No split-cluster ownership in `assign_cores_to_threads()`. +3. Lane occupancy bookkeeping must remain consistent with ownership. ## 10. Functional Requirements diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 2ef6c1b6a..62a38766e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) -- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true) - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) @@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line -- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true` - `Thread %d: 
sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) **LOG_INFO_V9 count (normal run):** -- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) -- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`) +- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) > See the table at the end for concrete counts based on the `paged_attention` example. -**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10): +**Example log output** (from `paged_attention`, device 10): ```text Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us @@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 ``` -**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11): - -```text -Thread 3: orch_stage_end=48236915058307 -Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us -Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us -PTO2 total submitted tasks = 13, already executed 13 tasks -Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us -Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4 -Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us -Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9 -``` - -> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. 
They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time). - **Note:** - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. -- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`. --- @@ -390,13 +371,13 @@ definitions to runtime headers. > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout). -| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description | -| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- | -| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output | -| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary | -| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown | -| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown | -| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats | +| Level | Macro Settings | LOG_INFO_V9 Count | Description | +| ----- | -------------- | ----------------- | ----------- | +| 0 | `PTO2_PROFILING=0` | 0 | No timing output | +| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary | +| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown | +| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats | --- diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc0ddefb8..7952045ce 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -477,10 
+477,6 @@ static bool stage_device_args( // runtime. Behavior-only env reads (no new gates); kept here so the args and // image steps stay free of unrelated state. static void apply_orch_sched_env_flags(Runtime *runtime) { - const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED"); - runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T'); - LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled"); - const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED"); runtime->dev.serial_orch_sched = serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T'); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 6d2bc08a0..22d965231 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -215,7 +215,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // (= orch + schedulers). AicpuExecutor splits this into one orchestrator // thread (highest idx, runs aicpu_orchestration_entry) and the remaining // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore. - // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set. int aicpu_thread_num; int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) @@ -237,12 +236,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // PTO2 integration: kernel_id -> GM function_bin_addr mapping uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; - // Orchestrator-to-scheduler transition control - // When true, orchestrator threads convert to scheduler threads after orchestration completes. - // When false (default), orchestrator threads exit after orchestration without dispatching tasks. - // Controlled via PTO2_ORCH_TO_SCHED environment variable. 
- bool orch_to_sched; - // Serial orchestrator -> scheduler start control. // When true, scheduler threads wait until orchestration has fully built the // task graph before entering resolve_and_dispatch(). diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 517a40c8b..20587aeaf 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -94,21 +94,6 @@ LoopAction SchedulerContext::handle_orchestrator_exit( return LoopAction::NONE; } -LoopAction SchedulerContext::handle_core_transition(bool &cores_released) { - if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE; - if (!reassigned_.load(std::memory_order_acquire)) { - wait_reassign_.fetch_add(1, std::memory_order_release); - while (!reassigned_.load(std::memory_order_acquire)) { - if (completed_.load(std::memory_order_acquire)) { - return LoopAction::BREAK_LOOP; - } - SPIN_WAIT_HINT(); - } - } - cores_released = true; - return LoopAction::NONE; -} - LoopAction SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) { if (completed_.load(std::memory_order_acquire)) { @@ -327,7 +312,7 @@ void SchedulerContext::log_stall_diagnostics( // CLUSTER lines: one per cluster this thread owns. // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the - // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads. + // round-robin assignment in assign_cores_to_threads. int32_t ast = active_sched_threads_ > 0 ? 
active_sched_threads_ : aicpu_thread_num_; for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) { int32_t offset = cli * 3; @@ -851,76 +836,6 @@ bool SchedulerContext::assign_cores_to_threads() { return true; } -// ============================================================================= -// Reassign all cores across all threads (sched + orchestrator) after orchestration. -// ============================================================================= -void SchedulerContext::reassign_cores_for_all_threads() { - LOG_INFO_V0( - "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_ - ); - - // Collect running worker_ids from all current trackers - bool running_cores[RUNTIME_MAX_WORKER] = {}; - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - auto all_running = core_trackers_[i].get_all_running_cores(); - int32_t bp; - while ((bp = all_running.pop_first()) >= 0) { - running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true; - } - } - - // Count clusters per thread (round-robin across all threads) - int32_t cluster_count = aic_count_; - int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - clusters_per_thread[ci % aicpu_thread_num_]++; - } - - // Re-init all trackers and reset core counts - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - core_trackers_[i].init(clusters_per_thread[i]); - } - - // Assign clusters round-robin and restore running state - int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - int32_t t = ci % aicpu_thread_num_; - - int32_t aic_wid = aic_worker_ids_[ci]; - int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; - int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; - - int32_t cl_idx = cluster_idx_per_thread[t]++; - core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid); - - // init() marks all idle; toggle cores that were 
running and restore pending_occupied - if (running_cores[aic_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3); - core_trackers_[t].set_pending_occupied(cl_idx * 3); - } - if (running_cores[aiv0_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 1); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1); - } - if (running_cores[aiv1_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 2); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2); - } - } - - // Log final distribution - LOG_INFO_V0("Core reassignment complete:"); - for (int32_t t = 0; t < aicpu_thread_num_; t++) { - int32_t aic_running = core_trackers_[t].get_running_count(); - int32_t aiv_running = core_trackers_[t].get_running_count(); - LOG_INFO_V0( - " Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(), - core_trackers_[t].get_cluster_count(), aic_running, aiv_running - ); - } - active_sched_threads_ = aicpu_thread_num_; -} - // ============================================================================= // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. @@ -948,9 +863,8 @@ void SchedulerContext::emergency_shutdown(Runtime *runtime) { // ============================================================================= // Lifecycle: init / deinit // ============================================================================= -int32_t SchedulerContext::init( - Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base -) { +int32_t +SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) { always_assert(runtime != nullptr); // Zero all per-core execution state before handshake @@ -959,7 +873,6 @@ int32_t SchedulerContext::init( - // Wire thread/transition configuration that handshake/assign need to read. + // Wire thread configuration that handshake/assign need to read.
aicpu_thread_num_ = aicpu_thread_num; sched_thread_num_ = sched_thread_num; - orch_to_sched_ = orch_to_sched; regs_ = regs_base; #if PTO2_PROFILING @@ -976,10 +889,6 @@ int32_t SchedulerContext::init( l2_swimlane_aicpu_init(runtime->dev.worker_count); l2_swimlane_level_ = get_l2_swimlane_level(); if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { - // When orchestrator phases merge into scheduler threads - // (PTO2_ORCH_TO_SCHED=1), phase records flow through - // aicpu_thread_num_ pools — matches the same branch in the - // dump_args_init call below. // Sched phase pool count = number of scheduler threads. // This block runs before assign_cores_to_threads, so the // active_sched_threads_ member isn't set yet — recompute the same @@ -988,10 +897,9 @@ int32_t SchedulerContext::init( // assign_cores_to_threads' active_sched_threads_). Without this // normalization here, init_phase would prime zero sched pools // and all sched_phase emits would silently drop. - const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; - const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched; + const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; // Orch phase is a single instance (PR #971 design), so the orch - // pool count is always 1 regardless of orch_to_sched mode. + // pool count is always 1. const int orch_phase_threads = 1; l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads); } @@ -1018,7 +926,7 @@ int32_t SchedulerContext::init( // orchestrator thread (see aicpu_executor.cpp). #if PTO2_PROFILING if (is_dump_args_enabled()) { - dump_args_init(orch_to_sched_ ? 
aicpu_thread_num_ : active_sched_threads_); + dump_args_init(active_sched_threads_); } if (is_pmu_enabled()) { pmu_aicpu_init(physical_core_ids_, cores_total_num_); @@ -1112,11 +1020,6 @@ void SchedulerContext::deinit() { completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; orchestrator_done_.store(false, std::memory_order_release); - - // Reset core transition state - transition_requested_.store(false, std::memory_order_release); - wait_reassign_.store(0, std::memory_order_release); - reassigned_.store(false, std::memory_order_release); completed_.store(false, std::memory_order_release); // Reset core discovery and assignment state @@ -1125,7 +1028,6 @@ void SchedulerContext::deinit() { cores_total_num_ = 0; aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; active_sched_threads_ = 0; for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { core_trackers_[t] = CoreTracker{}; @@ -1167,7 +1069,7 @@ void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runt -// and drives the orchestrator → scheduler core transition (or fatal shutdown). +// and performs the fatal shutdown when required. // ============================================================================= void SchedulerContext::on_orchestration_done( - Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks + Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { @@ -1199,32 +1101,9 @@ void SchedulerContext::on_orchestration_done( } } - // Skip core transition on fatal error — cores already shut down above.
- if (completed_.load(std::memory_order_acquire)) { - // Signal transition to unblock scheduler threads waiting at core transition - transition_requested_.store(true, std::memory_order_release); - reassigned_.store(true, std::memory_order_release); - } else if (orch_to_sched_) { - LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); - transition_requested_.store(true, std::memory_order_release); - - // Wait for scheduler threads to acknowledge transition request - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); - } - } - #if PTO2_PROFILING - // Write core-to-thread mapping AFTER reassignment so the profiling data - // reflects the final distribution (all active_sched_threads_, including - // former orchestrator threads when orch_to_sched_ is enabled). + // Write the core-to-thread mapping so the profiling data reflects the + // scheduler threads' final core distribution. if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { l2_swimlane_aicpu_init_core_assignments(cores_total_num_); for (int32_t t = 0; t < active_sched_threads_; t++) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 5cfc08563..a6c0bdfa7 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -59,8 +59,7 @@ class SchedulerContext { // - Binds func_id_to_addr_ / initial sched_ (if rt is already known) // - Captures AICore-register base (consumed by handshake_all_cores()) // Returns 0 on success, negative on failure (handshake / assignment error). 
- int32_t - init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); + int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base); // Reset all SchedulerContext-owned state to its post-construction defaults. // Called by AicpuExecutor::deinit() during per-run teardown. @@ -150,15 +149,9 @@ class SchedulerContext { std::atomic completed_{false}; uint64_t *func_id_to_addr_{nullptr}; - // --- Core-transition coordination --- - std::atomic transition_requested_{false}; - std::atomic wait_reassign_{0}; - std::atomic reassigned_{false}; - // --- Thread/core configuration --- int32_t active_sched_threads_{0}; int32_t sched_thread_num_{0}; - bool orch_to_sched_{false}; int32_t aicpu_thread_num_{0}; int32_t cores_total_num_{0}; @@ -190,9 +183,6 @@ class SchedulerContext { // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads. bool assign_cores_to_threads(); - // Re-distribute all cores across all threads after orchestration completes. - void reassign_cores_for_all_threads(); - // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. 
void emergency_shutdown(Runtime *runtime); @@ -323,8 +313,6 @@ class SchedulerContext { __attribute__((noinline, cold)) LoopAction handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count); - __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released); - __attribute__((noinline, cold)) LoopAction check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index af17954ed..399cf70b2 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -520,8 +520,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP]; int32_t deferred_release_count = 0; - bool cores_released = false; - // PMU runs require single-issue dispatch — overlapping in-flight tasks // pollute per-task PMU counters. 
Cached at function scope (parity with // a2a3): is_pmu_enabled() is extern "C" and the compiler cannot hoist it @@ -629,11 +627,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ if (action == LoopAction::BREAK_LOOP) break; } - if (!cores_released && orch_to_sched_) { - LoopAction action = handle_core_transition(cores_released); - if (action == LoopAction::BREAK_LOOP) break; - } - #if PTO2_PROFILING CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8b28e620e..5edaa438b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -37,7 +37,6 @@ Runtime::Runtime() { memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus)); dev.aicpu_allowed_cpu_count = 0; dev.aicpu_launch_count = 0; - dev.orch_to_sched = false; dev.serial_orch_sched = false; dev.gm_sm_ptr_ = nullptr; dev.slot_states_ptr_ = nullptr;