diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 88252d2dc..a8fa2e045 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -224,11 +224,11 @@ SchedulerContext owns its own teardown: - `SchedulerContext::deinit()` resets every scheduler-owned field — per-core states, payloads, sync-start drain coordination (`sync_start_pending` / `drain_worker_elected` / `drain_ack_mask` / - `pending_task`), task counters, transition flags, worker-id lists, + `pending_task`), task counters, worker-id lists, core trackers, `cores_total_num_` / `aic_count_` / `aiv_count_`, `regs_`, `sched_`, `func_id_to_addr_`, and the `pto2_init_*` flags. - `AicpuExecutor::deinit()` calls `sched_ctx_.deinit()` first, then resets - only its own fields: `thread_num_`, `sched_thread_num_`, `orch_to_sched_`, + only its own fields: `thread_num_`, `sched_thread_num_`, `orch_func_`, `orch_args_cached_`, `orch_so_handle_`, `orch_so_path_`, `runtime_init_ready_`, and the lifecycle atomics (`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`, diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h index 80da54476..3182eb1c1 100644 --- a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co * to the L2Swimlane base * @param num_sched_phase_threads Number of sched-phase pools to prime * @param num_orch_phase_threads Number of orch-phase pools to prime - * (typically 1; in orch_to_sched mode = - * num_aicpu_threads) + * (typically 1) */ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads); @@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase( * Must be called once from the orchestrator thread before any * 
l2_swimlane_aicpu_record_orch_phase() calls. * - * @param thread_idx Thread index for the orchestrator (typically num_sched_threads; - * in orch_to_sched mode each scheduler thread sets its own) + * @param thread_idx Thread index for the orchestrator (typically num_sched_threads) */ void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx); diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index 5ed92cd61..aafffa8ee 100644 --- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; -// Per-thread orch-phase pool/buffer caches (typically one orch thread; in -// orch_to_sched mode all aicpu threads can write here). +// Per-thread orch-phase pool/buffer caches (one orch thread). 
static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 2d5613ba8..0832c3c62 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -114,7 +114,6 @@ struct OrchSoEntry { struct AicpuExecutor { int32_t sched_thread_num_; - bool orch_to_sched_{false}; bool serial_orch_sched_{false}; // ===== Thread management state ===== @@ -206,7 +205,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) { aicpu_thread_num_ = runtime->dev.aicpu_thread_num; if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1; sched_thread_num_ = aicpu_thread_num_ - 1; - orch_to_sched_ = runtime->dev.orch_to_sched; serial_orch_sched_ = runtime->dev.serial_orch_sched; if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) { @@ -215,7 +213,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return -1; } - if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) { + if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) { init_failed_.store(true, std::memory_order_release); return -1; } @@ -728,8 +726,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); } - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration) + if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) { // Device orchestration: wait for the primary orchestrator to initialize the 
SM header while (!runtime_init_ready_.load(std::memory_order_acquire)) { SPIN_WAIT_HINT(); @@ -802,7 +800,6 @@ void AicpuExecutor::deinit(Runtime *runtime) { aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; serial_orch_sched_ = false; orch_args_cached_.reset(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index ef01e3231..ef59d2e98 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -544,11 +544,11 @@ Public surface (called from `AicpuExecutor::init/run/deinit`): | Method | Phase | Purpose | | ------ | ----- | ------- | -| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | +| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` | | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop | | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores; PMU finalize when enabled | -| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) | +| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) | | `deinit()` | once per run | Reset every scheduler-owned field to its 
post-construction default | | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` | @@ -556,7 +556,7 @@ Private internals are split across three .cpp files by responsibility: - `scheduler_completion.cpp` — completion polling, drain protocol - `scheduler_dispatch.cpp` — task dispatch loop and helpers -- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done` +- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done` -`AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`. +`AicpuExecutor` calls neither `handshake_*`, `assign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`. diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md index 8cba7e90c..50f734fee 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp Design must preserve the current main runtime architecture: -1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion. 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). ## 5. 
Terminology @@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged. ### 9.2 Cluster Ownership 1. One cluster must be owned by one scheduler domain/thread at a time. -2. No split-cluster ownership in either: - - initial `assign_cores_to_threads()` - - post-orchestrator `reassign_cores_for_all_threads()` -3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. +2. No split-cluster ownership in `assign_cores_to_threads()`. +3. Lane occupancy bookkeeping must remain consistent with ownership. ## 10. Functional Requirements diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index bd669f365..4467cd7b2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) -- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true) - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) @@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line -- `Thread 
%d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true` - `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) **LOG_INFO_V9 count (normal run):** -- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) -- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`) +- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) > See the table at the end for concrete counts based on the `paged_attention` example. -**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10): +**Example log output** (from `paged_attention`, device 10): ```text Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us @@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 ``` -**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11): - -```text -Thread 3: orch_stage_end=48236915058307 -Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us -Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us -PTO2 total submitted tasks = 13, already executed 13 tasks -Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us -Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4 -Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us -Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9 -``` - -> 
With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time). - **Note:** - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. -- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`. --- @@ -420,13 +401,13 @@ definitions to runtime headers. > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout). -| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description | -| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- | -| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output | -| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary | -| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown | -| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown | -| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats | +| Level | Macro Settings | LOG_INFO_V9 Count | Description | +| ----- | -------------- | ----------------- | ----------- | +| 0 | `PTO2_PROFILING=0` | 0 | No timing output | +| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary | +| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown | +| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats | --- diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc0ddefb8..7952045ce 100644 --- 
a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -477,10 +477,6 @@ static bool stage_device_args( // runtime. Behavior-only env reads (no new gates); kept here so the args and // image steps stay free of unrelated state. static void apply_orch_sched_env_flags(Runtime *runtime) { - const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED"); - runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T'); - LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled"); - const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED"); runtime->dev.serial_orch_sched = serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T'); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index ae6a2446e..5f3109a08 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -207,7 +207,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // (= orch + schedulers). AicpuExecutor splits this into one orchestrator // thread (highest idx, runs aicpu_orchestration_entry) and the remaining // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore. - // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set. int aicpu_thread_num; int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) @@ -223,12 +222,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // PTO2 integration: kernel_id -> GM function_bin_addr mapping uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; - // Orchestrator-to-scheduler transition control - // When true, orchestrator threads convert to scheduler threads after orchestration completes. 
- // When false (default), orchestrator threads exit after orchestration without dispatching tasks. - // Controlled via PTO2_ORCH_TO_SCHED environment variable. - bool orch_to_sched; - // Serial orchestrator -> scheduler start control. // When true, scheduler threads wait until orchestration has fully built the // task graph before entering resolve_and_dispatch(). diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index cae114427..5bac8297a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -94,21 +94,6 @@ LoopAction SchedulerContext::handle_orchestrator_exit( return LoopAction::NONE; } -LoopAction SchedulerContext::handle_core_transition(bool &cores_released) { - if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE; - if (!reassigned_.load(std::memory_order_acquire)) { - wait_reassign_.fetch_add(1, std::memory_order_release); - while (!reassigned_.load(std::memory_order_acquire)) { - if (completed_.load(std::memory_order_acquire)) { - return LoopAction::BREAK_LOOP; - } - SPIN_WAIT_HINT(); - } - } - cores_released = true; - return LoopAction::NONE; -} - LoopAction SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) { if (completed_.load(std::memory_order_acquire)) { @@ -330,7 +315,7 @@ void SchedulerContext::log_stall_diagnostics( // CLUSTER lines: one per cluster this thread owns. // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the - // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads. + // round-robin assignment in assign_cores_to_threads. int32_t ast = active_sched_threads_ > 0 ? 
active_sched_threads_ : aicpu_thread_num_; for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) { int32_t offset = cli * 3; @@ -848,76 +833,6 @@ bool SchedulerContext::assign_cores_to_threads() { return true; } -// ============================================================================= -// Reassign all cores across all threads (sched + orchestrator) after orchestration. -// ============================================================================= -void SchedulerContext::reassign_cores_for_all_threads() { - LOG_INFO_V0( - "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_ - ); - - // Collect running worker_ids from all current trackers - bool running_cores[RUNTIME_MAX_WORKER] = {}; - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - auto all_running = core_trackers_[i].get_all_running_cores(); - int32_t bp; - while ((bp = all_running.pop_first()) >= 0) { - running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true; - } - } - - // Count clusters per thread (round-robin across all threads) - int32_t cluster_count = aic_count_; - int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - clusters_per_thread[ci % aicpu_thread_num_]++; - } - - // Re-init all trackers and reset core counts - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - core_trackers_[i].init(clusters_per_thread[i]); - } - - // Assign clusters round-robin and restore running state - int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - int32_t t = ci % aicpu_thread_num_; - - int32_t aic_wid = aic_worker_ids_[ci]; - int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; - int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; - - int32_t cl_idx = cluster_idx_per_thread[t]++; - core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid); - - // init() marks all idle; toggle cores that were 
running and restore pending_occupied - if (running_cores[aic_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3); - core_trackers_[t].set_pending_occupied(cl_idx * 3); - } - if (running_cores[aiv0_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 1); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1); - } - if (running_cores[aiv1_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 2); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2); - } - } - - // Log final distribution - LOG_INFO_V0("Core reassignment complete:"); - for (int32_t t = 0; t < aicpu_thread_num_; t++) { - int32_t aic_running = core_trackers_[t].get_running_count(); - int32_t aiv_running = core_trackers_[t].get_running_count(); - LOG_INFO_V0( - " Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(), - core_trackers_[t].get_cluster_count(), aic_running, aiv_running - ); - } - active_sched_threads_ = aicpu_thread_num_; -} - // ============================================================================= // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. @@ -945,9 +860,8 @@ void SchedulerContext::emergency_shutdown(Runtime *runtime) { // ============================================================================= // Lifecycle: init / deinit // ============================================================================= -int32_t SchedulerContext::init( - Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base -) { +int32_t +SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) { always_assert(runtime != nullptr); // Zero all per-core execution state before handshake @@ -956,7 +870,6 @@ int32_t SchedulerContext::init( // Wire thread/transition configuration that handshake/assign need to read. 
aicpu_thread_num_ = aicpu_thread_num; sched_thread_num_ = sched_thread_num; - orch_to_sched_ = orch_to_sched; regs_ = regs_base; #if PTO2_PROFILING @@ -977,10 +890,9 @@ int32_t SchedulerContext::init( // threads as scheduler threads" (see assign_cores_to_threads' // active_sched_threads_). Without it, init_phase would prime zero // sched pools and all sched_phase emits would silently drop. - const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; - const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched; + const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; // Orchestration is always single-threaded, so orch-phase is one pool - // (ordinal 0) in both modes — see record_orch_phase. + // (ordinal 0) — see record_orch_phase. const int orch_phase_threads = 1; l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads); } @@ -1007,7 +919,7 @@ int32_t SchedulerContext::init( // orchestrator thread (see aicpu_executor.cpp). #if PTO2_PROFILING if (is_dump_args_enabled()) { - dump_args_init(orch_to_sched_ ? 
aicpu_thread_num_ : active_sched_threads_); + dump_args_init(active_sched_threads_); } if (is_pmu_enabled()) { pmu_aicpu_init(physical_core_ids_, cores_total_num_); @@ -1101,11 +1013,6 @@ void SchedulerContext::deinit() { completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; orchestrator_done_.store(false, std::memory_order_release); - - // Reset core transition state - transition_requested_.store(false, std::memory_order_release); - wait_reassign_.store(0, std::memory_order_release); - reassigned_.store(false, std::memory_order_release); completed_.store(false, std::memory_order_release); // Reset core discovery and assignment state @@ -1114,7 +1021,6 @@ void SchedulerContext::deinit() { cores_total_num_ = 0; aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; active_sched_threads_ = 0; for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { core_trackers_[t] = CoreTracker{}; @@ -1156,7 +1062,7 @@ void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runt -// and drives the orchestrator → scheduler core transition (or fatal shutdown). +// and runs emergency_shutdown() on a fatal error. // ============================================================================= void SchedulerContext::on_orchestration_done( - Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks + Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { @@ -1190,32 +1096,9 @@ void SchedulerContext::on_orchestration_done( } } - // Skip core transition on fatal error — cores already shut down above. 
- if (completed_.load(std::memory_order_acquire)) { - // Signal transition to unblock scheduler threads waiting at core transition - transition_requested_.store(true, std::memory_order_release); - reassigned_.store(true, std::memory_order_release); - } else if (orch_to_sched_) { - LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); - transition_requested_.store(true, std::memory_order_release); - - // Wait for scheduler threads to acknowledge transition request - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); - } - } - #if PTO2_PROFILING - // Write core-to-thread mapping AFTER reassignment so the profiling data - // reflects the final distribution (all active_sched_threads_, including - // former orchestrator threads when orch_to_sched_ is enabled). + // Write the core-to-thread mapping so the profiling data reflects the + // scheduler threads' final core distribution. 
if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { l2_swimlane_aicpu_init_core_assignments(cores_total_num_); for (int32_t t = 0; t < active_sched_threads_; t++) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index d8669d42b..02962864d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -61,8 +61,7 @@ class SchedulerContext { // - Binds func_id_to_addr_ / initial sched_ (if rt is already known) // - Captures AICore-register base (consumed by handshake_all_cores()) // Returns 0 on success, negative on failure (handshake / assignment error). - int32_t - init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); + int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base); // Reset all SchedulerContext-owned state to its post-construction defaults. // Called by AicpuExecutor::deinit() during per-run teardown. @@ -152,15 +151,9 @@ class SchedulerContext { std::atomic completed_{false}; uint64_t *func_id_to_addr_{nullptr}; - // --- Core-transition coordination --- - std::atomic transition_requested_{false}; - std::atomic wait_reassign_{0}; - std::atomic reassigned_{false}; - // --- Thread/core configuration --- int32_t active_sched_threads_{0}; int32_t sched_thread_num_{0}; - bool orch_to_sched_{false}; int32_t aicpu_thread_num_{0}; int32_t cores_total_num_{0}; @@ -190,9 +183,6 @@ class SchedulerContext { // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads. bool assign_cores_to_threads(); - // Re-distribute all cores across all threads after orchestration completes. 
- void reassign_cores_for_all_threads(); - // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. void emergency_shutdown(Runtime *runtime); @@ -359,8 +349,6 @@ class SchedulerContext { __attribute__((noinline, cold)) LoopAction handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count); - __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released); - __attribute__((noinline, cold)) LoopAction check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index a3e58c8d6..4b94d0ae0 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -803,8 +803,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP]; int32_t deferred_release_count = 0; - bool cores_released = false; - // PMU runs require single-issue dispatch — overlapping in-flight tasks // pollute per-task PMU counters, so skip the PENDING pre-load phase. 
// Cached at function scope: is_pmu_enabled() is extern "C" and the @@ -916,11 +914,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ if (action == LoopAction::BREAK_LOOP) break; } - if (!cores_released && orch_to_sched_) { - LoopAction action = handle_core_transition(cores_released); - if (action == LoopAction::BREAK_LOOP) break; - } - #if PTO2_PROFILING CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8b28e620e..5edaa438b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -37,7 +37,6 @@ Runtime::Runtime() { memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus)); dev.aicpu_allowed_cpu_count = 0; dev.aicpu_launch_count = 0; - dev.orch_to_sched = false; dev.serial_orch_sched = false; dev.gm_sm_ptr_ = nullptr; dev.slot_states_ptr_ = nullptr; diff --git a/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h index 80da54476..3182eb1c1 100644 --- a/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h +++ b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co * to the L2Swimlane base * @param num_sched_phase_threads Number of sched-phase pools to prime * @param num_orch_phase_threads Number of orch-phase pools to prime - * (typically 1; in orch_to_sched mode = - * num_aicpu_threads) + * (typically 1) */ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads); @@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase( * Must be called once from the orchestrator thread before any * l2_swimlane_aicpu_record_orch_phase() 
calls. * - * @param thread_idx Thread index for the orchestrator (typically num_sched_threads; - * in orch_to_sched mode each scheduler thread sets its own) + * @param thread_idx Thread index for the orchestrator (typically num_sched_threads) */ void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx); diff --git a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index 5ed92cd61..aafffa8ee 100644 --- a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; -// Per-thread orch-phase pool/buffer caches (typically one orch thread; in -// orch_to_sched mode all aicpu threads can write here). +// Per-thread orch-phase pool/buffer caches (one orch thread). 
static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 41a97ace8..91488097a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -114,7 +114,6 @@ struct OrchSoEntry { struct AicpuExecutor { int32_t sched_thread_num_; - bool orch_to_sched_{false}; bool serial_orch_sched_{false}; // ===== Thread management state ===== @@ -208,7 +207,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) { aicpu_thread_num_ = runtime->dev.aicpu_thread_num; if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1; sched_thread_num_ = aicpu_thread_num_ - 1; - orch_to_sched_ = runtime->dev.orch_to_sched; serial_orch_sched_ = runtime->dev.serial_orch_sched; if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) { @@ -217,7 +215,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return -1; } - if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) { + if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) { init_failed_.store(true, std::memory_order_release); return -1; } @@ -723,8 +721,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); } - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + // Scheduler thread (orchestrator thread skips dispatch and exits after orchestration) + if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) { // Device orchestration: wait for the primary orchestrator to initialize the SM 
header while (!runtime_init_ready_.load(std::memory_order_acquire)) { SPIN_WAIT_HINT(); @@ -797,7 +795,6 @@ void AicpuExecutor::deinit(Runtime *runtime) { aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; serial_orch_sched_ = false; orch_args_cached_.reset(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 83ea5c270..a339b178e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -544,11 +544,11 @@ Public surface (called from `AicpuExecutor::init/run/deinit`): | Method | Phase | Purpose | | ------ | ----- | ------- | -| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | +| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` | | `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` | | `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop | | `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores | -| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) | +| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) | | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default | | Read-only 
accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` | @@ -556,7 +556,7 @@ Private internals are split across three .cpp files by responsibility: - `scheduler_completion.cpp` — completion polling, drain protocol - `scheduler_dispatch.cpp` — task dispatch loop and helpers -- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done` +- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done` `AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`. diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md index 8cba7e90c..50f734fee 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp Design must preserve the current main runtime architecture: -1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion. 2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). ## 5. Terminology @@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged. 
### 9.2 Cluster Ownership 1. One cluster must be owned by one scheduler domain/thread at a time. -2. No split-cluster ownership in either: - - initial `assign_cores_to_threads()` - - post-orchestrator `reassign_cores_for_all_threads()` -3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. +2. No split-cluster ownership in `assign_cores_to_threads()`. +3. Lane occupancy bookkeeping must remain consistent with ownership. ## 10. Functional Requirements diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 2ef6c1b6a..62a38766e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) - Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) -- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true) - PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) - Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) - Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) @@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete - `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line -- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true` - `Thread %d: 
sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary - `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread - `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) **LOG_INFO_V9 count (normal run):** -- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) -- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`) +- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) > See the table at the end for concrete counts based on the `paged_attention` example. -**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10): +**Example log output** (from `paged_attention`, device 10): ```text Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us @@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000 Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 ``` -**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11): - -```text -Thread 3: orch_stage_end=48236915058307 -Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us -Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us -PTO2 total submitted tasks = 13, already executed 13 tasks -Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us -Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4 -Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us -Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9 -``` - -> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. 
They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time). - **Note:** - All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. -- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`. --- @@ -390,13 +371,13 @@ definitions to runtime headers. > Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout). -| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description | -| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- | -| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output | -| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary | -| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown | -| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown | -| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats | +| Level | Macro Settings | LOG_INFO_V9 Count | Description | +| ----- | -------------- | ----------------- | ----------- | +| 0 | `PTO2_PROFILING=0` | 0 | No timing output | +| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary | +| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown | +| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats | --- diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc0ddefb8..7952045ce 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -477,10 
+477,6 @@ static bool stage_device_args( // runtime. Behavior-only env reads (no new gates); kept here so the args and // image steps stay free of unrelated state. static void apply_orch_sched_env_flags(Runtime *runtime) { - const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED"); - runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T'); - LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled"); - const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED"); runtime->dev.serial_orch_sched = serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T'); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 6d2bc08a0..22d965231 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -215,7 +215,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // (= orch + schedulers). AicpuExecutor splits this into one orchestrator // thread (highest idx, runs aicpu_orchestration_entry) and the remaining // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore. - // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set. int aicpu_thread_num; int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) @@ -237,12 +236,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // PTO2 integration: kernel_id -> GM function_bin_addr mapping uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; - // Orchestrator-to-scheduler transition control - // When true, orchestrator threads convert to scheduler threads after orchestration completes. - // When false (default), orchestrator threads exit after orchestration without dispatching tasks. - // Controlled via PTO2_ORCH_TO_SCHED environment variable. 
- bool orch_to_sched; - // Serial orchestrator -> scheduler start control. // When true, scheduler threads wait until orchestration has fully built the // task graph before entering resolve_and_dispatch(). diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 517a40c8b..20587aeaf 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -94,21 +94,6 @@ LoopAction SchedulerContext::handle_orchestrator_exit( return LoopAction::NONE; } -LoopAction SchedulerContext::handle_core_transition(bool &cores_released) { - if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE; - if (!reassigned_.load(std::memory_order_acquire)) { - wait_reassign_.fetch_add(1, std::memory_order_release); - while (!reassigned_.load(std::memory_order_acquire)) { - if (completed_.load(std::memory_order_acquire)) { - return LoopAction::BREAK_LOOP; - } - SPIN_WAIT_HINT(); - } - } - cores_released = true; - return LoopAction::NONE; -} - LoopAction SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) { if (completed_.load(std::memory_order_acquire)) { @@ -327,7 +312,7 @@ void SchedulerContext::log_stall_diagnostics( // CLUSTER lines: one per cluster this thread owns. // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the - // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads. + // round-robin assignment in assign_cores_to_threads. int32_t ast = active_sched_threads_ > 0 ? 
active_sched_threads_ : aicpu_thread_num_; for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) { int32_t offset = cli * 3; @@ -851,76 +836,6 @@ bool SchedulerContext::assign_cores_to_threads() { return true; } -// ============================================================================= -// Reassign all cores across all threads (sched + orchestrator) after orchestration. -// ============================================================================= -void SchedulerContext::reassign_cores_for_all_threads() { - LOG_INFO_V0( - "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_ - ); - - // Collect running worker_ids from all current trackers - bool running_cores[RUNTIME_MAX_WORKER] = {}; - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - auto all_running = core_trackers_[i].get_all_running_cores(); - int32_t bp; - while ((bp = all_running.pop_first()) >= 0) { - running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true; - } - } - - // Count clusters per thread (round-robin across all threads) - int32_t cluster_count = aic_count_; - int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - clusters_per_thread[ci % aicpu_thread_num_]++; - } - - // Re-init all trackers and reset core counts - for (int32_t i = 0; i < aicpu_thread_num_; i++) { - core_trackers_[i].init(clusters_per_thread[i]); - } - - // Assign clusters round-robin and restore running state - int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; - for (int32_t ci = 0; ci < cluster_count; ci++) { - int32_t t = ci % aicpu_thread_num_; - - int32_t aic_wid = aic_worker_ids_[ci]; - int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; - int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; - - int32_t cl_idx = cluster_idx_per_thread[t]++; - core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid); - - // init() marks all idle; toggle cores that were 
running and restore pending_occupied - if (running_cores[aic_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3); - core_trackers_[t].set_pending_occupied(cl_idx * 3); - } - if (running_cores[aiv0_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 1); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1); - } - if (running_cores[aiv1_wid]) { - core_trackers_[t].change_core_state(cl_idx * 3 + 2); - core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2); - } - } - - // Log final distribution - LOG_INFO_V0("Core reassignment complete:"); - for (int32_t t = 0; t < aicpu_thread_num_; t++) { - int32_t aic_running = core_trackers_[t].get_running_count(); - int32_t aiv_running = core_trackers_[t].get_running_count(); - LOG_INFO_V0( - " Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(), - core_trackers_[t].get_cluster_count(), aic_running, aiv_running - ); - } - active_sched_threads_ = aicpu_thread_num_; -} - // ============================================================================= // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. @@ -948,9 +863,8 @@ void SchedulerContext::emergency_shutdown(Runtime *runtime) { // ============================================================================= // Lifecycle: init / deinit // ============================================================================= -int32_t SchedulerContext::init( - Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base -) { +int32_t +SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base) { always_assert(runtime != nullptr); // Zero all per-core execution state before handshake @@ -959,7 +873,6 @@ int32_t SchedulerContext::init( // Wire thread/transition configuration that handshake/assign need to read. 
aicpu_thread_num_ = aicpu_thread_num; sched_thread_num_ = sched_thread_num; - orch_to_sched_ = orch_to_sched; regs_ = regs_base; #if PTO2_PROFILING @@ -976,10 +889,6 @@ int32_t SchedulerContext::init( l2_swimlane_aicpu_init(runtime->dev.worker_count); l2_swimlane_level_ = get_l2_swimlane_level(); if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { - // When orchestrator phases merge into scheduler threads - // (PTO2_ORCH_TO_SCHED=1), phase records flow through - // aicpu_thread_num_ pools — matches the same branch in the - // dump_args_init call below. // Sched phase pool count = number of scheduler threads. // This block runs before assign_cores_to_threads, so the // active_sched_threads_ member isn't set yet — recompute the same @@ -988,10 +897,9 @@ int32_t SchedulerContext::init( // assign_cores_to_threads' active_sched_threads_). Without this // normalization here, init_phase would prime zero sched pools // and all sched_phase emits would silently drop. - const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; - const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched; + const int sched_phase_threads = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; // Orch phase is a single instance (PR #971 design), so the orch - // pool count is always 1 regardless of orch_to_sched mode. + // pool count is always 1. const int orch_phase_threads = 1; l2_swimlane_aicpu_init_phase(runtime->dev.worker_count, sched_phase_threads, orch_phase_threads); } @@ -1018,7 +926,7 @@ int32_t SchedulerContext::init( // orchestrator thread (see aicpu_executor.cpp). #if PTO2_PROFILING if (is_dump_args_enabled()) { - dump_args_init(orch_to_sched_ ? 
aicpu_thread_num_ : active_sched_threads_); + dump_args_init(active_sched_threads_); } if (is_pmu_enabled()) { pmu_aicpu_init(physical_core_ids_, cores_total_num_); @@ -1112,11 +1020,6 @@ void SchedulerContext::deinit() { completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; orchestrator_done_.store(false, std::memory_order_release); - - // Reset core transition state - transition_requested_.store(false, std::memory_order_release); - wait_reassign_.store(0, std::memory_order_release); - reassigned_.store(false, std::memory_order_release); completed_.store(false, std::memory_order_release); // Reset core discovery and assignment state @@ -1125,7 +1028,6 @@ void SchedulerContext::deinit() { cores_total_num_ = 0; aicpu_thread_num_ = 0; sched_thread_num_ = 0; - orch_to_sched_ = false; active_sched_threads_ = 0; for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { core_trackers_[t] = CoreTracker{}; @@ -1167,7 +1069,7 @@ void SchedulerContext::wait_for_orchestration_done_before_dispatch(Runtime *runt // and drives the orchestrator → scheduler core transition (or fatal shutdown). // ============================================================================= void SchedulerContext::on_orchestration_done( - Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks + Runtime *runtime, PTO2Runtime *rt, [[maybe_unused]] int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { @@ -1199,32 +1101,9 @@ void SchedulerContext::on_orchestration_done( } } - // Skip core transition on fatal error — cores already shut down above. 
- if (completed_.load(std::memory_order_acquire)) { - // Signal transition to unblock scheduler threads waiting at core transition - transition_requested_.store(true, std::memory_order_release); - reassigned_.store(true, std::memory_order_release); - } else if (orch_to_sched_) { - LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); - transition_requested_.store(true, std::memory_order_release); - - // Wait for scheduler threads to acknowledge transition request - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); - } - } - #if PTO2_PROFILING - // Write core-to-thread mapping AFTER reassignment so the profiling data - // reflects the final distribution (all active_sched_threads_, including - // former orchestrator threads when orch_to_sched_ is enabled). + // Write the core-to-thread mapping so the profiling data reflects the + // scheduler threads' final core distribution. if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { l2_swimlane_aicpu_init_core_assignments(cores_total_num_); for (int32_t t = 0; t < active_sched_threads_; t++) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 5cfc08563..a6c0bdfa7 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -59,8 +59,7 @@ class SchedulerContext { // - Binds func_id_to_addr_ / initial sched_ (if rt is already known) // - Captures AICore-register base (consumed by handshake_all_cores()) // Returns 0 on success, negative on failure (handshake / assignment error). 
- int32_t - init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); + int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, uint64_t regs_base); // Reset all SchedulerContext-owned state to its post-construction defaults. // Called by AicpuExecutor::deinit() during per-run teardown. @@ -150,15 +149,9 @@ class SchedulerContext { std::atomic completed_{false}; uint64_t *func_id_to_addr_{nullptr}; - // --- Core-transition coordination --- - std::atomic transition_requested_{false}; - std::atomic wait_reassign_{0}; - std::atomic reassigned_{false}; - // --- Thread/core configuration --- int32_t active_sched_threads_{0}; int32_t sched_thread_num_{0}; - bool orch_to_sched_{false}; int32_t aicpu_thread_num_{0}; int32_t cores_total_num_{0}; @@ -190,9 +183,6 @@ class SchedulerContext { // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads. bool assign_cores_to_threads(); - // Re-distribute all cores across all threads after orchestration completes. - void reassign_cores_for_all_threads(); - // Emergency shutdown: broadcast exit signal to every handshake'd core and // deinit their AICore register blocks. Idempotent. 
void emergency_shutdown(Runtime *runtime); @@ -323,8 +313,6 @@ class SchedulerContext { __attribute__((noinline, cold)) LoopAction handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count); - __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released); - __attribute__((noinline, cold)) LoopAction check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index af17954ed..399cf70b2 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -520,8 +520,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP]; int32_t deferred_release_count = 0; - bool cores_released = false; - // PMU runs require single-issue dispatch — overlapping in-flight tasks // pollute per-task PMU counters. 
Cached at function scope (parity with // a2a3): is_pmu_enabled() is extern "C" and the compiler cannot hoist it @@ -629,11 +627,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ if (action == LoopAction::BREAK_LOOP) break; } - if (!cores_released && orch_to_sched_) { - LoopAction action = handle_core_transition(cores_released); - if (action == LoopAction::BREAK_LOOP) break; - } - #if PTO2_PROFILING CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8b28e620e..5edaa438b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -37,7 +37,6 @@ Runtime::Runtime() { memset(dev.aicpu_allowed_cpus, 0, sizeof(dev.aicpu_allowed_cpus)); dev.aicpu_allowed_cpu_count = 0; dev.aicpu_launch_count = 0; - dev.orch_to_sched = false; dev.serial_orch_sched = false; dev.gm_sm_ptr_ = nullptr; dev.slot_states_ptr_ = nullptr;