From 5fe1b0cb0bc95d761d5f71fd77c572df814288fc Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 1 Jul 2026 09:41:42 +0800 Subject: [PATCH] Refactor: gate Scheduler summary log behind PTO2_SCHED_PROFILING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-thread "Scheduler summary" device log was emitted under plain PTO2_PROFILING=1 (Level 1), while every other per-thread profiling line (orch_* / sched_* timing) is gated by its Level 2/3 sub-macro. That made Level 1 the only level with a per-thread device log and left the profiling_levels.md counts inconsistent. Move the `Scheduler summary` line (and the `sched_total` computation that only feeds it) into the `#if PTO2_SCHED_PROFILING` block in log_l2_swimlane_summary() for both a2a3 and a5. Level 1 now produces no per-thread device logs — it only feeds the host-side Orch/Sched [STRACE] windows via aicpu_phase_set_window. `cur_thread_completed` is now used only inside the SCHED block, so it is marked [[maybe_unused]] to keep the Level 1 (PTO2_SCHED_PROFILING=0) build warning-clean under -Werror=unused-parameter. Rewrite profiling_levels.md (both arches) accordingly: Level 1 documents zero device logs; Level 2 gains the Scheduler summary (no longer a "replaced" line); the stale per-level LOG_INFO_V9 count formulas (`N_sched*2 + N_orch*1 + 1`, "11 debug + 2 basic + ...", table count 7) are replaced with per-thread descriptions matching the actual macro gating. Builds clean on a2a3 and a5 across all profiling-flag combos (pto2-off / sched / orch / orch-sched / all-on); tensormap sim tests pass on a2a3sim and a5sim. --- .../docs/profiling_levels.md | 70 +++++++++---------- .../runtime/scheduler/scheduler_cold_path.cpp | 6 +- .../docs/profiling_levels.md | 70 +++++++++---------- .../runtime/scheduler/scheduler_cold_path.cpp | 6 +- 4 files changed, 70 insertions(+), 82 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 4467cd7b2..b49025e11 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -71,46 +71,34 @@ Each sub-level macro requires `PTO2_PROFILING=1`: **What's compiled:** -- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) -- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) -- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) -- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) -- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) +- Base timing counters for the scheduler loop (`sched_complete/dispatch/idle/scan`) +- Host-side phase windows: each sched/orch thread publishes its + start/end window via `aicpu_phase_set_window`, which the host reduces + into the `Orch` / `Sched` `[STRACE]` markers **What's NOT compiled:** +- Per-thread scheduler/orchestrator device-log lines (moved to Level 2 / Level 3) - Detailed phase breakdowns - TensorMap statistics **Log output (additional lines vs Level 0, per normal run):** -- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete -- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line -- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary -- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread -- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) +- None on the device side. The per-thread `orch_start/orch_end/orch_cost`, + `sched_start/sched_end/sched_cost`, and `Scheduler summary` lines are NOT + emitted at this level — `orch_*` is gated by `PTO2_ORCH_PROFILING` (Level 3), + `sched_*` and `Scheduler summary` by `PTO2_SCHED_PROFILING` (Level 2). + Level 1 only feeds the host-side `Orch` / `Sched` `[STRACE]` timeline. **LOG_INFO_V9 count (normal run):** -- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) - -> See the table at the end for concrete counts based on the `paged_attention` example. - -**Example log output** (from `paged_attention`, device 10): - -```text -Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us -Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us -PTO2 total submitted tasks = 13, already executed 13 tasks -Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us -Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6 -Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us -Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 -``` +- `0` (device-side profiling logs). The timeline is delivered host-side via the + phase windows, not through per-thread device logs. **Note:** -- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. +- The host-side `[STRACE]` phase windows are controlled by compile-time macro + `PTO2_PROFILING`, not by `enable_l2_swimlane`. - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. --- @@ -126,13 +114,15 @@ Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 - Phase-specific statistics (complete, scan, dispatch, idle) - Hit rate tracking (complete poll, ready queue pop) -**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced) - -- Replaces scheduler summary with detailed breakdown +**Log output (per scheduler thread, normal run):** the `sched_start/sched_end/ +sched_cost` line, the full phase breakdown, and the `Scheduler summary` line +(all gated by `PTO2_SCHED_PROFILING`). The `Scheduler summary` line first +appears at this level — it is not emitted at Level 1. **Scheduler output:** ```text +Thread X: sched_start=XXX sched_end=XXX sched_cost=XXXus Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks === Thread X: complete : XXXus (XX.X%) Thread X: poll : XXXus (XX.X%) hit=XXX, miss=XXX, hit_rate=XX.X% @@ -170,9 +160,11 @@ captured at l2_swimlane_level >= 3) and `deps.json`; consume them via - Atomic operation counters - Wait time tracking -**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced) - -- Replaces basic orchestration completion with detailed breakdown +**Log output (per orchestrator thread, normal run):** the orchestrator phase +breakdown, followed by the `orch_start/orch_end/orch_cost` line and the +`PTO2 total submitted tasks` line — all gated by `PTO2_ORCH_PROFILING`. This +level adds orchestrator-side logs only; the scheduler side is unchanged from +Level 1 (add `PTO2_SCHED_PROFILING` / Level 2 for scheduler detail). **Orchestrator output:** @@ -188,6 +180,8 @@ Thread X: fanin+ready : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX Thread X: finalize+SM : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX Thread X: scope_end : XXXus atomics=XXX Thread X: avg/task : XXXus +Thread X: orch_start=XXX orch_end=XXX orch_cost=XXXus +PTO2 total submitted tasks = XXX, already executed XXX tasks ``` **Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag. @@ -205,7 +199,9 @@ Thread X: avg/task : XXXus - Hash chain walk tracking - Overlap check counters -**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap) +**Log output (per orchestrator thread, normal run):** all Level 3 orchestrator +output plus the 4-line TensorMap lookup stats block below (gated by +`PTO2_TENSORMAP_PROFILING`, nested inside `PTO2_ORCH_PROFILING`). **TensorMap output:** @@ -404,10 +400,10 @@ definitions to runtime headers. | Level | Macro Settings | LOG_INFO_V9 Count | Description | | ----- | -------------- | ----------------- | ----------- | | 0 | `PTO2_PROFILING=0` | 0 | No timing output | -| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary | -| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown | -| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown | -| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats | +| 1 | `PTO2_PROFILING=1` | 0 | Host-side `Orch`/`Sched` `[STRACE]` windows only; no device logs | +| 2 | `+PTO2_SCHED_PROFILING=1` | per sched thread | `sched_start` + phase breakdown + `Scheduler summary` | +| 3 | `+PTO2_ORCH_PROFILING=1` | per orch thread | Orchestrator phase breakdown + `orch_start` + `PTO2 total` | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | per orch thread | + TensorMap lookup stats (4 lines) | --- diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 5bac8297a..8e1813367 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -491,7 +491,7 @@ int32_t SchedulerContext::handle_timeout_exit( } #if PTO2_PROFILING -void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) { +void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, [[maybe_unused]] int32_t cur_thread_completed) { auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; uint64_t sched_end_ts = get_sys_cnt_aicpu(); // Ride the sched window home to the host phase buffer (the host reduces @@ -506,13 +506,11 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t static_cast(l2_swimlane.sched_start_ts), static_cast(sched_end_ts), cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts) ); -#endif uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle + l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle; if (sched_total == 0) sched_total = 1; -#if PTO2_SCHED_PROFILING { PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx); uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; @@ -627,11 +625,11 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t ); } } -#endif LOG_INFO_V9( "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, cycles_to_us(sched_total), static_cast(l2_swimlane.sched_loop_count), cur_thread_completed ); +#endif } #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 62a38766e..ea95b89d1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -71,46 +71,34 @@ Each sub-level macro requires `PTO2_PROFILING=1`: **What's compiled:** -- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) -- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) -- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) -- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) -- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) +- Base timing counters for the scheduler loop (`sched_complete/dispatch/idle/scan`) +- Host-side phase windows: each sched/orch thread publishes its + start/end window via `aicpu_phase_set_window`, which the host reduces + into the `Orch` / `Sched` `[STRACE]` markers **What's NOT compiled:** +- Per-thread scheduler/orchestrator device-log lines (moved to Level 2 / Level 3) - Detailed phase breakdowns - TensorMap statistics **Log output (additional lines vs Level 0, per normal run):** -- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete -- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line -- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary -- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread -- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) +- None on the device side. The per-thread `orch_start/orch_end/orch_cost`, + `sched_start/sched_end/sched_cost`, and `Scheduler summary` lines are NOT + emitted at this level — `orch_*` is gated by `PTO2_ORCH_PROFILING` (Level 3), + `sched_*` and `Scheduler summary` by `PTO2_SCHED_PROFILING` (Level 2). + Level 1 only feeds the host-side `Orch` / `Sched` `[STRACE]` timeline. **LOG_INFO_V9 count (normal run):** -- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) - -> See the table at the end for concrete counts based on the `paged_attention` example. - -**Example log output** (from `paged_attention`, device 10): - -```text -Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us -Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us -PTO2 total submitted tasks = 13, already executed 13 tasks -Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us -Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6 -Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us -Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 -``` +- `0` (device-side profiling logs). The timeline is delivered host-side via the + phase windows, not through per-thread device logs. **Note:** -- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. +- The host-side `[STRACE]` phase windows are controlled by compile-time macro + `PTO2_PROFILING`, not by `enable_l2_swimlane`. - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. --- @@ -126,13 +114,15 @@ Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 - Phase-specific statistics (complete, scan, dispatch, idle) - Hit rate tracking (complete poll, ready queue pop) -**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced) - -- Replaces scheduler summary with detailed breakdown +**Log output (per scheduler thread, normal run):** the `sched_start/sched_end/ +sched_cost` line, the full phase breakdown, and the `Scheduler summary` line +(all gated by `PTO2_SCHED_PROFILING`). The `Scheduler summary` line first +appears at this level — it is not emitted at Level 1. **Scheduler output:** ```text +Thread X: sched_start=XXX sched_end=XXX sched_cost=XXXus Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks === Thread X: complete : XXXus (XX.X%) Thread X: poll : XXXus (XX.X%) hit=XXX, miss=XXX, hit_rate=XX.X% @@ -170,9 +160,11 @@ captured at l2_swimlane_level >= 3) and `deps.json`; consume them via - Atomic operation counters - Wait time tracking -**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced) - -- Replaces basic orchestration completion with detailed breakdown +**Log output (per orchestrator thread, normal run):** the orchestrator phase +breakdown, followed by the `orch_start/orch_end/orch_cost` line and the +`PTO2 total submitted tasks` line — all gated by `PTO2_ORCH_PROFILING`. This +level adds orchestrator-side logs only; the scheduler side is unchanged from +Level 1 (add `PTO2_SCHED_PROFILING` / Level 2 for scheduler detail). **Orchestrator output:** @@ -188,6 +180,8 @@ Thread X: fanin+ready : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX Thread X: finalize+SM : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX Thread X: scope_end : XXXus atomics=XXX Thread X: avg/task : XXXus +Thread X: orch_start=XXX orch_end=XXX orch_cost=XXXus +PTO2 total submitted tasks = XXX, already executed XXX tasks ``` **Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag. @@ -205,7 +199,9 @@ Thread X: avg/task : XXXus - Hash chain walk tracking - Overlap check counters -**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap) +**Log output (per orchestrator thread, normal run):** all Level 3 orchestrator +output plus the 4-line TensorMap lookup stats block below (gated by +`PTO2_TENSORMAP_PROFILING`, nested inside `PTO2_ORCH_PROFILING`). **TensorMap output:** @@ -374,10 +370,10 @@ definitions to runtime headers. | Level | Macro Settings | LOG_INFO_V9 Count | Description | | ----- | -------------- | ----------------- | ----------- | | 0 | `PTO2_PROFILING=0` | 0 | No timing output | -| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary | -| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown | -| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown | -| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats | +| 1 | `PTO2_PROFILING=1` | 0 | Host-side `Orch`/`Sched` `[STRACE]` windows only; no device logs | +| 2 | `+PTO2_SCHED_PROFILING=1` | per sched thread | `sched_start` + phase breakdown + `Scheduler summary` | +| 3 | `+PTO2_ORCH_PROFILING=1` | per orch thread | Orchestrator phase breakdown + `orch_start` + `PTO2 total` | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | per orch thread | + TensorMap lookup stats (4 lines) | --- diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 20587aeaf..337cc5df4 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -488,7 +488,7 @@ int32_t SchedulerContext::handle_timeout_exit( } #if PTO2_PROFILING -void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) { +void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, [[maybe_unused]] int32_t cur_thread_completed) { auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; uint64_t sched_end_ts = get_sys_cnt_aicpu(); // Ride the sched window home to the host phase buffer (the host reduces @@ -503,14 +503,12 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t static_cast(l2_swimlane.sched_start_ts), static_cast(sched_end_ts), cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts) ); -#endif uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle + l2_swimlane.sched_scan_cycle + l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle; if (sched_total == 0) sched_total = 1; -#if PTO2_SCHED_PROFILING { PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx); uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; @@ -630,11 +628,11 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t ); } } -#endif LOG_INFO_V9( "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, cycles_to_us(sched_total), static_cast(l2_swimlane.sched_loop_count), cur_thread_completed ); +#endif } #endif