hw-native-sys · ChaoWao · Jul 1, 2026 · Jul 1, 2026
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -71,46 +71,34 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 **What's compiled:**
 
-- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
-- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
-- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
-- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
-- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
+- Base timing counters for the scheduler loop (`sched_complete/dispatch/idle/scan`)
+- Host-side phase windows: each sched/orch thread publishes its
+  start/end window via `aicpu_phase_set_window`, which the host reduces
+  into the `Orch` / `Sched` `[STRACE]` markers
 
 **What's NOT compiled:**
 
+- Per-thread scheduler/orchestrator device-log lines (moved to Level 2 / Level 3)
 - Detailed phase breakdowns
 - TensorMap statistics
 
 **Log output (additional lines vs Level 0, per normal run):**
 
-- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
-- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
-- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
-- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
-- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
+- None on the device side. The per-thread `orch_start/orch_end/orch_cost`,
+  `sched_start/sched_end/sched_cost`, and `Scheduler summary` lines are NOT
+  emitted at this level — `orch_*` is gated by `PTO2_ORCH_PROFILING` (Level 3),
+  `sched_*` and `Scheduler summary` by `PTO2_SCHED_PROFILING` (Level 2).
+  Level 1 only feeds the host-side `Orch` / `Sched` `[STRACE]` timeline.
 
 **LOG_INFO_V9 count (normal run):**
 
-- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
-
-> See the table at the end for concrete counts based on the `paged_attention` example.
-
-**Example log output** (from `paged_attention`, device 10):
-
-```text
-Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
-Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us
-PTO2 total submitted tasks = 13, already executed 13 tasks
-Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us
-Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6
-Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us
-Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
-```
+- `0` (device-side profiling logs). The timeline is delivered host-side via the
+  phase windows, not through per-thread device logs.
 
 **Note:**
 
-- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
+- The host-side `[STRACE]` phase windows are controlled by compile-time macro
+  `PTO2_PROFILING`, not by `enable_l2_swimlane`.
 - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
 
 ---
@@ -126,13 +114,15 @@ Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
 - Phase-specific statistics (complete, scan, dispatch, idle)
 - Hit rate tracking (complete poll, ready queue pop)
 
-**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced)
-
-- Replaces scheduler summary with detailed breakdown
+**Log output (per scheduler thread, normal run):** the `sched_start/sched_end/
+sched_cost` line, the full phase breakdown, and the `Scheduler summary` line
+(all gated by `PTO2_SCHED_PROFILING`). The `Scheduler summary` line first
+appears at this level — it is not emitted at Level 1.
 
 **Scheduler output:**
 
 ```text
+Thread X: sched_start=XXX sched_end=XXX sched_cost=XXXus
 Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks ===
 Thread X:   complete       : XXXus (XX.X%)
 Thread X:     poll         : XXXus (XX.X%)  hit=XXX, miss=XXX, hit_rate=XX.X%
@@ -170,9 +160,11 @@ captured at l2_swimlane_level >= 3) and `deps.json`; consume them via
 - Atomic operation counters
 - Wait time tracking
 
-**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced)
-
-- Replaces basic orchestration completion with detailed breakdown
+**Log output (per orchestrator thread, normal run):** the orchestrator phase
+breakdown, followed by the `orch_start/orch_end/orch_cost` line and the
+`PTO2 total submitted tasks` line — all gated by `PTO2_ORCH_PROFILING`. This
+level adds orchestrator-side logs only; the scheduler side is unchanged from
+Level 1 (add `PTO2_SCHED_PROFILING` / Level 2 for scheduler detail).
 
 **Orchestrator output:**
 
@@ -188,6 +180,8 @@ Thread X:   fanin+ready    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
 Thread X:   finalize+SM    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
 Thread X:   scope_end      : XXXus  atomics=XXX
 Thread X:   avg/task       : XXXus
+Thread X: orch_start=XXX orch_end=XXX orch_cost=XXXus
+PTO2 total submitted tasks = XXX, already executed XXX tasks
 ```
 
 **Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag.
@@ -205,7 +199,9 @@ Thread X:   avg/task       : XXXus
 - Hash chain walk tracking
 - Overlap check counters
 
-**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap)
+**Log output (per orchestrator thread, normal run):** all Level 3 orchestrator
+output plus the 4-line TensorMap lookup stats block below (gated by
+`PTO2_TENSORMAP_PROFILING`, nested inside `PTO2_ORCH_PROFILING`).
 
 **TensorMap output:**
 
@@ -404,10 +400,10 @@ definitions to runtime headers.
 | Level | Macro Settings | LOG_INFO_V9 Count | Description |
 | ----- | -------------- | ----------------- | ----------- |
 | 0 | `PTO2_PROFILING=0` | 0 | No timing output |
-| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary |
-| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown |
-| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown |
-| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats |
+| 1 | `PTO2_PROFILING=1` | 0 | Host-side `Orch`/`Sched` `[STRACE]` windows only; no device logs |
+| 2 | `+PTO2_SCHED_PROFILING=1` | per sched thread | `sched_start` + phase breakdown + `Scheduler summary` |
+| 3 | `+PTO2_ORCH_PROFILING=1` | per orch thread | Orchestrator phase breakdown + `orch_start` + `PTO2 total` |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | per orch thread | + TensorMap lookup stats (4 lines) |
 
 ---
 

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -491,7 +491,7 @@ int32_t SchedulerContext::handle_timeout_exit(
 }
 
 #if PTO2_PROFILING
-void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, [[maybe_unused]] int32_t cur_thread_completed) {
     auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
     uint64_t sched_end_ts = get_sys_cnt_aicpu();
     // Ride the sched window home to the host phase buffer (the host reduces
@@ -506,13 +506,11 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t
         static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
         cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
     );
-#endif
 
     uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
                            l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
     if (sched_total == 0) sched_total = 1;
 
-#if PTO2_SCHED_PROFILING
     {
         PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
         uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
@@ -627,11 +625,11 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t
             );
         }
     }
-#endif
     LOG_INFO_V9(
         "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
         cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
     );
+#endif
 }
 #endif
 

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -71,46 +71,34 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 **What's compiled:**
 
-- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
-- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
-- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
-- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
-- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
+- Base timing counters for the scheduler loop (`sched_complete/dispatch/idle/scan`)
+- Host-side phase windows: each sched/orch thread publishes its
+  start/end window via `aicpu_phase_set_window`, which the host reduces
+  into the `Orch` / `Sched` `[STRACE]` markers
 
 **What's NOT compiled:**
 
+- Per-thread scheduler/orchestrator device-log lines (moved to Level 2 / Level 3)
 - Detailed phase breakdowns
 - TensorMap statistics
 
 **Log output (additional lines vs Level 0, per normal run):**
 
-- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
-- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
-- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
-- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
-- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)
+- None on the device side. The per-thread `orch_start/orch_end/orch_cost`,
+  `sched_start/sched_end/sched_cost`, and `Scheduler summary` lines are NOT
+  emitted at this level — `orch_*` is gated by `PTO2_ORCH_PROFILING` (Level 3),
+  `sched_*` and `Scheduler summary` by `PTO2_SCHED_PROFILING` (Level 2).
+  Level 1 only feeds the host-side `Orch` / `Sched` `[STRACE]` timeline.
 
 **LOG_INFO_V9 count (normal run):**
 
-- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
-
-> See the table at the end for concrete counts based on the `paged_attention` example.
-
-**Example log output** (from `paged_attention`, device 10):
-
-```text
-Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
-Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us
-PTO2 total submitted tasks = 13, already executed 13 tasks
-Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us
-Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6
-Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us
-Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
-```
+- `0` (device-side profiling logs). The timeline is delivered host-side via the
+  phase windows, not through per-thread device logs.
 
 **Note:**
 
-- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
+- The host-side `[STRACE]` phase windows are controlled by compile-time macro
+  `PTO2_PROFILING`, not by `enable_l2_swimlane`.
 - `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
 
 ---
@@ -126,13 +114,15 @@ Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
 - Phase-specific statistics (complete, scan, dispatch, idle)
 - Hit rate tracking (complete poll, ready queue pop)
 
-**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced)
-
-- Replaces scheduler summary with detailed breakdown
+**Log output (per scheduler thread, normal run):** the `sched_start/sched_end/
+sched_cost` line, the full phase breakdown, and the `Scheduler summary` line
+(all gated by `PTO2_SCHED_PROFILING`). The `Scheduler summary` line first
+appears at this level — it is not emitted at Level 1.
 
 **Scheduler output:**
 
 ```text
+Thread X: sched_start=XXX sched_end=XXX sched_cost=XXXus
 Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks ===
 Thread X:   complete       : XXXus (XX.X%)
 Thread X:     poll         : XXXus (XX.X%)  hit=XXX, miss=XXX, hit_rate=XX.X%
@@ -170,9 +160,11 @@ captured at l2_swimlane_level >= 3) and `deps.json`; consume them via
 - Atomic operation counters
 - Wait time tracking
 
-**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced)
-
-- Replaces basic orchestration completion with detailed breakdown
+**Log output (per orchestrator thread, normal run):** the orchestrator phase
+breakdown, followed by the `orch_start/orch_end/orch_cost` line and the
+`PTO2 total submitted tasks` line — all gated by `PTO2_ORCH_PROFILING`. This
+level adds orchestrator-side logs only; the scheduler side is unchanged from
+Level 1 (add `PTO2_SCHED_PROFILING` / Level 2 for scheduler detail).
 
 **Orchestrator output:**
 
@@ -188,6 +180,8 @@ Thread X:   fanin+ready    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
 Thread X:   finalize+SM    : XXXus (XX.X%)  work=XXXus wait=XXXus  atomics=XXX
 Thread X:   scope_end      : XXXus  atomics=XXX
 Thread X:   avg/task       : XXXus
+Thread X: orch_start=XXX orch_end=XXX orch_cost=XXXus
+PTO2 total submitted tasks = XXX, already executed XXX tasks
 ```
 
 **Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag.
@@ -205,7 +199,9 @@ Thread X:   avg/task       : XXXus
 - Hash chain walk tracking
 - Overlap check counters
 
-**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap)
+**Log output (per orchestrator thread, normal run):** all Level 3 orchestrator
+output plus the 4-line TensorMap lookup stats block below (gated by
+`PTO2_TENSORMAP_PROFILING`, nested inside `PTO2_ORCH_PROFILING`).
 
 **TensorMap output:**
 
@@ -374,10 +370,10 @@ definitions to runtime headers.
 | Level | Macro Settings | LOG_INFO_V9 Count | Description |
 | ----- | -------------- | ----------------- | ----------- |
 | 0 | `PTO2_PROFILING=0` | 0 | No timing output |
-| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary |
-| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown |
-| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown |
-| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats |
+| 1 | `PTO2_PROFILING=1` | 0 | Host-side `Orch`/`Sched` `[STRACE]` windows only; no device logs |
+| 2 | `+PTO2_SCHED_PROFILING=1` | per sched thread | `sched_start` + phase breakdown + `Scheduler summary` |
+| 3 | `+PTO2_ORCH_PROFILING=1` | per orch thread | Orchestrator phase breakdown + `orch_start` + `PTO2 total` |
+| 4 | `+PTO2_TENSORMAP_PROFILING=1` | per orch thread | + TensorMap lookup stats (4 lines) |
 
 ---
 

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -488,7 +488,7 @@ int32_t SchedulerContext::handle_timeout_exit(
 }
 
 #if PTO2_PROFILING
-void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, [[maybe_unused]] int32_t cur_thread_completed) {
     auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
     uint64_t sched_end_ts = get_sys_cnt_aicpu();
     // Ride the sched window home to the host phase buffer (the host reduces
@@ -503,14 +503,12 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t
         static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
         cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
     );
-#endif
 
     uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
                            l2_swimlane.sched_scan_cycle + l2_swimlane.sched_dispatch_cycle +
                            l2_swimlane.sched_idle_cycle;
     if (sched_total == 0) sched_total = 1;
 
-#if PTO2_SCHED_PROFILING
     {
         PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
         uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
@@ -630,11 +628,11 @@ void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_t
             );
         }
     }
-#endif
     LOG_INFO_V9(
         "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
         cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
     );
+#endif
 }
 #endif