Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/dynamic-linking.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,11 @@ SchedulerContext owns its own teardown:
- `SchedulerContext::deinit()` resets every scheduler-owned field —
per-core states, payloads, sync-start drain coordination
(`sync_start_pending` / `drain_worker_elected` / `drain_ack_mask` /
`pending_task`), task counters, transition flags, worker-id lists,
`pending_task`), task counters, worker-id lists,
core trackers, `cores_total_num_` / `aic_count_` / `aiv_count_`,
`regs_`, `sched_`, `func_id_to_addr_`, and the `pto2_init_*` flags.
- `AicpuExecutor::deinit()` calls `sched_ctx_.deinit()` first, then resets
only its own fields: `thread_num_`, `sched_thread_num_`, `orch_to_sched_`,
only its own fields: `thread_num_`, `sched_thread_num_`,
`orch_func_`, `orch_args_cached_`, `orch_so_handle_`, `orch_so_path_`,
`runtime_init_ready_`, and the lifecycle atomics
(`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`,
Expand Down
6 changes: 2 additions & 4 deletions src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,7 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co
* to the L2Swimlane base
* @param num_sched_phase_threads Number of sched-phase pools to prime
* @param num_orch_phase_threads Number of orch-phase pools to prime
* (typically 1; in orch_to_sched mode =
* num_aicpu_threads)
* (typically 1)
*/
void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, int num_orch_phase_threads);

Expand Down Expand Up @@ -200,8 +199,7 @@ void l2_swimlane_aicpu_record_sched_phase(
* Must be called once from the orchestrator thread before any
* l2_swimlane_aicpu_record_orch_phase() calls.
*
* @param thread_idx Thread index for the orchestrator (typically num_sched_threads;
* in orch_to_sched mode each scheduler thread sets its own)
* @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
*/
void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORE
static L2SwimlaneAicpuSchedPhasePool *s_sched_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
static L2SwimlaneAicpuSchedPhaseBuffer *s_current_sched_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};

// Per-thread orch-phase pool/buffer caches (typically one orch thread; in
// orch_to_sched mode all aicpu threads can write here).
// Per-thread orch-phase pool/buffer caches (one orch thread).
static L2SwimlaneAicpuOrchPhasePool *s_orch_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
static L2SwimlaneAicpuOrchPhaseBuffer *s_current_orch_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ struct OrchSoEntry {

struct AicpuExecutor {
int32_t sched_thread_num_;
bool orch_to_sched_{false};
bool serial_orch_sched_{false};

// ===== Thread management state =====
Expand Down Expand Up @@ -206,7 +205,6 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
aicpu_thread_num_ = runtime->dev.aicpu_thread_num;
if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
sched_thread_num_ = aicpu_thread_num_ - 1;
orch_to_sched_ = runtime->dev.orch_to_sched;
serial_orch_sched_ = runtime->dev.serial_orch_sched;

if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
Expand All @@ -215,7 +213,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
return -1;
}

if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, get_platform_regs()) != 0) {
init_failed_.store(true, std::memory_order_release);
return -1;
}
Expand Down Expand Up @@ -728,8 +726,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
}

// Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
// Scheduler thread (orchestrator thread skips dispatch and exits after orchestration)
if (!sched_ctx_.is_completed() && thread_idx < sched_thread_num_) {
// Device orchestration: wait for the primary orchestrator to initialize the SM header
while (!runtime_init_ready_.load(std::memory_order_acquire)) {
SPIN_WAIT_HINT();
Expand Down Expand Up @@ -802,7 +800,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {

aicpu_thread_num_ = 0;
sched_thread_num_ = 0;
orch_to_sched_ = false;
serial_orch_sched_ = false;

orch_args_cached_.reset();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -544,19 +544,19 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):

| Method | Phase | Purpose |
| ------ | ----- | ------- |
| `init(runtime, aicpu_thread_num, sched_thread_num, orch_to_sched, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
| `init(runtime, aicpu_thread_num, sched_thread_num, regs_base)` | once per run | Handshake + assign cores, reset counters, latch `regs_base`, bind `func_id_to_addr_` |
| `bind_runtime(rt)` | device-orch only | Wire `sched_` to `rt->scheduler` once the orchestrator thread creates `rt` |
| `resolve_and_dispatch(runtime, thread_idx)` | per scheduler thread | Main dispatch loop |
| `shutdown(thread_idx)` | per thread on exit | `platform_deinit_aicore_regs` for this thread's cores; PMU finalize when enabled |
| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_`, drive orch→sched core transition (or `emergency_shutdown` on fatal) |
| `on_orchestration_done(runtime, rt, thread_idx, total_tasks)` | orchestrator thread | Publish core assignments, latch task count, fold inline-completed tasks, flip `orchestrator_done_` (or `emergency_shutdown` on fatal) |
| `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
| Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` |

Private internals are split across three .cpp files by responsibility:

- `scheduler_completion.cpp` — completion polling, drain protocol
- `scheduler_dispatch.cpp` — task dispatch loop and helpers
- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `emergency_shutdown`), and `on_orchestration_done`

`AicpuExecutor` calls none of `handshake_*`, `assign_*`, or `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-disp

Design must preserve the current main runtime architecture:

1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`).
1. Executor threading split (orchestrator thread vs scheduler threads); the orchestrator thread exits after the task graph is built while scheduler threads dispatch to completion.
2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold).

## 5. Terminology
Expand Down Expand Up @@ -146,10 +146,8 @@ This project-defined flattened numbering is kept unchanged.
### 9.2 Cluster Ownership

1. One cluster must be owned by one scheduler domain/thread at a time.
2. No split-cluster ownership in either:
- initial `assign_cores_to_threads()`
- post-orchestrator `reassign_cores_for_all_threads()`
3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment.
2. No split-cluster ownership in `assign_cores_to_threads()`.
3. Lane occupancy bookkeeping must remain consistent with ownership.

## 10. Functional Requirements

Expand Down
37 changes: 9 additions & 28 deletions src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ Each sub-level macro requires `PTO2_PROFILING=1`:

- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`)
- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true)
- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line)
- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)
- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary)
Expand All @@ -87,19 +86,17 @@ Each sub-level macro requires `PTO2_PROFILING=1`:

- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete
- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line
- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true`
- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary
- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`)

**LOG_INFO_V9 count (normal run):**

- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)
- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`)
- `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary)

> See the table at the end for concrete counts based on the `paged_attention` example.

**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10):
**Example log output** (from `paged_attention`, device 10):
Comment thread
coderabbitai[bot] marked this conversation as resolved.

```text
Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us
Expand All @@ -111,26 +108,10 @@ Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000
Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7
```

**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11):

```text
Thread 3: orch_stage_end=48236915058307
Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us
Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us
PTO2 total submitted tasks = 13, already executed 13 tasks
Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us
Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4
Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us
Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9
```

> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time).

**Note:**

- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`.
- `enable_l2_swimlane` only controls shared-memory data collection / swimlane export.
- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`.

---

Expand Down Expand Up @@ -420,13 +401,13 @@ definitions to runtime headers.

> Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout).

| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description |
| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- |
| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output |
| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary |
| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown |
| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown |
| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats |
| Level | Macro Settings | LOG_INFO_V9 Count | Description |
| ----- | -------------- | ----------------- | ----------- |
| 0 | `PTO2_PROFILING=0` | 0 | No timing output |
| 1 | `PTO2_PROFILING=1` | 7 | Timing timestamps + scheduler summary |
| 2 | `+PTO2_SCHED_PROFILING=1` | — | Scheduler detailed phase breakdown |
| 3 | `+PTO2_ORCH_PROFILING=1` | — | Orchestrator detailed phase breakdown |
| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | TensorMap lookup stats |

---

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -477,10 +477,6 @@ static bool stage_device_args(
// runtime. Behavior-only env reads (no new gates); kept here so the args and
// image steps stay free of unrelated state.
static void apply_orch_sched_env_flags(Runtime *runtime) {
const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED");
runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T');
LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");

const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED");
runtime->dev.serial_orch_sched =
serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T');
Expand Down
7 changes: 0 additions & 7 deletions src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
// (= orch + schedulers). AicpuExecutor splits this into one orchestrator
// thread (highest idx, runs aicpu_orchestration_entry) and the remaining
// aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
// The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
int aicpu_thread_num;
int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)

Expand All @@ -223,12 +222,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
// PTO2 integration: kernel_id -> GM function_bin_addr mapping
uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];

// Orchestrator-to-scheduler transition control
// When true, orchestrator threads convert to scheduler threads after orchestration completes.
// When false (default), orchestrator threads exit after orchestration without dispatching tasks.
// Controlled via PTO2_ORCH_TO_SCHED environment variable.
bool orch_to_sched;

// Serial orchestrator -> scheduler start control.
// When true, scheduler threads wait until orchestration has fully built the
// task graph before entering resolve_and_dispatch().
Expand Down
Loading
Loading