Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/a2a3/platform/include/common/kernel_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,10 @@ static_assert(offsetof(KernelArgs, regs) == 8, "KernelArgs::regs offset drift");
* register tables consumed on the per-run AICore path and stay in KernelArgs.
*/
struct InitArgs {
    // Each field is declared exactly once. simpler_aicpu_init forwards every
    // field to its matching setter (set_orch_device_id, set_log_level,
    // set_log_info_v, set_scheduler_timeout_ms).
    uint32_t device_id{0};            // ACL device ordinal -> set_orch_device_id
    uint32_t log_level{1};            // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
    uint32_t log_info_v{5};           // INFO verbosity threshold (0..9); default V5
    int32_t scheduler_timeout_ms{0};  // AICPU no-progress watchdog (ms); 0 -> compile default
};

/**
Expand Down
2 changes: 2 additions & 0 deletions src/a2a3/platform/onboard/aicpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common/unified_log.h"
#include "common/kernel_args.h"
#include "common/platform_config.h"
#include "aicpu/aicpu_device_config.h"
#include "aicpu/dep_gen_collector_aicpu.h"
#include "aicpu/device_log.h"
#include "aicpu/device_phase_aicpu.h"
Expand Down Expand Up @@ -148,6 +149,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a
set_log_level(static_cast<int>(init_args->log_level));
set_log_info_v(static_cast<int>(init_args->log_info_v));
set_orch_device_id(static_cast<int>(init_args->device_id));
set_scheduler_timeout_ms(static_cast<int>(init_args->scheduler_timeout_ms));

LOG_INFO_V0("%s", "simpler_aicpu_init: per-device invariants latched");
return 0;
Expand Down
14 changes: 14 additions & 0 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "common/unified_log.h"
#include "cpu_sim_context.h"
#include "host/raii_scope_guard.h"
#include "host/runtime_timeout_config.h"
#include "runtime.h"

// dep_gen_replay_emit_deps_json: strong symbol provided by
Expand Down Expand Up @@ -97,6 +98,19 @@ int DeviceRunner::ensure_binaries_loaded() {
load_optional_sym("simpler_aicpu_register_callable", reinterpret_cast<void **>(&aicpu_register_callable_func_));
if (!load_sym("set_platform_regs", reinterpret_cast<void **>(&set_platform_regs_func_))) return -1;
load_optional_sym("set_orch_device_id", reinterpret_cast<void **>(&set_orch_device_id_func_));
load_optional_sym("set_scheduler_timeout_ms", reinterpret_cast<void **>(&set_scheduler_timeout_ms_func_));
if (set_scheduler_timeout_ms_func_ != nullptr) {
// Per-device one-shot latch (mirrors the onboard InitArgs path):
// honor PTO2_SCHEDULER_TIMEOUT_MS once at SO load, not per run. 0 ->
// the scheduler keeps its compile-time default. Sim skips the
// op/stream ordering check (validate_runtime_timeout_order is onboard).
RuntimeTimeoutParseStatus sched_status;
RuntimeTimeoutConfig sched_cfg =
resolve_runtime_timeout_config(RuntimeTimeoutConfig{1, 1, 0}, &sched_status);
set_scheduler_timeout_ms_func_(
(sched_status.scheduler_env_set && sched_status.scheduler_valid) ? sched_cfg.scheduler_timeout_ms : 0
);
}
if (!load_sym("set_platform_dump_base", reinterpret_cast<void **>(&set_platform_dump_base_func_))) return -1;
if (!load_sym("set_platform_phase_base", reinterpret_cast<void **>(&set_platform_phase_base_func_))) return -1;
if (!load_sym("set_dump_args_enabled", reinterpret_cast<void **>(&set_dump_args_enabled_func_))) return -1;
Expand Down
1 change: 1 addition & 0 deletions src/a2a3/platform/sim/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class DeviceRunner : public SimDeviceRunnerBase {
void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t, uint32_t, uint64_t){nullptr};
void (*set_platform_regs_func_)(uint64_t){nullptr};
void (*set_orch_device_id_func_)(int){nullptr};
void (*set_scheduler_timeout_ms_func_)(int){nullptr};
void (*set_platform_dump_base_func_)(uint64_t){nullptr};
void (*set_platform_phase_base_func_)(uint64_t){nullptr};
void (*set_dump_args_enabled_func_)(bool){nullptr};
Expand Down
35 changes: 2 additions & 33 deletions src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
#include "common/strace.h"
#include "common/unified_log.h"
#include "host/platform_compile_info.h"
#include "host/runtime_timeout_config.h"
#include "utils/device_arena.h"
#include "prepare_callable_common.h"

Expand Down Expand Up @@ -246,32 +245,6 @@ static bool resolve_ring_config(
return true;
}

static int32_t resolve_scheduler_timeout_ms() {
RuntimeTimeoutParseStatus parse_status;
RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(
RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status
);
if (!parse_status.scheduler_env_set) {
return 0;
}
if (!parse_status.scheduler_valid) {
const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV);
LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env);
return 0;
}

RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform());
if (status != RuntimeTimeoutOrderStatus::OK) {
LOG_WARN(
"Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV,
cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status),
(unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
);
return 0;
}
return cfg.scheduler_timeout_ms;
}

static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
if (runtime == nullptr || host_header == nullptr) {
return 0;
Expand Down Expand Up @@ -348,13 +321,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
// Effective ring sizing for one (callable_id, config): the input half of the
// arena description. Resolved once per config from per-task overrides + env +
// compile-time defaults; depends on nothing that varies per run. `total_heap`
// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms`
// is the resolved per-platform scheduler no-progress budget.
// Plain aggregate with no default member initializers: every field is expected
// to be written by the code that fills it in before it is read.
struct ArenaSizingConfig {
// Per-ring task-window sizes; fed to runtime_reserve_layout and to
// PTO2SharedMemoryHandle::calculate_size_per_ring.
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
// Per-ring heap sizes; summed into total_heap and fed to runtime_reserve_layout.
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
// Per-ring dependency-pool capacities; fed to runtime_reserve_layout.
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
// Value returned by resolve_scheduler_timeout_ms(); 0 when no override applies.
int32_t scheduler_timeout_ms;
// Derived: sum of heap_sizes across rings.
uint64_t total_heap;
// Derived: PTO2SharedMemoryHandle::calculate_size_per_ring(task_window_sizes).
uint64_t sm_size;
};
Expand All @@ -369,7 +340,7 @@ struct StaticArenaPtrs {

// per-(cid,config): resolve the arena sizing. Pure host arithmetic over
// per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the
// total heap (with overflow check), the SM size, and the scheduler timeout.
// Returns false on an invalid ring config or a heap-size overflow.
static bool resolve_arena_sizing(
const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out
Expand Down Expand Up @@ -397,7 +368,6 @@ static bool resolve_arena_sizing(
out->total_heap += out->heap_sizes[r];
}
out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes);
out->scheduler_timeout_ms = resolve_scheduler_timeout_ms();
return true;
}

Expand Down Expand Up @@ -554,7 +524,6 @@ static bool build_runtime_image(
) {
PTO2RuntimeArenaLayout layout =
runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,13 @@ struct PTO2RuntimeOps {

/**
* Sizing half of the runtime-arena layout: the capacities that *define* the
* layout (the input to runtime_reserve_layout) plus the scheduler timeout.
* Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct
* ring/heap/dep-pool capacities and the scheduler no-progress budget.
*/
struct ArenaSizingKey {
// Per-ring capacities, PTO2_MAX_RING_DEPTH entries each; the empty-brace
// initializers zero every element.
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
// Scheduler no-progress timeout override in milliseconds; defaults to 0.
// NOTE(review): presumably this is the `sizing` member of the prebuilt layout
// that the scheduler compares against 0 before overriding its default cycle
// budget -- confirm against PTO2RuntimeArenaLayout.
int32_t scheduler_timeout_ms{0};
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "common.h" // debug_assert

#include "common/unified_log.h"
#include "aicpu/aicpu_device_config.h"
#include "aicpu/device_time.h"
#include "aicpu/platform_regs.h"
#include "callable.h"
Expand Down Expand Up @@ -882,10 +883,14 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// "now" so the first budget cycle starts when this thread does, not at
// an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();
// Per-device override latched once at worker init by simpler_aicpu_init
// (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no
// override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES.
uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles = static_cast<uint64_t>(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) *
(PLATFORM_PROF_SYS_CNT_FREQ / 1000);
const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms();
if (scheduler_timeout_ms_override > 0) {
scheduler_timeout_cycles =
static_cast<uint64_t>(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
}

while (true) {
Expand Down
7 changes: 4 additions & 3 deletions src/a5/platform/include/common/kernel_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,10 @@ static_assert(offsetof(KernelArgs, regs) == 8, "KernelArgs::regs offset drift");
* stays in KernelArgs.
*/
struct InitArgs {
    // Each field is declared exactly once. simpler_aicpu_init forwards every
    // field to its matching setter (set_orch_device_id, set_log_level,
    // set_log_info_v, set_scheduler_timeout_ms).
    uint32_t device_id{0};            // ACL device ordinal -> set_orch_device_id
    uint32_t log_level{1};            // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
    uint32_t log_info_v{5};           // INFO verbosity threshold (0..9); default V5
    int32_t scheduler_timeout_ms{0};  // AICPU no-progress watchdog (ms); 0 -> compile default
};

/**
Expand Down
2 changes: 2 additions & 0 deletions src/a5/platform/onboard/aicpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common/unified_log.h"
#include "common/kernel_args.h"
#include "common/platform_config.h"
#include "aicpu/aicpu_device_config.h"
#include "aicpu/dep_gen_collector_aicpu.h"
#include "aicpu/device_log.h"
#include "aicpu/device_phase_aicpu.h"
Expand Down Expand Up @@ -159,6 +160,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a
set_log_level(static_cast<int>(init_args->log_level));
set_log_info_v(static_cast<int>(init_args->log_info_v));
set_orch_device_id(static_cast<int>(init_args->device_id));
set_scheduler_timeout_ms(static_cast<int>(init_args->scheduler_timeout_ms));

LOG_INFO_V0("%s", "simpler_aicpu_init: per-device invariants latched");
return 0;
Expand Down
14 changes: 14 additions & 0 deletions src/a5/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "common/unified_log.h"
#include "cpu_sim_context.h"
#include "host/raii_scope_guard.h"
#include "host/runtime_timeout_config.h"
#include "runtime.h"

// dep_gen_replay_emit_deps_json: strong symbol provided by
Expand Down Expand Up @@ -108,6 +109,19 @@ int DeviceRunner::ensure_binaries_loaded() {
load_optional_sym("simpler_aicpu_register_callable", reinterpret_cast<void **>(&aicpu_register_callable_func_));
if (!load_sym("set_platform_regs", reinterpret_cast<void **>(&set_platform_regs_func_))) return -1;
load_optional_sym("set_orch_device_id", reinterpret_cast<void **>(&set_orch_device_id_func_));
load_optional_sym("set_scheduler_timeout_ms", reinterpret_cast<void **>(&set_scheduler_timeout_ms_func_));
if (set_scheduler_timeout_ms_func_ != nullptr) {
// Per-device one-shot latch (mirrors the onboard InitArgs path):
// honor PTO2_SCHEDULER_TIMEOUT_MS once at SO load, not per run. 0 ->
// the scheduler keeps its compile-time default. Sim skips the
// op/stream ordering check (validate_runtime_timeout_order is onboard).
RuntimeTimeoutParseStatus sched_status;
RuntimeTimeoutConfig sched_cfg =
resolve_runtime_timeout_config(RuntimeTimeoutConfig{1, 1, 0}, &sched_status);
set_scheduler_timeout_ms_func_(
(sched_status.scheduler_env_set && sched_status.scheduler_valid) ? sched_cfg.scheduler_timeout_ms : 0
);
}
if (!load_sym("set_platform_dump_base", reinterpret_cast<void **>(&set_platform_dump_base_func_))) return -1;
if (!load_sym("set_platform_phase_base", reinterpret_cast<void **>(&set_platform_phase_base_func_))) return -1;
if (!load_sym("set_dump_args_enabled", reinterpret_cast<void **>(&set_dump_args_enabled_func_))) return -1;
Expand Down
1 change: 1 addition & 0 deletions src/a5/platform/sim/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class DeviceRunner : public SimDeviceRunnerBase {
void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t, uint32_t, uint64_t, uint64_t){nullptr};
void (*set_platform_regs_func_)(uint64_t){nullptr};
void (*set_orch_device_id_func_)(int){nullptr};
void (*set_scheduler_timeout_ms_func_)(int){nullptr};
void (*set_platform_dump_base_func_)(uint64_t){nullptr};
void (*set_platform_phase_base_func_)(uint64_t){nullptr};
void (*set_platform_pmu_base_func_)(uint64_t){nullptr};
Expand Down
35 changes: 2 additions & 33 deletions src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
#include "common/strace.h"
#include "common/unified_log.h"
#include "host/platform_compile_info.h"
#include "host/runtime_timeout_config.h"
#include "utils/device_arena.h"
#include "prepare_callable_common.h"

Expand Down Expand Up @@ -246,32 +245,6 @@ static bool resolve_ring_config(
return true;
}

static int32_t resolve_scheduler_timeout_ms() {
RuntimeTimeoutParseStatus parse_status;
RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(
RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status
);
if (!parse_status.scheduler_env_set) {
return 0;
}
if (!parse_status.scheduler_valid) {
const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV);
LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env);
return 0;
}

RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform());
if (status != RuntimeTimeoutOrderStatus::OK) {
LOG_WARN(
"Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV,
cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status),
(unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
);
return 0;
}
return cfg.scheduler_timeout_ms;
}

static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
if (runtime == nullptr || host_header == nullptr) {
return 0;
Expand Down Expand Up @@ -348,13 +321,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
// Effective ring sizing for one (callable_id, config): the input half of the
// arena description. Resolved once per config from per-task overrides + env +
// compile-time defaults; depends on nothing that varies per run. `total_heap`
// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms`
// is the resolved per-platform scheduler no-progress budget.
// Plain aggregate with no default member initializers: every field is expected
// to be written by the code that fills it in before it is read.
struct ArenaSizingConfig {
// Per-ring task-window sizes; fed to runtime_reserve_layout and to
// PTO2SharedMemoryHandle::calculate_size_per_ring.
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
// Per-ring heap sizes; summed into total_heap and fed to runtime_reserve_layout.
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
// Per-ring dependency-pool capacities; fed to runtime_reserve_layout.
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
// Value returned by resolve_scheduler_timeout_ms(); 0 when no override applies.
int32_t scheduler_timeout_ms;
// Derived: sum of heap_sizes across rings.
uint64_t total_heap;
// Derived: PTO2SharedMemoryHandle::calculate_size_per_ring(task_window_sizes).
uint64_t sm_size;
};
Expand All @@ -369,7 +340,7 @@ struct StaticArenaPtrs {

// per-(cid,config): resolve the arena sizing. Pure host arithmetic over
// per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the
// total heap (with overflow check), the SM size, and the scheduler timeout.
// Returns false on an invalid ring config or a heap-size overflow.
static bool resolve_arena_sizing(
const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out
Expand Down Expand Up @@ -397,7 +368,6 @@ static bool resolve_arena_sizing(
out->total_heap += out->heap_sizes[r];
}
out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes);
out->scheduler_timeout_ms = resolve_scheduler_timeout_ms();
return true;
}

Expand Down Expand Up @@ -554,7 +524,6 @@ static bool build_runtime_image(
) {
PTO2RuntimeArenaLayout layout =
runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,13 @@ struct PTO2RuntimeOps {

/**
* Sizing half of the runtime-arena layout: the capacities that *define* the
* layout (the input to runtime_reserve_layout) plus the scheduler timeout.
* Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct
* ring/heap/dep-pool capacities and the scheduler no-progress budget.
*/
struct ArenaSizingKey {
// Per-ring capacities, PTO2_MAX_RING_DEPTH entries each; the empty-brace
// initializers zero every element.
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
// Scheduler no-progress timeout override in milliseconds; defaults to 0.
// NOTE(review): presumably this is the `sizing` member of the prebuilt layout
// that the scheduler compares against 0 before overriding its default cycle
// budget -- confirm against PTO2RuntimeArenaLayout.
int32_t scheduler_timeout_ms{0};
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "common.h" // debug_assert
#include "common/unified_log.h"
#include "aicpu/aicpu_device_config.h"
#include "aicpu/device_time.h"
#include "aicpu/platform_regs.h"
#include "callable.h"
Expand Down Expand Up @@ -600,10 +601,14 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// "now" so the first budget cycle starts when this thread does, not at
// an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();
// Per-device override latched once at worker init by simpler_aicpu_init
// (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no
// override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES.
uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles = static_cast<uint64_t>(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) *
(PLATFORM_PROF_SYS_CNT_FREQ / 1000);
const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms();
if (scheduler_timeout_ms_override > 0) {
scheduler_timeout_cycles =
static_cast<uint64_t>(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
}

while (true) {
Expand Down
Loading
Loading