diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h index 2ccd86825..3223f1cd5 100644 --- a/src/a2a3/platform/include/common/kernel_args.h +++ b/src/a2a3/platform/include/common/kernel_args.h @@ -143,9 +143,10 @@ static_assert(offsetof(KernelArgs, regs) == 8, "KernelArgs::regs offset drift"); * register tables consumed on the per-run AICore path and stay in KernelArgs. */ struct InitArgs { - uint32_t device_id{0}; // ACL device ordinal -> set_orch_device_id - uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL - uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 + uint32_t device_id{0}; // ACL device ordinal -> set_orch_device_id + uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL + uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 + int32_t scheduler_timeout_ms{0}; // AICPU no-progress watchdog (ms); 0 -> compile default }; /** diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index b27ff2cf9..aa7fef21d 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -13,6 +13,7 @@ #include "common/unified_log.h" #include "common/kernel_args.h" #include "common/platform_config.h" +#include "aicpu/aicpu_device_config.h" #include "aicpu/dep_gen_collector_aicpu.h" #include "aicpu/device_log.h" #include "aicpu/device_phase_aicpu.h" @@ -148,6 +149,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a set_log_level(static_cast(init_args->log_level)); set_log_info_v(static_cast(init_args->log_info_v)); set_orch_device_id(static_cast(init_args->device_id)); + set_scheduler_timeout_ms(static_cast(init_args->scheduler_timeout_ms)); LOG_INFO_V0("%s", "simpler_aicpu_init: per-device invariants latched"); return 0; diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 1a942f80e..235c90280 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -37,6 +37,7 @@ #include "common/unified_log.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" +#include "host/runtime_timeout_config.h" #include "runtime.h" // dep_gen_replay_emit_deps_json: strong symbol provided by @@ -97,6 +98,19 @@ int DeviceRunner::ensure_binaries_loaded() { load_optional_sym("simpler_aicpu_register_callable", reinterpret_cast(&aicpu_register_callable_func_)); if (!load_sym("set_platform_regs", reinterpret_cast(&set_platform_regs_func_))) return -1; load_optional_sym("set_orch_device_id", reinterpret_cast(&set_orch_device_id_func_)); + load_optional_sym("set_scheduler_timeout_ms", reinterpret_cast(&set_scheduler_timeout_ms_func_)); + if (set_scheduler_timeout_ms_func_ != nullptr) { + // Per-device one-shot latch (mirrors the onboard InitArgs path): + // honor PTO2_SCHEDULER_TIMEOUT_MS once at SO load, not per run. 0 -> + // the scheduler keeps its compile-time default. Sim skips the + // op/stream ordering check (validate_runtime_timeout_order is onboard). + RuntimeTimeoutParseStatus sched_status; + RuntimeTimeoutConfig sched_cfg = + resolve_runtime_timeout_config(RuntimeTimeoutConfig{1, 1, 0}, &sched_status); + set_scheduler_timeout_ms_func_( + (sched_status.scheduler_env_set && sched_status.scheduler_valid) ? sched_cfg.scheduler_timeout_ms : 0 + ); + } if (!load_sym("set_platform_dump_base", reinterpret_cast(&set_platform_dump_base_func_))) return -1; if (!load_sym("set_platform_phase_base", reinterpret_cast(&set_platform_phase_base_func_))) return -1; if (!load_sym("set_dump_args_enabled", reinterpret_cast(&set_dump_args_enabled_func_))) return -1; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index b757fce21..0bddfb4e1 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -61,6 +61,7 @@ class DeviceRunner : public SimDeviceRunnerBase { void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t, uint32_t, uint64_t){nullptr}; void (*set_platform_regs_func_)(uint64_t){nullptr}; void (*set_orch_device_id_func_)(int){nullptr}; + void (*set_scheduler_timeout_ms_func_)(int){nullptr}; void (*set_platform_dump_base_func_)(uint64_t){nullptr}; void (*set_platform_phase_base_func_)(uint64_t){nullptr}; void (*set_dump_args_enabled_func_)(bool){nullptr}; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc776917e..a6ee473a6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -50,7 +50,6 @@ #include "common/strace.h" #include "common/unified_log.h" #include "host/platform_compile_info.h" -#include "host/runtime_timeout_config.h" #include "utils/device_arena.h" #include "prepare_callable_common.h" @@ -246,32 +245,6 @@ static bool resolve_ring_config( return true; } -static int32_t resolve_scheduler_timeout_ms() { - RuntimeTimeoutParseStatus parse_status; - RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config( - RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status - ); - if (!parse_status.scheduler_env_set) { - return 0; - } - if (!parse_status.scheduler_valid) { - const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV); - LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env); - return 0; - } - - RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform()); - if (status != RuntimeTimeoutOrderStatus::OK) { - LOG_WARN( - "Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV, - cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status), - (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms - ); - return 0; - } - return cfg.scheduler_timeout_ms; -} - static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) { if (runtime == nullptr || host_header == nullptr) { return 0; @@ -348,13 +321,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const // Effective ring sizing for one (callable_id, config): the input half of the // arena description. Resolved once per config from per-task overrides + env + // compile-time defaults; depends on nothing that varies per run. `total_heap` -// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms` -// is the resolved per-platform scheduler no-progress budget. +// and `sm_size` are the derived backing-allocation sizes. struct ArenaSizingConfig { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; - int32_t scheduler_timeout_ms; uint64_t total_heap; uint64_t sm_size; }; @@ -369,7 +340,7 @@ struct StaticArenaPtrs { // per-(cid,config): resolve the arena sizing. Pure host arithmetic over // per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the -// total heap (with overflow check) and SM sizes and the scheduler timeout. +// total heap (with overflow check) and SM sizes. // Returns false on an invalid ring config or a heap-size overflow. static bool resolve_arena_sizing( const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out @@ -397,7 +368,6 @@ static bool resolve_arena_sizing( out->total_heap += out->heap_sizes[r]; } out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes); - out->scheduler_timeout_ms = resolve_scheduler_timeout_ms(); return true; } @@ -554,7 +524,6 @@ static bool build_runtime_image( ) { PTO2RuntimeArenaLayout layout = runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities); - layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms; if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) { LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); return false; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 510fa3b67..058f90f40 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -97,15 +97,13 @@ struct PTO2RuntimeOps { /** * Sizing half of the runtime-arena layout: the capacities that *define* the - * layout (the input to runtime_reserve_layout) plus the scheduler timeout. - * Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct - * ring/heap/dep-pool capacities and the scheduler no-progress budget. + * layout (the input to runtime_reserve_layout). Stable per (callable_id, ring + * config); re-read at AICPU boot to reconstruct ring/heap/dep-pool capacities. */ struct ArenaSizingKey { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{}; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{}; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{}; - int32_t scheduler_timeout_ms{0}; }; /** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index be7a33edd..c4a10369d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -17,6 +17,7 @@ #include "common.h" // debug_assert #include "common/unified_log.h" +#include "aicpu/aicpu_device_config.h" #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" #include "callable.h" @@ -882,10 +883,14 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // "now" so the first budget cycle starts when this thread does, not at // an undefined value. uint64_t last_progress_ts = get_sys_cnt_aicpu(); + // Per-device override latched once at worker init by simpler_aicpu_init + // (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no + // override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES. uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES; - if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) { - scheduler_timeout_cycles = static_cast(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) * - (PLATFORM_PROF_SYS_CNT_FREQ / 1000); + const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms(); + if (scheduler_timeout_ms_override > 0) { + scheduler_timeout_cycles = + static_cast(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000); } while (true) { diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h index 70bb2a842..7bb84177f 100644 --- a/src/a5/platform/include/common/kernel_args.h +++ b/src/a5/platform/include/common/kernel_args.h @@ -128,9 +128,10 @@ static_assert(offsetof(KernelArgs, regs) == 8, "KernelArgs::regs offset drift"); * stays in KernelArgs. */ struct InitArgs { - uint32_t device_id{0}; // ACL device ordinal -> set_orch_device_id - uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL - uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 + uint32_t device_id{0}; // ACL device ordinal -> set_orch_device_id + uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL + uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 + int32_t scheduler_timeout_ms{0}; // AICPU no-progress watchdog (ms); 0 -> compile default }; /** diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 966c4dea1..7b5ee26b8 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -13,6 +13,7 @@ #include "common/unified_log.h" #include "common/kernel_args.h" #include "common/platform_config.h" +#include "aicpu/aicpu_device_config.h" #include "aicpu/dep_gen_collector_aicpu.h" #include "aicpu/device_log.h" #include "aicpu/device_phase_aicpu.h" @@ -159,6 +160,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a set_log_level(static_cast(init_args->log_level)); set_log_info_v(static_cast(init_args->log_info_v)); set_orch_device_id(static_cast(init_args->device_id)); + set_scheduler_timeout_ms(static_cast(init_args->scheduler_timeout_ms)); LOG_INFO_V0("%s", "simpler_aicpu_init: per-device invariants latched"); return 0; diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 3230ddf66..638c2133d 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -38,6 +38,7 @@ #include "common/unified_log.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" +#include "host/runtime_timeout_config.h" #include "runtime.h" // dep_gen_replay_emit_deps_json: strong symbol provided by @@ -108,6 +109,19 @@ int DeviceRunner::ensure_binaries_loaded() { load_optional_sym("simpler_aicpu_register_callable", reinterpret_cast(&aicpu_register_callable_func_)); if (!load_sym("set_platform_regs", reinterpret_cast(&set_platform_regs_func_))) return -1; load_optional_sym("set_orch_device_id", reinterpret_cast(&set_orch_device_id_func_)); + load_optional_sym("set_scheduler_timeout_ms", reinterpret_cast(&set_scheduler_timeout_ms_func_)); + if (set_scheduler_timeout_ms_func_ != nullptr) { + // Per-device one-shot latch (mirrors the onboard InitArgs path): + // honor PTO2_SCHEDULER_TIMEOUT_MS once at SO load, not per run. 0 -> + // the scheduler keeps its compile-time default. Sim skips the + // op/stream ordering check (validate_runtime_timeout_order is onboard). + RuntimeTimeoutParseStatus sched_status; + RuntimeTimeoutConfig sched_cfg = + resolve_runtime_timeout_config(RuntimeTimeoutConfig{1, 1, 0}, &sched_status); + set_scheduler_timeout_ms_func_( + (sched_status.scheduler_env_set && sched_status.scheduler_valid) ? sched_cfg.scheduler_timeout_ms : 0 + ); + } if (!load_sym("set_platform_dump_base", reinterpret_cast(&set_platform_dump_base_func_))) return -1; if (!load_sym("set_platform_phase_base", reinterpret_cast(&set_platform_phase_base_func_))) return -1; if (!load_sym("set_dump_args_enabled", reinterpret_cast(&set_dump_args_enabled_func_))) return -1; diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 9719a27ba..a912f61b7 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -63,6 +63,7 @@ class DeviceRunner : public SimDeviceRunnerBase { void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t, uint32_t, uint64_t, uint64_t){nullptr}; void (*set_platform_regs_func_)(uint64_t){nullptr}; void (*set_orch_device_id_func_)(int){nullptr}; + void (*set_scheduler_timeout_ms_func_)(int){nullptr}; void (*set_platform_dump_base_func_)(uint64_t){nullptr}; void (*set_platform_phase_base_func_)(uint64_t){nullptr}; void (*set_platform_pmu_base_func_)(uint64_t){nullptr}; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc776917e..a6ee473a6 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -50,7 +50,6 @@ #include "common/strace.h" #include "common/unified_log.h" #include "host/platform_compile_info.h" -#include "host/runtime_timeout_config.h" #include "utils/device_arena.h" #include "prepare_callable_common.h" @@ -246,32 +245,6 @@ static bool resolve_ring_config( return true; } -static int32_t resolve_scheduler_timeout_ms() { - RuntimeTimeoutParseStatus parse_status; - RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config( - RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status - ); - if (!parse_status.scheduler_env_set) { - return 0; - } - if (!parse_status.scheduler_valid) { - const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV); - LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env); - return 0; - } - - RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform()); - if (status != RuntimeTimeoutOrderStatus::OK) { - LOG_WARN( - "Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV, - cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status), - (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms - ); - return 0; - } - return cfg.scheduler_timeout_ms; -} - static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) { if (runtime == nullptr || host_header == nullptr) { return 0; @@ -348,13 +321,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const // Effective ring sizing for one (callable_id, config): the input half of the // arena description. Resolved once per config from per-task overrides + env + // compile-time defaults; depends on nothing that varies per run. `total_heap` -// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms` -// is the resolved per-platform scheduler no-progress budget. +// and `sm_size` are the derived backing-allocation sizes. struct ArenaSizingConfig { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; - int32_t scheduler_timeout_ms; uint64_t total_heap; uint64_t sm_size; }; @@ -369,7 +340,7 @@ struct StaticArenaPtrs { // per-(cid,config): resolve the arena sizing. Pure host arithmetic over // per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the -// total heap (with overflow check) and SM sizes and the scheduler timeout. +// total heap (with overflow check) and SM sizes. // Returns false on an invalid ring config or a heap-size overflow. static bool resolve_arena_sizing( const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out @@ -397,7 +368,6 @@ static bool resolve_arena_sizing( out->total_heap += out->heap_sizes[r]; } out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes); - out->scheduler_timeout_ms = resolve_scheduler_timeout_ms(); return true; } @@ -554,7 +524,6 @@ static bool build_runtime_image( ) { PTO2RuntimeArenaLayout layout = runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities); - layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms; if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) { LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); return false; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 557dbfd32..187de69a1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -98,15 +98,13 @@ struct PTO2RuntimeOps { /** * Sizing half of the runtime-arena layout: the capacities that *define* the - * layout (the input to runtime_reserve_layout) plus the scheduler timeout. - * Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct - * ring/heap/dep-pool capacities and the scheduler no-progress budget. + * layout (the input to runtime_reserve_layout). Stable per (callable_id, ring + * config); re-read at AICPU boot to reconstruct ring/heap/dep-pool capacities. */ struct ArenaSizingKey { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{}; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{}; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{}; - int32_t scheduler_timeout_ms{0}; }; /** diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 68a17bfcc..beea5ada4 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -16,6 +16,7 @@ #include "common.h" // debug_assert #include "common/unified_log.h" +#include "aicpu/aicpu_device_config.h" #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" #include "callable.h" @@ -600,10 +601,14 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // "now" so the first budget cycle starts when this thread does, not at // an undefined value. uint64_t last_progress_ts = get_sys_cnt_aicpu(); + // Per-device override latched once at worker init by simpler_aicpu_init + // (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no + // override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES. uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES; - if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) { - scheduler_timeout_cycles = static_cast(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) * - (PLATFORM_PROF_SYS_CNT_FREQ / 1000); + const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms(); + if (scheduler_timeout_ms_override > 0) { + scheduler_timeout_cycles = + static_cast(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000); } while (true) { diff --git a/src/common/platform/include/aicpu/aicpu_device_config.h b/src/common/platform/include/aicpu/aicpu_device_config.h new file mode 100644 index 000000000..38ec22e30 --- /dev/null +++ b/src/common/platform/include/aicpu/aicpu_device_config.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Per-device AICPU runtime config. + * + * Home for run-invariant per-device knobs that simpler_aicpu_init latches once + * at worker init (from InitArgs) into resident-SO globals surviving every + * subsequent per-task launch. The runtime consumes these read-only; they do + * NOT ride the per-run KernelArgs or the arena layout. Add fields here as new + * per-device config appears rather than threading it through the per-run path. + * + * Kept separate from platform_regs (which is strictly per-core register + * addressing) so neither file accretes the other's concern. + */ + +#ifndef PLATFORM_COMMON_AICPU_DEVICE_CONFIG_H_ +#define PLATFORM_COMMON_AICPU_DEVICE_CONFIG_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Set the AICPU scheduler no-progress watchdog timeout (ms). Latched once per + * device by simpler_aicpu_init (from InitArgs.scheduler_timeout_ms); read by + * the scheduler dispatch loop each run. 0 means "no override" — the scheduler + * keeps its compile-time SCHEDULER_TIMEOUT_CYCLES. + */ +void set_scheduler_timeout_ms(int timeout_ms); + +/** Get the scheduler watchdog timeout override in ms (0 if unset). */ +int get_scheduler_timeout_ms(); + +#ifdef __cplusplus +} +#endif + +#endif // PLATFORM_COMMON_AICPU_DEVICE_CONFIG_H_ diff --git a/src/common/platform/include/host/runtime_timeout_config.h b/src/common/platform/include/host/runtime_timeout_config.h index 98d15c2f6..a8244677a 100644 --- a/src/common/platform/include/host/runtime_timeout_config.h +++ b/src/common/platform/include/host/runtime_timeout_config.h @@ -40,6 +40,10 @@ struct RuntimeTimeoutConfig { struct HostRuntimeTimeoutConfig { uint64_t op_execute_timeout_us; int32_t stream_sync_timeout_ms; + // AICPU scheduler no-progress watchdog override (ms). 0 means "no env + // override" — the device falls back to its compile-time default + // (SCHEDULER_TIMEOUT_CYCLES). Latched once per device into InitArgs. + int32_t scheduler_timeout_ms{0}; }; struct RuntimeTimeoutParseStatus { diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp index 9e42263b7..cba43b423 100644 --- a/src/common/platform/onboard/host/device_runner_base.cpp +++ b/src/common/platform/onboard/host/device_runner_base.cpp @@ -91,15 +91,25 @@ HostRuntimeTimeoutConfig resolve_onboard_timeout_config() { bool host_timeout_env_set = parse_status.op_execute_env_set || parse_status.stream_sync_env_set || parse_status.scheduler_env_set; RuntimeTimeoutOrderStatus order_status = validate_runtime_timeout_order(cfg); + // The scheduler override is forwarded to the device (via InitArgs at init) + // only when explicitly set, valid, and consistent with the op/stream + // ordering. 0 means "no override" — the AICPU scheduler then keeps its + // compile-time default. op/stream remain host-side acl knobs. + int32_t scheduler_override = (parse_status.scheduler_env_set && parse_status.scheduler_valid && + order_status == RuntimeTimeoutOrderStatus::OK) ? + cfg.scheduler_timeout_ms : + 0; if (host_timeout_env_set && order_status != RuntimeTimeoutOrderStatus::OK) { LOG_WARN( "Ignoring PTO2 timeout env overrides: %s (scheduler=%d ms, op_execute=%llu us, stream_sync=%d ms)", runtime_timeout_order_status_name(order_status), cfg.scheduler_timeout_ms, (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms ); - return HostRuntimeTimeoutConfig{order_defaults.op_execute_timeout_us, order_defaults.stream_sync_timeout_ms}; + return HostRuntimeTimeoutConfig{ + order_defaults.op_execute_timeout_us, order_defaults.stream_sync_timeout_ms, scheduler_override + }; } - return HostRuntimeTimeoutConfig{cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms}; + return HostRuntimeTimeoutConfig{cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms, scheduler_override}; } } // namespace @@ -374,6 +384,9 @@ int DeviceRunnerBase::ensure_aicpu_init_launched() { init_args.device_id = static_cast(device_id_); init_args.log_level = static_cast(HostLogger::get_instance().level()); init_args.log_info_v = static_cast(HostLogger::get_instance().info_v()); + // Per-device scheduler watchdog override, resolved once at attach into + // timeout_config_. 0 -> the AICPU scheduler keeps its compile-time default. + init_args.scheduler_timeout_ms = timeout_config_.scheduler_timeout_ms; LOG_INFO_V0("=== launch_aicpu_payload %s ===", host::KernelNames::InitName); int rc = launch_aicpu_payload( diff --git a/src/common/platform/shared/aicpu/aicpu_device_config.cpp b/src/common/platform/shared/aicpu/aicpu_device_config.cpp new file mode 100644 index 000000000..4d6d38e91 --- /dev/null +++ b/src/common/platform/shared/aicpu/aicpu_device_config.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include "aicpu/aicpu_device_config.h" + +namespace { +// Latched once per device by simpler_aicpu_init; survives every per-task launch +// because the AICPU inner SO stays dlopen'd for the runner's life. +int g_scheduler_timeout_ms = 0; +} // namespace + +void set_scheduler_timeout_ms(int timeout_ms) { g_scheduler_timeout_ms = timeout_ms; } + +int get_scheduler_timeout_ms() { return g_scheduler_timeout_ms; }