diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h
index 2ccd86825..3223f1cd5 100644
--- a/src/a2a3/platform/include/common/kernel_args.h
+++ b/src/a2a3/platform/include/common/kernel_args.h
@@ -143,9 +143,10 @@ static_assert(offsetof(KernelArgs, regs) == 8, "KernelArgs::regs offset drift");
  * register tables consumed on the per-run AICore path and stay in KernelArgs.
  */
 struct InitArgs {
-    uint32_t device_id{0};   // ACL device ordinal -> set_orch_device_id
-    uint32_t log_level{1};   // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
-    uint32_t log_info_v{5};  // INFO verbosity threshold (0..9); default V5
+    uint32_t device_id{0};            // ACL device ordinal -> set_orch_device_id
+    uint32_t log_level{1};            // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
+    uint32_t log_info_v{5};           // INFO verbosity threshold (0..9); default V5
+    int32_t scheduler_timeout_ms{0};  // AICPU no-progress watchdog (ms); <=0 -> compile default
 };
 
 /**
diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
index b27ff2cf9..aa7fef21d 100644
--- a/src/a2a3/platform/onboard/aicpu/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -13,6 +13,7 @@
 #include "common/unified_log.h"
 #include "common/kernel_args.h"
 #include "common/platform_config.h"
+#include "aicpu/aicpu_device_config.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "aicpu/device_log.h"
 #include "aicpu/device_phase_aicpu.h"
@@ -148,6 +149,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a
     set_log_level(static_cast<int>(init_args->log_level));
     set_log_info_v(static_cast<int>(init_args->log_info_v));
     set_orch_device_id(static_cast<int>(init_args->device_id));
+    set_scheduler_timeout_ms(static_cast<int>(init_args->scheduler_timeout_ms));
 
     LOG_INFO_V0("%s", "simpler_aicpu_init: per-device invariants latched");
     return 0;
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1a942f80e..235c90280 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -37,6 +37,7 @@
 #include "common/unified_log.h"
 #include "cpu_sim_context.h"
 #include "host/raii_scope_guard.h"
+#include "host/runtime_timeout_config.h"
 #include "runtime.h"
 
 // dep_gen_replay_emit_deps_json: strong symbol provided by
@@ -97,6 +98,19 @@ int DeviceRunner::ensure_binaries_loaded() {
         load_optional_sym("simpler_aicpu_register_callable", reinterpret_cast<void **>(&aicpu_register_callable_func_));
         if (!load_sym("set_platform_regs", reinterpret_cast<void **>(&set_platform_regs_func_))) return -1;
         load_optional_sym("set_orch_device_id", reinterpret_cast<void **>(&set_orch_device_id_func_));
+        load_optional_sym("set_scheduler_timeout_ms", reinterpret_cast<void **>(&set_scheduler_timeout_ms_func_));
+        if (set_scheduler_timeout_ms_func_ != nullptr) {
+            // One-shot latch at SO load (mirrors onboard InitArgs): 0 keeps the compile-time default, and a
+            // malformed value is reported, not silently dropped. Sim skips the onboard op/stream order check.
+            RuntimeTimeoutParseStatus sched_status;
+            RuntimeTimeoutConfig sched_cfg =
+                resolve_runtime_timeout_config(RuntimeTimeoutConfig{1, 1, 0}, &sched_status);
+            const bool sched_ok = sched_status.scheduler_env_set && sched_status.scheduler_valid;
+            if (sched_status.scheduler_env_set && !sched_ok) {
+                LOG_WARN("%s invalid, using platform scheduler timeout", "PTO2_SCHEDULER_TIMEOUT_MS");
+            }
+            set_scheduler_timeout_ms_func_(sched_ok ? sched_cfg.scheduler_timeout_ms : 0);
+        }
         if (!load_sym("set_platform_dump_base", reinterpret_cast<void **>(&set_platform_dump_base_func_))) return -1;
         if (!load_sym("set_platform_phase_base", reinterpret_cast<void **>(&set_platform_phase_base_func_))) return -1;
         if (!load_sym("set_dump_args_enabled", reinterpret_cast<void **>(&set_dump_args_enabled_func_))) return -1;
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index b757fce21..0bddfb4e1 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -61,6 +61,7 @@ class DeviceRunner : public SimDeviceRunnerBase {
     void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t, uint32_t, uint64_t){nullptr};
     void (*set_platform_regs_func_)(uint64_t){nullptr};
     void (*set_orch_device_id_func_)(int){nullptr};
+    void (*set_scheduler_timeout_ms_func_)(int){nullptr};
     void (*set_platform_dump_base_func_)(uint64_t){nullptr};
     void (*set_platform_phase_base_func_)(uint64_t){nullptr};
     void (*set_dump_args_enabled_func_)(bool){nullptr};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index dc776917e..a6ee473a6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -50,7 +50,6 @@
 #include "common/strace.h"
 #include "common/unified_log.h"
 #include "host/platform_compile_info.h"
-#include "host/runtime_timeout_config.h"
 #include "utils/device_arena.h"
 #include "prepare_callable_common.h"
 
@@ -246,32 +245,6 @@ static bool resolve_ring_config(
     return true;
 }
 
-static int32_t resolve_scheduler_timeout_ms() {
-    RuntimeTimeoutParseStatus parse_status;
-    RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(
-        RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status
-    );
-    if (!parse_status.scheduler_env_set) {
-        return 0;
-    }
-    if (!parse_status.scheduler_valid) {
-        const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV);
-        LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env);
-        return 0;
-    }
-
-    RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform());
-    if (status != RuntimeTimeoutOrderStatus::OK) {
-        LOG_WARN(
-            "Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV,
-            cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status),
-            (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
-        );
-        return 0;
-    }
-    return cfg.scheduler_timeout_ms;
-}
-
 static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
     if (runtime == nullptr || host_header == nullptr) {
         return 0;
@@ -348,13 +321,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
 // Effective ring sizing for one (callable_id, config): the input half of the
 // arena description. Resolved once per config from per-task overrides + env +
 // compile-time defaults; depends on nothing that varies per run. `total_heap`
-// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms`
-// is the resolved per-platform scheduler no-progress budget.
+// and `sm_size` are the derived backing-allocation sizes.
 struct ArenaSizingConfig {
     uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
     uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
     int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
-    int32_t scheduler_timeout_ms;
     uint64_t total_heap;
     uint64_t sm_size;
 };
@@ -369,7 +340,7 @@ struct StaticArenaPtrs {
 
 // per-(cid,config): resolve the arena sizing. Pure host arithmetic over
 // per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the
-// total heap (with overflow check) and SM sizes and the scheduler timeout.
+// total heap (with overflow check) and SM sizes.
 // Returns false on an invalid ring config or a heap-size overflow.
 static bool resolve_arena_sizing(
     const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out
@@ -397,7 +368,6 @@ static bool resolve_arena_sizing(
         out->total_heap += out->heap_sizes[r];
     }
     out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes);
-    out->scheduler_timeout_ms = resolve_scheduler_timeout_ms();
     return true;
 }
 
@@ -554,7 +524,6 @@ static bool build_runtime_image(
 ) {
     PTO2RuntimeArenaLayout layout =
         runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
-    layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
     if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
         LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
         return false;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 510fa3b67..058f90f40 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -97,15 +97,13 @@ struct PTO2RuntimeOps {
 
 /**
  * Sizing half of the runtime-arena layout: the capacities that *define* the
- * layout (the input to runtime_reserve_layout) plus the scheduler timeout.
- * Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct
- * ring/heap/dep-pool capacities and the scheduler no-progress budget.
+ * layout (the input to runtime_reserve_layout). Stable per (callable_id, ring
+ * config); re-read at AICPU boot to reconstruct ring/heap/dep-pool capacities.
  */
 struct ArenaSizingKey {
     uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
     uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
     int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
-    int32_t scheduler_timeout_ms{0};
 };
 
 /**
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index be7a33edd..c4a10369d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -17,6 +17,7 @@
 #include "common.h"  // debug_assert
 
 #include "common/unified_log.h"
+#include "aicpu/aicpu_device_config.h"
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
 #include "callable.h"
@@ -882,10 +883,14 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     // "now" so the first budget cycle starts when this thread does, not at
     // an undefined value.
     uint64_t last_progress_ts = get_sys_cnt_aicpu();
+    // Per-device override latched once at worker init by simpler_aicpu_init
+    // (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no
+    // override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES.
     uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
-    if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) {
-        scheduler_timeout_cycles = static_cast<uint64_t>(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) *
-                                   (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+    const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms();
+    if (scheduler_timeout_ms_override > 0) {
+        scheduler_timeout_cycles =
+            static_cast<uint64_t>(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
     }
 
     while (true) {
diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h
index 70bb2a842..7bb84177f 100644
--- a/src/a5/platform/include/common/kernel_args.h
+++ b/src/a5/platform/include/common/kernel_args.h
@@ -128,9 +128,10 @@ static_assert(offsetof(KernelArgs, regs) == 8, "KernelArgs::regs offset drift");
  * stays in KernelArgs.
  */
 struct InitArgs {
-    uint32_t device_id{0};   // ACL device ordinal -> set_orch_device_id
-    uint32_t log_level{1};   // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
-    uint32_t log_info_v{5};  // INFO verbosity threshold (0..9); default V5
+    uint32_t device_id{0};            // ACL device ordinal -> set_orch_device_id
+    uint32_t log_level{1};            // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
+    uint32_t log_info_v{5};           // INFO verbosity threshold (0..9); default V5
+    int32_t scheduler_timeout_ms{0};  // AICPU no-progress watchdog (ms); <=0 -> compile default
 };
 
 /**
diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp
index 966c4dea1..7b5ee26b8 100644
--- a/src/a5/platform/onboard/aicpu/kernel.cpp
+++ b/src/a5/platform/onboard/aicpu/kernel.cpp
@@ -13,6 +13,7 @@
 #include "common/unified_log.h"
 #include "common/kernel_args.h"
 #include "common/platform_config.h"
+#include "aicpu/aicpu_device_config.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "aicpu/device_log.h"
 #include "aicpu/device_phase_aicpu.h"
@@ -159,6 +160,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a
     set_log_level(static_cast<int>(init_args->log_level));
     set_log_info_v(static_cast<int>(init_args->log_info_v));
     set_orch_device_id(static_cast<int>(init_args->device_id));
+    set_scheduler_timeout_ms(static_cast<int>(init_args->scheduler_timeout_ms));
 
     LOG_INFO_V0("%s", "simpler_aicpu_init: per-device invariants latched");
     return 0;
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index 3230ddf66..638c2133d 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -38,6 +38,7 @@
 #include "common/unified_log.h"
 #include "cpu_sim_context.h"
 #include "host/raii_scope_guard.h"
+#include "host/runtime_timeout_config.h"
 #include "runtime.h"
 
 // dep_gen_replay_emit_deps_json: strong symbol provided by
@@ -108,6 +109,19 @@ int DeviceRunner::ensure_binaries_loaded() {
         load_optional_sym("simpler_aicpu_register_callable", reinterpret_cast<void **>(&aicpu_register_callable_func_));
         if (!load_sym("set_platform_regs", reinterpret_cast<void **>(&set_platform_regs_func_))) return -1;
         load_optional_sym("set_orch_device_id", reinterpret_cast<void **>(&set_orch_device_id_func_));
+        load_optional_sym("set_scheduler_timeout_ms", reinterpret_cast<void **>(&set_scheduler_timeout_ms_func_));
+        if (set_scheduler_timeout_ms_func_ != nullptr) {
+            // One-shot latch at SO load (mirrors onboard InitArgs): 0 keeps the compile-time default, and a
+            // malformed value is reported, not silently dropped. Sim skips the onboard op/stream order check.
+            RuntimeTimeoutParseStatus sched_status;
+            RuntimeTimeoutConfig sched_cfg =
+                resolve_runtime_timeout_config(RuntimeTimeoutConfig{1, 1, 0}, &sched_status);
+            const bool sched_ok = sched_status.scheduler_env_set && sched_status.scheduler_valid;
+            if (sched_status.scheduler_env_set && !sched_ok) {
+                LOG_WARN("%s invalid, using platform scheduler timeout", "PTO2_SCHEDULER_TIMEOUT_MS");
+            }
+            set_scheduler_timeout_ms_func_(sched_ok ? sched_cfg.scheduler_timeout_ms : 0);
+        }
         if (!load_sym("set_platform_dump_base", reinterpret_cast<void **>(&set_platform_dump_base_func_))) return -1;
         if (!load_sym("set_platform_phase_base", reinterpret_cast<void **>(&set_platform_phase_base_func_))) return -1;
         if (!load_sym("set_dump_args_enabled", reinterpret_cast<void **>(&set_dump_args_enabled_func_))) return -1;
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 9719a27ba..a912f61b7 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -63,6 +63,7 @@ class DeviceRunner : public SimDeviceRunnerBase {
     void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t, uint32_t, uint64_t, uint64_t){nullptr};
     void (*set_platform_regs_func_)(uint64_t){nullptr};
     void (*set_orch_device_id_func_)(int){nullptr};
+    void (*set_scheduler_timeout_ms_func_)(int){nullptr};
     void (*set_platform_dump_base_func_)(uint64_t){nullptr};
     void (*set_platform_phase_base_func_)(uint64_t){nullptr};
     void (*set_platform_pmu_base_func_)(uint64_t){nullptr};
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index dc776917e..a6ee473a6 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -50,7 +50,6 @@
 #include "common/strace.h"
 #include "common/unified_log.h"
 #include "host/platform_compile_info.h"
-#include "host/runtime_timeout_config.h"
 #include "utils/device_arena.h"
 #include "prepare_callable_common.h"
 
@@ -246,32 +245,6 @@ static bool resolve_ring_config(
     return true;
 }
 
-static int32_t resolve_scheduler_timeout_ms() {
-    RuntimeTimeoutParseStatus parse_status;
-    RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(
-        RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status
-    );
-    if (!parse_status.scheduler_env_set) {
-        return 0;
-    }
-    if (!parse_status.scheduler_valid) {
-        const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV);
-        LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env);
-        return 0;
-    }
-
-    RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform());
-    if (status != RuntimeTimeoutOrderStatus::OK) {
-        LOG_WARN(
-            "Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV,
-            cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status),
-            (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
-        );
-        return 0;
-    }
-    return cfg.scheduler_timeout_ms;
-}
-
 static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
     if (runtime == nullptr || host_header == nullptr) {
         return 0;
@@ -348,13 +321,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
 // Effective ring sizing for one (callable_id, config): the input half of the
 // arena description. Resolved once per config from per-task overrides + env +
 // compile-time defaults; depends on nothing that varies per run. `total_heap`
-// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms`
-// is the resolved per-platform scheduler no-progress budget.
+// and `sm_size` are the derived backing-allocation sizes.
 struct ArenaSizingConfig {
     uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
     uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
     int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
-    int32_t scheduler_timeout_ms;
     uint64_t total_heap;
     uint64_t sm_size;
 };
@@ -369,7 +340,7 @@ struct StaticArenaPtrs {
 
 // per-(cid,config): resolve the arena sizing. Pure host arithmetic over
 // per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the
-// total heap (with overflow check) and SM sizes and the scheduler timeout.
+// total heap (with overflow check) and SM sizes.
 // Returns false on an invalid ring config or a heap-size overflow.
 static bool resolve_arena_sizing(
     const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out
@@ -397,7 +368,6 @@ static bool resolve_arena_sizing(
         out->total_heap += out->heap_sizes[r];
     }
     out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes);
-    out->scheduler_timeout_ms = resolve_scheduler_timeout_ms();
     return true;
 }
 
@@ -554,7 +524,6 @@ static bool build_runtime_image(
 ) {
     PTO2RuntimeArenaLayout layout =
         runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
-    layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
     if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
         LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
         return false;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 557dbfd32..187de69a1 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -98,15 +98,13 @@ struct PTO2RuntimeOps {
 
 /**
  * Sizing half of the runtime-arena layout: the capacities that *define* the
- * layout (the input to runtime_reserve_layout) plus the scheduler timeout.
- * Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct
- * ring/heap/dep-pool capacities and the scheduler no-progress budget.
+ * layout (the input to runtime_reserve_layout). Stable per (callable_id, ring
+ * config); re-read at AICPU boot to reconstruct ring/heap/dep-pool capacities.
  */
 struct ArenaSizingKey {
     uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
     uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
     int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
-    int32_t scheduler_timeout_ms{0};
 };
 
 /**
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 68a17bfcc..beea5ada4 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -16,6 +16,7 @@
 
 #include "common.h"  // debug_assert
 #include "common/unified_log.h"
+#include "aicpu/aicpu_device_config.h"
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
 #include "callable.h"
@@ -600,10 +601,14 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     // "now" so the first budget cycle starts when this thread does, not at
     // an undefined value.
     uint64_t last_progress_ts = get_sys_cnt_aicpu();
+    // Per-device override latched once at worker init by simpler_aicpu_init
+    // (InitArgs.scheduler_timeout_ms -> resident-SO global). 0 means no
+    // override; fall back to the compile-time SCHEDULER_TIMEOUT_CYCLES.
     uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
-    if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) {
-        scheduler_timeout_cycles = static_cast<uint64_t>(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) *
-                                   (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+    const int32_t scheduler_timeout_ms_override = get_scheduler_timeout_ms();
+    if (scheduler_timeout_ms_override > 0) {
+        scheduler_timeout_cycles =
+            static_cast<uint64_t>(scheduler_timeout_ms_override) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
     }
 
     while (true) {
diff --git a/src/common/platform/include/aicpu/aicpu_device_config.h b/src/common/platform/include/aicpu/aicpu_device_config.h
new file mode 100644
index 000000000..38ec22e30
--- /dev/null
+++ b/src/common/platform/include/aicpu/aicpu_device_config.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Per-device AICPU runtime config.
+ *
+ * Home for run-invariant per-device knobs that simpler_aicpu_init latches once
+ * at worker init (from InitArgs) into resident-SO globals surviving every
+ * subsequent per-task launch. The runtime consumes these read-only; they do
+ * NOT ride the per-run KernelArgs or the arena layout. Add fields here as new
+ * per-device config appears rather than threading it through the per-run path.
+ *
+ * Kept separate from platform_regs (which is strictly per-core register
+ * addressing) so neither file accretes the other's concern.
+ */
+
+#ifndef PLATFORM_COMMON_AICPU_DEVICE_CONFIG_H_
+#define PLATFORM_COMMON_AICPU_DEVICE_CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Latch the AICPU scheduler no-progress watchdog timeout (ms). Called once per
+ * device: by simpler_aicpu_init (InitArgs.scheduler_timeout_ms) onboard, or by
+ * the sim host right after SO load. The scheduler dispatch loop applies it
+ * only when > 0; otherwise it keeps the compile-time SCHEDULER_TIMEOUT_CYCLES.
+ */
+void set_scheduler_timeout_ms(int timeout_ms);
+
+/** Return the latched scheduler watchdog override in ms (0 if unset). */
+int get_scheduler_timeout_ms(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // PLATFORM_COMMON_AICPU_DEVICE_CONFIG_H_
diff --git a/src/common/platform/include/host/runtime_timeout_config.h b/src/common/platform/include/host/runtime_timeout_config.h
index 98d15c2f6..a8244677a 100644
--- a/src/common/platform/include/host/runtime_timeout_config.h
+++ b/src/common/platform/include/host/runtime_timeout_config.h
@@ -40,6 +40,10 @@ struct RuntimeTimeoutConfig {
 struct HostRuntimeTimeoutConfig {
     uint64_t op_execute_timeout_us;
     int32_t stream_sync_timeout_ms;
+    // AICPU scheduler no-progress watchdog override (ms). 0 means "no env
+    // override" — the device falls back to its compile-time default
+    // (SCHEDULER_TIMEOUT_CYCLES). Latched once per device into InitArgs.
+    int32_t scheduler_timeout_ms{0};
 };
 
 struct RuntimeTimeoutParseStatus {
diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp
index 9e42263b7..cba43b423 100644
--- a/src/common/platform/onboard/host/device_runner_base.cpp
+++ b/src/common/platform/onboard/host/device_runner_base.cpp
@@ -91,15 +91,25 @@ HostRuntimeTimeoutConfig resolve_onboard_timeout_config() {
     bool host_timeout_env_set =
         parse_status.op_execute_env_set || parse_status.stream_sync_env_set || parse_status.scheduler_env_set;
     RuntimeTimeoutOrderStatus order_status = validate_runtime_timeout_order(cfg);
+    // The scheduler override is forwarded to the device (via InitArgs at init)
+    // only when explicitly set, valid, and consistent with the op/stream
+    // ordering. 0 means "no override" — the AICPU scheduler then keeps its
+    // compile-time default. op/stream remain host-side acl knobs.
+    int32_t scheduler_override = (parse_status.scheduler_env_set && parse_status.scheduler_valid &&
+                                  order_status == RuntimeTimeoutOrderStatus::OK) ?
+                                     cfg.scheduler_timeout_ms :
+                                     0;
     if (host_timeout_env_set && order_status != RuntimeTimeoutOrderStatus::OK) {
         LOG_WARN(
             "Ignoring PTO2 timeout env overrides: %s (scheduler=%d ms, op_execute=%llu us, stream_sync=%d ms)",
             runtime_timeout_order_status_name(order_status), cfg.scheduler_timeout_ms,
             (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
         );
-        return HostRuntimeTimeoutConfig{order_defaults.op_execute_timeout_us, order_defaults.stream_sync_timeout_ms};
+        return HostRuntimeTimeoutConfig{
+            order_defaults.op_execute_timeout_us, order_defaults.stream_sync_timeout_ms, scheduler_override
+        };
     }
-    return HostRuntimeTimeoutConfig{cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms};
+    return HostRuntimeTimeoutConfig{cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms, scheduler_override};
 }
 
 }  // namespace
@@ -374,6 +384,9 @@ int DeviceRunnerBase::ensure_aicpu_init_launched() {
     init_args.device_id = static_cast<uint32_t>(device_id_);
     init_args.log_level = static_cast<uint32_t>(HostLogger::get_instance().level());
     init_args.log_info_v = static_cast<uint32_t>(HostLogger::get_instance().info_v());
+    // Per-device scheduler watchdog override, resolved once at attach into
+    // timeout_config_. 0 -> the AICPU scheduler keeps its compile-time default.
+    init_args.scheduler_timeout_ms = timeout_config_.scheduler_timeout_ms;
 
     LOG_INFO_V0("=== launch_aicpu_payload %s ===", host::KernelNames::InitName);
     int rc = launch_aicpu_payload(
diff --git a/src/common/platform/shared/aicpu/aicpu_device_config.cpp b/src/common/platform/shared/aicpu/aicpu_device_config.cpp
new file mode 100644
index 000000000..4d6d38e91
--- /dev/null
+++ b/src/common/platform/shared/aicpu/aicpu_device_config.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "aicpu/aicpu_device_config.h"
+
+namespace {
+// Latched once per device by simpler_aicpu_init; survives every per-task launch
+// because the AICPU inner SO stays dlopen'd for the runner's life. Always >= 0.
+int g_scheduler_timeout_ms = 0;
+}  // namespace
+
+// Negative values are not a valid timeout; fold them into 0 ("no override").
+void set_scheduler_timeout_ms(int timeout_ms) { g_scheduler_timeout_ms = timeout_ms > 0 ? timeout_ms : 0; }
+int get_scheduler_timeout_ms() { return g_scheduler_timeout_ms; }