diff --git a/docs/dfx/host-trace.md b/docs/dfx/host-trace.md index 62370c882..fdbf69dc3 100644 --- a/docs/dfx/host-trace.md +++ b/docs/dfx/host-trace.md @@ -45,7 +45,7 @@ One line per span, emitted on scope exit simpler_run (= host_wall) ├─ simpler_run.bind │ ├─ simpler_run.bind.args (ntensor=N: per-tensor device_malloc + H2D) -│ └─ simpler_run.bind.prebuilt (prebuilt runtime-arena image build + upload) +│ └─ simpler_run.bind.prebuilt (prebuilt runtime-arena cache hit or build + upload) ├─ simpler_run.runner_run (launch + blocking sync on the AICPU) │ └─ simpler_run.runner_run.device_wall (whole on-NPU AICPU wall) │ └─ .{preamble,so_load,graph_build,config_validate,arena_wire,sm_reset,post_orch,orch,sched} diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 235c90280..9d4f6b32b 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -684,6 +684,12 @@ int DeviceRunner::finalize() { cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); // Free the 8-byte device_wall buffer (allocated lazily in run()) before // mem_alloc_.finalize(). diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 543754c9e..01ff96d8e 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -150,6 +150,14 @@ struct HostApi { void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); void *(*acquire_pooled_runtime_arena)(); + bool (*lookup_prebuilt_runtime_arena_cache)( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size + ); + void (*mark_prebuilt_runtime_arena_cached)( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, + void *runtime_arena_base, size_t runtime_off, const void *image_data, size_t image_size + ); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 67e92fb29..64227e261 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include "../common/pto_runtime_status.h" #include "../runtime/pto_runtime2.h" @@ -320,12 +321,14 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const // Effective ring sizing for one (callable_id, config): the input half of the // arena description. Resolved once per config from per-task overrides + env + -// compile-time defaults; depends on nothing that varies per run. `total_heap` -// and `sm_size` are the derived backing-allocation sizes. +// compile-time defaults; depends on nothing that varies per run. struct ArenaSizingConfig { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; +}; + +struct ArenaStaticSizes { uint64_t total_heap; uint64_t sm_size; }; @@ -338,10 +341,44 @@ struct StaticArenaPtrs { void *runtime_arena_dev; }; -// per-(cid,config): resolve the arena sizing. Pure host arithmetic over -// per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the -// total heap (with overflow check) and SM sizes. -// Returns false on an invalid ring config or a heap-size overflow. +struct PrebuiltRuntimeArenaCacheProbe { + uint64_t hash{0}; + std::vector serialized_key{}; +}; + +static void hash_mix_u64(uint64_t *hash, uint64_t value) { + constexpr uint64_t kFnvPrime = 1099511628211ULL; + for (int i = 0; i < 8; i++) { + *hash ^= (value >> (i * 8)) & 0xff; + *hash *= kFnvPrime; + } +} + +static void append_cache_key_u64(std::vector *out, uint64_t value) { + for (int i = 0; i < 8; i++) { + out->push_back(static_cast((value >> (i * 8)) & 0xff)); + } +} + +static PrebuiltRuntimeArenaCacheProbe make_prebuilt_runtime_arena_cache_probe(const ArenaSizingConfig &sizing) { + PrebuiltRuntimeArenaCacheProbe probe; + uint64_t hash = 1469598103934665603ULL; + probe.serialized_key.reserve(PTO2_MAX_RING_DEPTH * 3 * sizeof(uint64_t)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + hash_mix_u64(&hash, sizing.task_window_sizes[r]); + append_cache_key_u64(&probe.serialized_key, sizing.task_window_sizes[r]); + hash_mix_u64(&hash, sizing.heap_sizes[r]); + append_cache_key_u64(&probe.serialized_key, sizing.heap_sizes[r]); + hash_mix_u64(&hash, static_cast(sizing.dep_pool_capacities[r])); + append_cache_key_u64(&probe.serialized_key, static_cast(sizing.dep_pool_capacities[r])); + } + probe.hash = hash; + return probe; +} + +// per-(cid,config): resolve the cache-key sizing knobs. Pure host parsing over +// per-task overrides, PTO2_RING_* env, and compile-time defaults. Derived +// allocation sizes are computed only on cache miss. static bool resolve_arena_sizing( const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out ) { @@ -359,15 +396,19 @@ static bool resolve_arena_sizing( dep_pool_log.c_str() ); + return true; +} + +static bool derive_arena_static_sizes(const ArenaSizingConfig &sizing, ArenaStaticSizes *out) { out->total_heap = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (out->heap_sizes[r] > std::numeric_limits::max() - out->total_heap) { + if (sizing.heap_sizes[r] > std::numeric_limits::max() - out->total_heap) { LOG_ERROR("Total ring heap size overflows uint64_t"); return false; } - out->total_heap += out->heap_sizes[r]; + out->total_heap += sizing.heap_sizes[r]; } - out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes); + out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(sizing.task_window_sizes); return true; } @@ -461,13 +502,15 @@ static void apply_orch_sched_env_flags(Runtime *runtime) { // shot. The runtime-arena size is recovered by replaying the (pure, cheap) // reserve sequence on a throwaway host arena. Idempotent across runs — the // pools are owned by DeviceRunner and freed in DeviceRunner::finalize(). -static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizing, StaticArenaPtrs *out) { +static bool ensure_static_arenas( + Runtime *runtime, const ArenaSizingConfig &sizing, const ArenaStaticSizes &sizes, StaticArenaPtrs *out +) { DeviceArena sizing_arena; // discarded; only its computed arena_size is read PTO2RuntimeArenaLayout layout = runtime_reserve_layout(sizing_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities); int64_t t_setup_start = _now_ms(); - if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.offsets.arena_size) != 0) { + if (runtime->host_api.setup_static_arena(sizes.total_heap, sizes.sm_size, layout.offsets.arena_size) != 0) { LOG_ERROR("Failed to setup pooled static arena"); return false; } @@ -518,8 +561,8 @@ static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizi // host Runtime (bind_launch_state), since the AICPU needs that pointer *before* // it can dereference the image. static bool build_runtime_image( - const ArenaSizingConfig &sizing, const StaticArenaPtrs &ptrs, DeviceArena *host_arena, - PTO2RuntimeArenaLayout *out_layout + const ArenaSizingConfig &sizing, const ArenaStaticSizes &sizes, const StaticArenaPtrs &ptrs, + DeviceArena *host_arena, PTO2RuntimeArenaLayout *out_layout ) { PTO2RuntimeArenaLayout layout = runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities); @@ -529,7 +572,7 @@ static bool build_runtime_image( } PTO2Runtime *rt = runtime_init_data_from_layout( - *host_arena, layout, PTO2_MODE_EXECUTE, ptrs.gm_sm, sizing.sm_size, ptrs.gm_heap, sizing.heap_sizes + *host_arena, layout, PTO2_MODE_EXECUTE, ptrs.gm_sm, sizes.sm_size, ptrs.gm_heap, sizing.heap_sizes ); if (rt == nullptr) { LOG_ERROR("runtime_init_data_from_layout failed"); @@ -561,6 +604,51 @@ static bool bind_launch_state( return true; } +static int bind_cached_runtime_image( + Runtime *runtime, const PrebuiltRuntimeArenaCacheProbe &probe, const ChipStorageTaskArgs &device_args +) { + if (runtime->host_api.lookup_prebuilt_runtime_arena_cache == nullptr) { + return 1; + } + + void *gm_heap = nullptr; + void *sm_ptr = nullptr; + void *runtime_arena_dev = nullptr; + size_t runtime_off = 0; + const void *cached_image = nullptr; + size_t cached_image_size = 0; + bool cache_hit = runtime->host_api.lookup_prebuilt_runtime_arena_cache( + probe.hash, probe.serialized_key.data(), probe.serialized_key.size(), &gm_heap, &sm_ptr, &runtime_arena_dev, + &runtime_off, &cached_image, &cached_image_size + ); + if (!cache_hit) { + return 1; + } + + runtime->set_orch_args(device_args); + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, cached_image, cached_image_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy cached prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_gm_sm_ptr(sm_ptr); + runtime->set_prebuilt_arena(runtime_arena_dev, runtime_off); + return 0; +} + +static void store_prebuilt_runtime_image( + Runtime *runtime, const PrebuiltRuntimeArenaCacheProbe &probe, const StaticArenaPtrs &ptrs, + const PTO2RuntimeArenaLayout &layout, const DeviceArena &host_arena +) { + if (runtime->host_api.mark_prebuilt_runtime_arena_cached == nullptr) { + return; + } + runtime->host_api.mark_prebuilt_runtime_arena_cached( + probe.hash, probe.serialized_key.data(), probe.serialized_key.size(), ptrs.gm_heap, ptrs.gm_sm, + ptrs.runtime_arena_dev, layout.offsets.off_runtime, host_arena.base(), layout.offsets.arena_size + ); +} + /** * Per-run binding: build device-side argument storage (tensor copy-out, GM * heap, PTO2 shared memory) and publish it to the runtime. Assumes the @@ -621,19 +709,32 @@ extern "C" int bind_callable_to_runtime_impl( int64_t t_prebuilt_start = _now_ms(); { STRACE("simpler_run.bind.prebuilt"); - StaticArenaPtrs ptrs; - if (!ensure_static_arenas(runtime, sizing, &ptrs)) { + PrebuiltRuntimeArenaCacheProbe cache_probe = make_prebuilt_runtime_arena_cache_probe(sizing); + int cache_rc = bind_cached_runtime_image(runtime, cache_probe, device_args); + if (cache_rc < 0) { return -1; } + if (cache_rc != 0) { + ArenaStaticSizes sizes; + if (!derive_arena_static_sizes(sizing, &sizes)) { + return -1; + } - DeviceArena host_arena; // libc malloc backend; owns the image until upload - PTO2RuntimeArenaLayout layout; - if (!build_runtime_image(sizing, ptrs, &host_arena, &layout)) { - return -1; - } + StaticArenaPtrs ptrs; + if (!ensure_static_arenas(runtime, sizing, sizes, &ptrs)) { + return -1; + } - if (!bind_launch_state(runtime, ptrs, host_arena, layout, device_args)) { - return -1; + DeviceArena host_arena; // libc malloc backend; owns the image until upload + PTO2RuntimeArenaLayout layout; + if (!build_runtime_image(sizing, sizes, ptrs, &host_arena, &layout)) { + return -1; + } + + if (!bind_launch_state(runtime, ptrs, host_arena, layout, device_args)) { + return -1; + } + store_prebuilt_runtime_image(runtime, cache_probe, ptrs, layout, host_arena); } } int64_t t_prebuilt_end = _now_ms(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index b6958757a..620b103ff 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -149,6 +149,14 @@ struct HostApi { void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); void *(*acquire_pooled_runtime_arena)(); + bool (*lookup_prebuilt_runtime_arena_cache)( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size + ); + void (*mark_prebuilt_runtime_arena_cached)( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, + void *runtime_arena_base, size_t runtime_off, const void *image_data, size_t image_size + ); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 638c2133d..f1fc92daa 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -654,6 +654,12 @@ int DeviceRunner::finalize() { cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); mem_alloc_.finalize(); clear_cpu_sim_shared_storage(); diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 54541807d..92e8d70c3 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -155,6 +155,14 @@ struct HostApi { void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); void *(*acquire_pooled_runtime_arena)(); + bool (*lookup_prebuilt_runtime_arena_cache)( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size + ); + void (*mark_prebuilt_runtime_arena_cached)( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, + void *runtime_arena_base, size_t runtime_off, const void *image_data, size_t image_size + ); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 67e92fb29..64227e261 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include "../common/pto_runtime_status.h" #include "../runtime/pto_runtime2.h" @@ -320,12 +321,14 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const // Effective ring sizing for one (callable_id, config): the input half of the // arena description. Resolved once per config from per-task overrides + env + -// compile-time defaults; depends on nothing that varies per run. `total_heap` -// and `sm_size` are the derived backing-allocation sizes. +// compile-time defaults; depends on nothing that varies per run. struct ArenaSizingConfig { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; +}; + +struct ArenaStaticSizes { uint64_t total_heap; uint64_t sm_size; }; @@ -338,10 +341,44 @@ struct StaticArenaPtrs { void *runtime_arena_dev; }; -// per-(cid,config): resolve the arena sizing. Pure host arithmetic over -// per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the -// total heap (with overflow check) and SM sizes. -// Returns false on an invalid ring config or a heap-size overflow. +struct PrebuiltRuntimeArenaCacheProbe { + uint64_t hash{0}; + std::vector serialized_key{}; +}; + +static void hash_mix_u64(uint64_t *hash, uint64_t value) { + constexpr uint64_t kFnvPrime = 1099511628211ULL; + for (int i = 0; i < 8; i++) { + *hash ^= (value >> (i * 8)) & 0xff; + *hash *= kFnvPrime; + } +} + +static void append_cache_key_u64(std::vector *out, uint64_t value) { + for (int i = 0; i < 8; i++) { + out->push_back(static_cast((value >> (i * 8)) & 0xff)); + } +} + +static PrebuiltRuntimeArenaCacheProbe make_prebuilt_runtime_arena_cache_probe(const ArenaSizingConfig &sizing) { + PrebuiltRuntimeArenaCacheProbe probe; + uint64_t hash = 1469598103934665603ULL; + probe.serialized_key.reserve(PTO2_MAX_RING_DEPTH * 3 * sizeof(uint64_t)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + hash_mix_u64(&hash, sizing.task_window_sizes[r]); + append_cache_key_u64(&probe.serialized_key, sizing.task_window_sizes[r]); + hash_mix_u64(&hash, sizing.heap_sizes[r]); + append_cache_key_u64(&probe.serialized_key, sizing.heap_sizes[r]); + hash_mix_u64(&hash, static_cast(sizing.dep_pool_capacities[r])); + append_cache_key_u64(&probe.serialized_key, static_cast(sizing.dep_pool_capacities[r])); + } + probe.hash = hash; + return probe; +} + +// per-(cid,config): resolve the cache-key sizing knobs. Pure host parsing over +// per-task overrides, PTO2_RING_* env, and compile-time defaults. Derived +// allocation sizes are computed only on cache miss. static bool resolve_arena_sizing( const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out ) { @@ -359,15 +396,19 @@ static bool resolve_arena_sizing( dep_pool_log.c_str() ); + return true; +} + +static bool derive_arena_static_sizes(const ArenaSizingConfig &sizing, ArenaStaticSizes *out) { out->total_heap = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (out->heap_sizes[r] > std::numeric_limits::max() - out->total_heap) { + if (sizing.heap_sizes[r] > std::numeric_limits::max() - out->total_heap) { LOG_ERROR("Total ring heap size overflows uint64_t"); return false; } - out->total_heap += out->heap_sizes[r]; + out->total_heap += sizing.heap_sizes[r]; } - out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes); + out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(sizing.task_window_sizes); return true; } @@ -461,13 +502,15 @@ static void apply_orch_sched_env_flags(Runtime *runtime) { // shot. The runtime-arena size is recovered by replaying the (pure, cheap) // reserve sequence on a throwaway host arena. Idempotent across runs — the // pools are owned by DeviceRunner and freed in DeviceRunner::finalize(). -static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizing, StaticArenaPtrs *out) { +static bool ensure_static_arenas( + Runtime *runtime, const ArenaSizingConfig &sizing, const ArenaStaticSizes &sizes, StaticArenaPtrs *out +) { DeviceArena sizing_arena; // discarded; only its computed arena_size is read PTO2RuntimeArenaLayout layout = runtime_reserve_layout(sizing_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities); int64_t t_setup_start = _now_ms(); - if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.offsets.arena_size) != 0) { + if (runtime->host_api.setup_static_arena(sizes.total_heap, sizes.sm_size, layout.offsets.arena_size) != 0) { LOG_ERROR("Failed to setup pooled static arena"); return false; } @@ -518,8 +561,8 @@ static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizi // host Runtime (bind_launch_state), since the AICPU needs that pointer *before* // it can dereference the image. static bool build_runtime_image( - const ArenaSizingConfig &sizing, const StaticArenaPtrs &ptrs, DeviceArena *host_arena, - PTO2RuntimeArenaLayout *out_layout + const ArenaSizingConfig &sizing, const ArenaStaticSizes &sizes, const StaticArenaPtrs &ptrs, + DeviceArena *host_arena, PTO2RuntimeArenaLayout *out_layout ) { PTO2RuntimeArenaLayout layout = runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities); @@ -529,7 +572,7 @@ static bool build_runtime_image( } PTO2Runtime *rt = runtime_init_data_from_layout( - *host_arena, layout, PTO2_MODE_EXECUTE, ptrs.gm_sm, sizing.sm_size, ptrs.gm_heap, sizing.heap_sizes + *host_arena, layout, PTO2_MODE_EXECUTE, ptrs.gm_sm, sizes.sm_size, ptrs.gm_heap, sizing.heap_sizes ); if (rt == nullptr) { LOG_ERROR("runtime_init_data_from_layout failed"); @@ -561,6 +604,51 @@ static bool bind_launch_state( return true; } +static int bind_cached_runtime_image( + Runtime *runtime, const PrebuiltRuntimeArenaCacheProbe &probe, const ChipStorageTaskArgs &device_args +) { + if (runtime->host_api.lookup_prebuilt_runtime_arena_cache == nullptr) { + return 1; + } + + void *gm_heap = nullptr; + void *sm_ptr = nullptr; + void *runtime_arena_dev = nullptr; + size_t runtime_off = 0; + const void *cached_image = nullptr; + size_t cached_image_size = 0; + bool cache_hit = runtime->host_api.lookup_prebuilt_runtime_arena_cache( + probe.hash, probe.serialized_key.data(), probe.serialized_key.size(), &gm_heap, &sm_ptr, &runtime_arena_dev, + &runtime_off, &cached_image, &cached_image_size + ); + if (!cache_hit) { + return 1; + } + + runtime->set_orch_args(device_args); + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, cached_image, cached_image_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy cached prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_gm_sm_ptr(sm_ptr); + runtime->set_prebuilt_arena(runtime_arena_dev, runtime_off); + return 0; +} + +static void store_prebuilt_runtime_image( + Runtime *runtime, const PrebuiltRuntimeArenaCacheProbe &probe, const StaticArenaPtrs &ptrs, + const PTO2RuntimeArenaLayout &layout, const DeviceArena &host_arena +) { + if (runtime->host_api.mark_prebuilt_runtime_arena_cached == nullptr) { + return; + } + runtime->host_api.mark_prebuilt_runtime_arena_cached( + probe.hash, probe.serialized_key.data(), probe.serialized_key.size(), ptrs.gm_heap, ptrs.gm_sm, + ptrs.runtime_arena_dev, layout.offsets.off_runtime, host_arena.base(), layout.offsets.arena_size + ); +} + /** * Per-run binding: build device-side argument storage (tensor copy-out, GM * heap, PTO2 shared memory) and publish it to the runtime. Assumes the @@ -621,19 +709,32 @@ extern "C" int bind_callable_to_runtime_impl( int64_t t_prebuilt_start = _now_ms(); { STRACE("simpler_run.bind.prebuilt"); - StaticArenaPtrs ptrs; - if (!ensure_static_arenas(runtime, sizing, &ptrs)) { + PrebuiltRuntimeArenaCacheProbe cache_probe = make_prebuilt_runtime_arena_cache_probe(sizing); + int cache_rc = bind_cached_runtime_image(runtime, cache_probe, device_args); + if (cache_rc < 0) { return -1; } + if (cache_rc != 0) { + ArenaStaticSizes sizes; + if (!derive_arena_static_sizes(sizing, &sizes)) { + return -1; + } - DeviceArena host_arena; // libc malloc backend; owns the image until upload - PTO2RuntimeArenaLayout layout; - if (!build_runtime_image(sizing, ptrs, &host_arena, &layout)) { - return -1; - } + StaticArenaPtrs ptrs; + if (!ensure_static_arenas(runtime, sizing, sizes, &ptrs)) { + return -1; + } - if (!bind_launch_state(runtime, ptrs, host_arena, layout, device_args)) { - return -1; + DeviceArena host_arena; // libc malloc backend; owns the image until upload + PTO2RuntimeArenaLayout layout; + if (!build_runtime_image(sizing, sizes, ptrs, &host_arena, &layout)) { + return -1; + } + + if (!bind_launch_state(runtime, ptrs, host_arena, layout, device_args)) { + return -1; + } + store_prebuilt_runtime_image(runtime, cache_probe, ptrs, layout, host_arena); } } int64_t t_prebuilt_end = _now_ms(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 43fab0733..9493c7217 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -157,6 +157,14 @@ struct HostApi { void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); void *(*acquire_pooled_runtime_arena)(); + bool (*lookup_prebuilt_runtime_arena_cache)( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size + ); + void (*mark_prebuilt_runtime_arena_cached)( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, + void *runtime_arena_base, size_t runtime_off, const void *image_data, size_t image_size + ); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute diff --git a/src/common/platform/onboard/host/c_api_shared.cpp b/src/common/platform/onboard/host/c_api_shared.cpp index 67cc88d93..041b6a27b 100644 --- a/src/common/platform/onboard/host/c_api_shared.cpp +++ b/src/common/platform/onboard/host/c_api_shared.cpp @@ -158,6 +158,30 @@ static void *acquire_pooled_runtime_arena_wrapper() { } } +static bool lookup_prebuilt_runtime_arena_cache_wrapper( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size +) { + try { + return current_runner()->lookup_prebuilt_runtime_arena_cache( + hash, key_data, key_size, gm_heap_base, sm_base, runtime_arena_base, runtime_off, image_data, image_size + ); + } catch (...) { + return false; + } +} + +static void mark_prebuilt_runtime_arena_cached_wrapper( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, void *runtime_arena_base, + size_t runtime_off, const void *image_data, size_t image_size +) { + try { + current_runner()->mark_prebuilt_runtime_arena_cached( + hash, key_data, key_size, gm_heap_base, sm_base, runtime_arena_base, runtime_off, image_data, image_size + ); + } catch (...) {} +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * @@ -478,6 +502,8 @@ int simpler_run( r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; + r->host_api.lookup_prebuilt_runtime_arena_cache = lookup_prebuilt_runtime_arena_cache_wrapper; + r->host_api.mark_prebuilt_runtime_arena_cached = mark_prebuilt_runtime_arena_cached_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; { diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp index cba43b423..623402609 100644 --- a/src/common/platform/onboard/host/device_runner_base.cpp +++ b/src/common/platform/onboard/host/device_runner_base.cpp @@ -214,6 +214,47 @@ void *DeviceRunnerBase::acquire_pooled_runtime_arena() { return runtime_arena_pool_.base(); } +bool DeviceRunnerBase::lookup_prebuilt_runtime_arena_cache( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size +) const { + if (!prebuilt_runtime_arena_cache_valid_ || prebuilt_runtime_arena_cache_hash_ != hash || + prebuilt_runtime_arena_cache_key_.size() != key_size || key_data == nullptr || gm_heap_base == nullptr || + sm_base == nullptr || runtime_arena_base == nullptr || runtime_off == nullptr || image_data == nullptr || + image_size == nullptr) { + return false; + } + if (std::memcmp(prebuilt_runtime_arena_cache_key_.data(), key_data, key_size) != 0) { + return false; + } + *gm_heap_base = prebuilt_runtime_arena_cache_gm_heap_base_; + *sm_base = prebuilt_runtime_arena_cache_sm_base_; + *runtime_arena_base = prebuilt_runtime_arena_cache_runtime_arena_base_; + *runtime_off = prebuilt_runtime_arena_cache_runtime_off_; + *image_data = prebuilt_runtime_arena_cache_image_.data(); + *image_size = prebuilt_runtime_arena_cache_image_.size(); + return true; +} + +void DeviceRunnerBase::mark_prebuilt_runtime_arena_cached( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, void *runtime_arena_base, + size_t runtime_off, const void *image_data, size_t image_size +) { + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_hash_ = hash; + prebuilt_runtime_arena_cache_key_.assign( + static_cast(key_data), static_cast(key_data) + key_size + ); + prebuilt_runtime_arena_cache_gm_heap_base_ = gm_heap_base; + prebuilt_runtime_arena_cache_sm_base_ = sm_base; + prebuilt_runtime_arena_cache_runtime_arena_base_ = runtime_arena_base; + prebuilt_runtime_arena_cache_runtime_off_ = runtime_off; + prebuilt_runtime_arena_cache_image_.assign( + static_cast(image_data), static_cast(image_data) + image_size + ); + prebuilt_runtime_arena_cache_valid_ = true; +} + int DeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt // runtime arena. Split out from a single large allocation because the @@ -225,7 +266,8 @@ int DeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, // worker's lifetime). If a caller asks for a larger layout on any // region, redo just that region — already-committed peers stay alive // so their callers don't have to re-acquire. - auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + bool arena_changed = false; + auto commit_region = [&arena_changed](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { if (requested_size == 0) { // hbg's runtime_arena path: caller passed 0 and never reserved // a region. Leave the arena uncommitted; acquire_pooled_* will @@ -233,6 +275,7 @@ int DeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, if (arena.is_committed() && cached_size != 0) { arena.release(); cached_size = 0; + arena_changed = true; } return 0; } @@ -241,6 +284,7 @@ int DeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, } arena.release(); cached_size = 0; + arena_changed = true; arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { // commit() failure leaves committed_=false, so the next entry's @@ -269,8 +313,22 @@ int DeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); return -1; } + if (arena_changed) { + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); + } return 0; } @@ -971,6 +1029,12 @@ int DeviceRunnerBase::finalize_common() { gm_heap_arena_.release(); gm_sm_arena_.release(); runtime_arena_pool_.release(); + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); // Free the 8-byte device_wall buffer (allocated lazily in run()) while // mem_alloc_ and the device context are still live. free_tensor() routes diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h index e828499bc..8bf16a077 100644 --- a/src/common/platform/onboard/host/device_runner_base.h +++ b/src/common/platform/onboard/host/device_runner_base.h @@ -124,6 +124,14 @@ class DeviceRunnerBase : public L3L2OrchCommBackend { void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); void *acquire_pooled_runtime_arena(); + bool lookup_prebuilt_runtime_arena_cache( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size + ) const; + void mark_prebuilt_runtime_arena_cached( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, + void *runtime_arena_base, size_t runtime_off, const void *image_data, size_t image_size + ); /** * Create a thread bound to this device. The thread calls @@ -803,6 +811,14 @@ class DeviceRunnerBase : public L3L2OrchCommBackend { size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; size_t cached_runtime_arena_size_{0}; + bool prebuilt_runtime_arena_cache_valid_{false}; + uint64_t prebuilt_runtime_arena_cache_hash_{0}; + std::vector prebuilt_runtime_arena_cache_key_; + void *prebuilt_runtime_arena_cache_gm_heap_base_{nullptr}; + void *prebuilt_runtime_arena_cache_sm_base_{nullptr}; + void *prebuilt_runtime_arena_cache_runtime_arena_base_{nullptr}; + size_t prebuilt_runtime_arena_cache_runtime_off_{0}; + std::vector prebuilt_runtime_arena_cache_image_; // Persistent AICPU / AICore streams created in // `ensure_device_initialized()` and torn down in the subclass's diff --git a/src/common/platform/sim/host/c_api_shared.cpp b/src/common/platform/sim/host/c_api_shared.cpp index ea9f4ddd3..b3a87a5e4 100644 --- a/src/common/platform/sim/host/c_api_shared.cpp +++ b/src/common/platform/sim/host/c_api_shared.cpp @@ -155,6 +155,30 @@ static void *acquire_pooled_runtime_arena_wrapper() { } } +static bool lookup_prebuilt_runtime_arena_cache_wrapper( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size +) { + try { + return current_runner()->lookup_prebuilt_runtime_arena_cache( + hash, key_data, key_size, gm_heap_base, sm_base, runtime_arena_base, runtime_off, image_data, image_size + ); + } catch (...) { + return false; + } +} + +static void mark_prebuilt_runtime_arena_cached_wrapper( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, void *runtime_arena_base, + size_t runtime_off, const void *image_data, size_t image_size +) { + try { + current_runner()->mark_prebuilt_runtime_arena_cached( + hash, key_data, key_size, gm_heap_base, sm_base, runtime_arena_base, runtime_off, image_data, image_size + ); + } catch (...) {} +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -429,6 +453,8 @@ int simpler_run( r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; + r->host_api.lookup_prebuilt_runtime_arena_cache = lookup_prebuilt_runtime_arena_cache_wrapper; + r->host_api.mark_prebuilt_runtime_arena_cached = mark_prebuilt_runtime_arena_cached_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; auto bind_result = runner->bind_callable_to_runtime(*r, callable_id); diff --git a/src/common/platform/sim/host/device_runner_base.cpp b/src/common/platform/sim/host/device_runner_base.cpp index 7d4da0573..547fe58b8 100644 --- a/src/common/platform/sim/host/device_runner_base.cpp +++ b/src/common/platform/sim/host/device_runner_base.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -87,11 +88,13 @@ int SimDeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_si // Idempotent for the production case (sizes do not change across a // worker's lifetime). If a caller asks for a larger layout on any // region, redo just that region. - auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + bool arena_changed = false; + auto commit_region = [&arena_changed](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { if (requested_size == 0) { if (arena.is_committed() && cached_size != 0) { arena.release(); cached_size = 0; + arena_changed = true; } return 0; } @@ -100,6 +103,7 @@ int SimDeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_si } arena.release(); cached_size = 0; + arena_changed = true; arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { arena.release(); @@ -122,11 +126,66 @@ int SimDeviceRunnerBase::setup_static_arena(size_t gm_heap_size, size_t gm_sm_si cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); return -1; } + if (arena_changed) { + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_key_.clear(); + prebuilt_runtime_arena_cache_gm_heap_base_ = nullptr; + prebuilt_runtime_arena_cache_sm_base_ = nullptr; + prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; + prebuilt_runtime_arena_cache_image_.clear(); + } return 0; } +bool SimDeviceRunnerBase::lookup_prebuilt_runtime_arena_cache( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size +) const { + if (!prebuilt_runtime_arena_cache_valid_ || prebuilt_runtime_arena_cache_hash_ != hash || + prebuilt_runtime_arena_cache_key_.size() != key_size || key_data == nullptr || gm_heap_base == nullptr || + sm_base == nullptr || runtime_arena_base == nullptr || runtime_off == nullptr || image_data == nullptr || + image_size == nullptr) { + return false; + } + if (std::memcmp(prebuilt_runtime_arena_cache_key_.data(), key_data, key_size) != 0) { + return false; + } + *gm_heap_base = prebuilt_runtime_arena_cache_gm_heap_base_; + *sm_base = prebuilt_runtime_arena_cache_sm_base_; + *runtime_arena_base = prebuilt_runtime_arena_cache_runtime_arena_base_; + *runtime_off = prebuilt_runtime_arena_cache_runtime_off_; + *image_data = prebuilt_runtime_arena_cache_image_.data(); + *image_size = prebuilt_runtime_arena_cache_image_.size(); + return true; +} + +void SimDeviceRunnerBase::mark_prebuilt_runtime_arena_cached( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, void *runtime_arena_base, + size_t runtime_off, const void *image_data, size_t image_size +) { + prebuilt_runtime_arena_cache_valid_ = false; + prebuilt_runtime_arena_cache_hash_ = hash; + prebuilt_runtime_arena_cache_key_.assign( + static_cast(key_data), static_cast(key_data) + key_size + ); + prebuilt_runtime_arena_cache_gm_heap_base_ = gm_heap_base; + prebuilt_runtime_arena_cache_sm_base_ = sm_base; + prebuilt_runtime_arena_cache_runtime_arena_base_ = runtime_arena_base; + prebuilt_runtime_arena_cache_runtime_off_ = runtime_off; + prebuilt_runtime_arena_cache_image_.assign( + static_cast(image_data), static_cast(image_data) + image_size + ); + prebuilt_runtime_arena_cache_valid_ = true; +} + void *SimDeviceRunnerBase::acquire_pooled_gm_heap() { if (!gm_heap_arena_.is_committed()) return nullptr; return gm_heap_arena_.base(); diff --git a/src/common/platform/sim/host/device_runner_base.h b/src/common/platform/sim/host/device_runner_base.h index e8e3d6886..a147bc015 100644 --- a/src/common/platform/sim/host/device_runner_base.h +++ b/src/common/platform/sim/host/device_runner_base.h @@ -77,6 +77,14 @@ class SimDeviceRunnerBase : public L3L2OrchCommBackend { void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); void *acquire_pooled_runtime_arena(); + bool lookup_prebuilt_runtime_arena_cache( + uint64_t hash, const void *key_data, size_t key_size, void **gm_heap_base, void **sm_base, + void **runtime_arena_base, size_t *runtime_off, const void **image_data, size_t *image_size + ) const; + void mark_prebuilt_runtime_arena_cached( + uint64_t hash, const void *key_data, size_t key_size, void *gm_heap_base, void *sm_base, + void *runtime_arena_base, size_t runtime_off, const void *image_data, size_t image_size + ); std::thread create_thread(std::function fn); int attach_current_thread(int device_id); @@ -198,6 +206,14 @@ class SimDeviceRunnerBase : public L3L2OrchCommBackend { size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; size_t cached_runtime_arena_size_{0}; + bool prebuilt_runtime_arena_cache_valid_{false}; + uint64_t prebuilt_runtime_arena_cache_hash_{0}; + std::vector prebuilt_runtime_arena_cache_key_; + void *prebuilt_runtime_arena_cache_gm_heap_base_{nullptr}; + void *prebuilt_runtime_arena_cache_sm_base_{nullptr}; + void *prebuilt_runtime_arena_cache_runtime_arena_base_{nullptr}; + size_t prebuilt_runtime_arena_cache_runtime_off_{0}; + std::vector prebuilt_runtime_arena_cache_image_; // Simulation state — written by run() / init_* and read by the AICPU / // AICore execute functions via the platform-regs setter functions. diff --git a/tests/lint/clang_tidy.py b/tests/lint/clang_tidy.py index 11d856d16..b8feb0af5 100644 --- a/tests/lint/clang_tidy.py +++ b/tests/lint/clang_tidy.py @@ -97,6 +97,11 @@ def _strip_gcc_flags(command: str) -> str: return shlex.join(filtered_parts) +def _strip_gcc_flags_from_args(arguments: list[str]) -> list[str]: + """Remove GCC-only flags from an argv-style compile command.""" + return [arg for arg in arguments if arg not in _GCC_ONLY_FLAGS] + + def _resolve_target_dirs(config_dir: Path, build_config: dict, target: str) -> tuple[list[str], list[str]]: """Resolve include and source dirs for a target from build_config.""" cfg = build_config[target] @@ -162,6 +167,10 @@ def _parse_compile_database(raw: str, db_file: Path) -> list[dict]: def _load_compile_database(db_file: Path) -> tuple[str, list[dict]]: """Load a compile database, rebuilding its target cache dir when it is broken.""" + if not db_file.is_file(): + print(f"WARNING: compile database disappeared, skipping: {db_file}", file=sys.stderr) + return "", [] + raw = db_file.read_text() try: return raw, _parse_compile_database(raw, db_file) @@ -169,8 +178,16 @@ def _load_compile_database(db_file: Path) -> tuple[str, list[dict]]: print(f"WARNING: invalid compile database detected: {exc}", file=sys.stderr) _reconfigure_compile_database(db_file) + if not db_file.is_file(): + print(f"WARNING: compile database recovery produced no file, skipping: {db_file}", file=sys.stderr) + return "", [] + rebuilt_raw = db_file.read_text() - return rebuilt_raw, _parse_compile_database(rebuilt_raw, db_file) + try: + return rebuilt_raw, _parse_compile_database(rebuilt_raw, db_file) + except (ValueError, json.JSONDecodeError) as exc: + print(f"WARNING: recovered compile database is still invalid, skipping: {exc}", file=sys.stderr) + return "", [] def _build_file_index() -> dict[str, list[Path]]: @@ -191,6 +208,8 @@ def _build_file_index() -> dict[str, list[Path]]: for entry in entries: if "command" in entry: entry["command"] = _strip_gcc_flags(entry["command"]) + if "arguments" in entry: + entry["arguments"] = _strip_gcc_flags_from_args(entry["arguments"]) db_file.write_text(json.dumps(entries, indent=2)) for entry in entries: filepath = entry["file"]