diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7008917a8..e0fbf5f8e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,7 @@ jobs: - name: Checkout uses: actions/checkout@v5 with: + submodules: recursive fetch-depth: 0 - name: Install build and lint tools @@ -71,6 +72,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Set up C++ compiler (Linux) if: runner.os == 'Linux' @@ -113,6 +116,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 @@ -175,6 +180,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -264,6 +271,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -354,6 +363,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Set up C++ compiler run: | @@ -454,6 +465,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -546,6 +559,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -622,6 +637,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v5 with: + submodules: recursive fetch-depth: 0 - name: Check file changes id: check @@ -668,6 +684,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -743,6 +761,8 @@ jobs: steps: - name: 
Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml index 2444fab32..7661e19f0 100644 --- a/.github/workflows/sanitizers.yml +++ b/.github/workflows/sanitizers.yml @@ -38,6 +38,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..e8945dcab --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "tools/tracr"] + path = tools/tracr + url = https://github.com/huawei-csl/TracR.git diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 2ed86cdf2..7220228d3 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -25,6 +25,9 @@ #include #include +#include +#include + #include "pto_orchestration_api.h" #define FUNC_QK_MATMUL 0 @@ -89,6 +92,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta CYCLE_COUNT_START(); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Read_Dimensions, 0); + // Read dimensions from tensor metadata uint64_t batch = orch_args.tensor(0).ref().shapes[0]; uint64_t num_heads = orch_args.tensor(0).ref().shapes[1]; @@ -107,6 +112,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Reshape_Kernels, 0); + // Reshape tensors for kernel consumption (2D flattened) void *query_ptr = 
orch_args.tensor(0).ref().data_as(); void *kc_ptr = orch_args.tensor(1).ref().data_as(); @@ -136,6 +143,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta Tensor context_lens = make_tensor_external(orch_args.tensor(4).ref().data_as(), cl_shapes, 1, DataType::INT32, false); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Pre_Loop_Info, uint32_t(batch)); + // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size uint32_t tile2d_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; uint32_t scalar_shapes[1] = {static_cast(q_tile)}; @@ -148,11 +157,16 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta PROF_INC(prof_make_count, 4); CYCLE_COUNT_LAP(prof_make_tensor); + LOG_INFO_V0( + "Thread %d: Orch PTO2_SCOPE loop: #batch=%" PRIu64 ", q_loop=%" PRIu64, g_TraCR_thread_idx, batch, q_loop + ); + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { uint32_t cl_idx[1] = {static_cast(b_idx)}; uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, PTO2_SCOPE_, uint32_t(b_idx + batch * q_idx)); PTO2_SCOPE() { CYCLE_COUNT_LAP(prof_scope); uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; @@ -250,7 +264,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta CYCLE_COUNT_LAP(prof_scope); } } - #ifdef ENABLE_PROFILING uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + prof_submit_task + prof_scope; diff --git a/simpler_setup/kernel_compiler.py b/simpler_setup/kernel_compiler.py index 2ea17b66e..85ac7aed5 100644 --- a/simpler_setup/kernel_compiler.py +++ b/simpler_setup/kernel_compiler.py @@ -143,7 +143,17 @@ def get_orchestration_include_dirs(self, runtime_name: str) -> list[str]: runtime_dir = 
str(self.project_root / "src" / arch / "runtime" / runtime_name / "runtime") runtime_common_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "common") common_dir = str(self.project_root / "src" / "common" / "task_interface") - return [runtime_dir, runtime_common_dir, common_dir] + self.get_platform_include_dirs() + tracr_dir1 = str(self.project_root / "tools") + tracr_dir2 = str(self.project_root / "tools" / "tracr" / "include") + tracr_dir3 = str(self.project_root / "tools" / "tracr" / "extern") + return [ + runtime_dir, + runtime_common_dir, + common_dir, + tracr_dir1, + tracr_dir2, + tracr_dir3, + ] + self.get_platform_include_dirs() def get_incore_include_dirs(self) -> list[str]: """ @@ -487,6 +497,15 @@ def _compile_orchestration_shared_lib( if sys.platform != "darwin": cmd.append("-Wl,--build-id=sha1") + if os.getenv("BUILD_TRACR", "OFF") == "ON": + cmd.extend( + [ + "-DENABLE_TRACR", + "-DTRACR_DISABLE_FLUSH", + "-DUSE_HW_COUNTER", + ] + ) + if extra_sources: for src in extra_sources: src = os.path.abspath(src) diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index f044df3df..84f76fb46 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -63,6 +63,19 @@ message(STATUS "AICpu kernel: ${NUM_AICPU_KERNEL_SOURCES} source files") message(VERBOSE "AICpu kernel sources: ${AICPU_KERNEL_SOURCES}") add_library(aicpu_kernel SHARED ${AICPU_KERNEL_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(aicpu_kernel) + +# Option to make the Orchestrator run independently (i.e. 
finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH) +endif() + option(WERROR "Treat compiler warnings as errors" ON) if(DEFINED ENV{SIMPLER_DISABLE_WARNINGS_AS_ERRORS}) set(WERROR OFF) diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index f0a403b14..bbef3da5b 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -91,6 +91,19 @@ message(STATUS "Host runtime: ${NUM_HOST_RUNTIME_SOURCES} source files") message(VERBOSE "Host runtime sources: ${HOST_RUNTIME_SOURCES}") add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(host_runtime) + +# Optional: to make the Orchestrator run independently (i.e. 
finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(host_runtime PRIVATE INDEP_ORCH) +endif() + # C++ standard (applied only to C++ files) set_target_properties(host_runtime PROPERTIES CXX_STANDARD 17 diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index de517a9cc..9d3c8ef3e 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -28,6 +28,9 @@ #include #include #include + +#include + #include "acl/acl.h" // Include HAL constants from CANN (header only, library loaded dynamically) @@ -264,6 +267,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { if (prepare_runtime_for_launch(runtime, block_dim, launch_aicpu_num) != 0) return -1; + // Initialize TraCR memory on the device +#ifdef ENABLE_TRACR + // LOG_INFO_V9("[TraCR] thread[%d] DevAllocTraCR device_id_=%d", sched_getcpu(), device_id_); + rc = DevAllocTraCR(this, runtime); + if (rc != 0) { + LOG_ERROR("DevAllocTraCR failed rc=%d", rc); + return rc; + } +#endif // a2a3 onboard now uses the same host-computed, device-filtered affinity // shape as a5. Host probes the AICPU user pool once, chooses the active // cpu_ids deterministically, writes them into Runtime, and the AICPU-side @@ -471,6 +483,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { read_device_wall_ns(); + // Download and Free TraCR memory from Device and store in memory (~/ascend/) +#ifdef ENABLE_TRACR + rc = StoreTracrData(this, runtime); + if (rc != 0) { + LOG_ERROR("FreeTraCR failed: %d", rc); + return -1; + } +#endif + // Tear down collectors. stop() joins mgmt then collector in the only safe // order (mgmt's final-drain pass into L2 has poll as its consumer). 
teardown_shared_collectors_after_run(); diff --git a/src/a2a3/platform/sim/aicpu/CMakeLists.txt b/src/a2a3/platform/sim/aicpu/CMakeLists.txt index dd18486c1..11fa4349d 100644 --- a/src/a2a3/platform/sim/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/sim/aicpu/CMakeLists.txt @@ -70,6 +70,27 @@ endif() # Create shared library (host-compatible for dlopen) add_library(aicpu_kernel SHARED ${AICPU_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(aicpu_kernel) + +# TODO: move this somewhere such that EVERY platform launches this once. Placing this here is hacky... +# Only build the host-side trace post-processor when TraCR is enabled: it is an +# offline analysis tool, and it pulls in Linux-only APIs (sched_getcpu) that do +# not compile on the macOS packaging build. +if(BUILD_TRACR) + include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr_postprocessing_script.cmake) +endif() + +# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH) +endif() + option(WERROR "Treat compiler warnings as errors" ON) if(DEFINED ENV{SIMPLER_DISABLE_WARNINGS_AS_ERRORS}) set(WERROR OFF) diff --git a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt index 3b9f283f0..e460cdd7c 100644 --- a/src/a2a3/platform/sim/host/CMakeLists.txt +++ b/src/a2a3/platform/sim/host/CMakeLists.txt @@ -75,6 +75,19 @@ endif() # Create shared library add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(host_runtime) + +# Optional: to make the Orchestrator run independently (i.e. 
finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(host_runtime PRIVATE INDEP_ORCH) +endif() + # C++ standard (applied only to C++ files) set_target_properties(host_runtime PROPERTIES CXX_STANDARD 17 diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index c8e0929b6..232eba512 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include "aicpu/device_phase_aicpu.h" #include "aicpu/platform_aicpu_affinity.h" #include "callable_protocol.h" @@ -253,6 +255,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { worker_count_ = num_aicore; runtime.set_aicpu_thread_num(launch_aicpu_num); + // Initialize TraCR memory on the device +#ifdef ENABLE_TRACR + rc = DevAllocTraCR(this, runtime); + if (rc != 0) { + LOG_ERROR("DevAllocTraCR failed rc=%d", rc); + return rc; + } +#endif + int num_aic = block_dim; uint32_t enable_profiling_flag = PROFILING_FLAG_NONE; if (enable_dump_tensor_) { @@ -571,6 +582,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return runtime_rc; } + // Download and Free TraCR memory from Device and store in memory (~/ascend/) +#ifdef ENABLE_TRACR + rc = StoreTracrData(this, runtime); + if (rc != 0) { + LOG_ERROR("FreeTraCR failed: %d", rc); + return -1; + } +#endif + // Tear down collectors. stop() joins mgmt then collector in the only safe // order (mgmt's final-drain pass into L2 has poll as its consumer). 
if (enable_l2_swimlane_) { diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h index 82bb7c193..39ddabdc3 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -20,7 +20,12 @@ // Tensor dump uses these defaults to size its selective mask table so task-id // ring/slot lookup stays aligned with PTO2 task id layout. +#ifdef INDEP_ORCH +#define PTO2_TASK_WINDOW_SIZE 65536 // Default per-ring task window size (power of 2) +#else #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) -#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers +#endif + +#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers #endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 9f787c8c1..d22d20713 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -186,6 +186,11 @@ class Runtime { // Task storage Task tasks[RUNTIME_MAX_TASKS]; // Fixed-size task array + // TraCR data placeholder + // Those are the pointers with the allocated memory on the device + void *tracrData_; + void *tracrDataSizes_; + // Filter-style affinity gate input (a2a3 onboard). Placed AFTER `tasks` // because AICore reads runtime->tasks[] by offset. 
Host fills these before // launch from AICPU OCCUPY; the device gate keeps threads whose @@ -239,6 +244,10 @@ class Runtime { void set_worker_count(int n) { worker_count = n; } int get_aicpu_thread_num() const { return aicpu_thread_num; } void set_aicpu_thread_num(int n) { aicpu_thread_num = n; } + void *get_tracr_data() const { return tracrData_; } + void set_tracr_data(void *p) { tracrData_ = p; } + void *get_tracr_data_sizes() const { return tracrDataSizes_; } + void set_tracr_data_sizes(void *p) { tracrDataSizes_ = p; } Handshake *get_workers() { return workers; } int32_t get_aicpu_allowed_cpu_count() const { return aicpu_allowed_cpu_count; } void set_aicpu_allowed_cpu_count(int32_t n) { aicpu_allowed_cpu_count = n; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 152f82fc4..f43f79894 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -22,6 +22,9 @@ #include #endif +#include +#include + #include "aicpu/device_time.h" #include "aicpu/device_phase_aicpu.h" #include "aicpu/orch_so_file.h" @@ -192,6 +195,8 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return 0; } + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Initializing, uint32_t(tracr_getcpu())); + LOG_INFO_V0("AicpuExecutor: Initializing"); if (runtime == nullptr) { @@ -458,6 +463,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx); } + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Allocating, 0); + // sm_handle / rt are bound to *this* run's memory and must be // (re)created every run, regardless of whether the SO itself was // reused above. 
@@ -591,6 +598,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING orch_cycle_start = get_sys_cnt_aicpu(); #endif + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Orchestrating, thread_idx); framework_bind_runtime(rt); if (*p_bind != nullptr) { (*p_bind)(rt); @@ -734,6 +742,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (rt == nullptr) { LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); } else { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Scheduling, thread_idx); sched_ctx_.bind_runtime(rt); if (serial_orch_sched_) { sched_ctx_.wait_for_orchestration_done_before_dispatch(runtime, thread_idx); @@ -751,6 +760,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // Always shutdown AICore — even if sched_ctx_.completed_ was already true. // platform_deinit_aicore_regs is idempotent; orchestrator threads have // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly. + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, De_Initializing, 0); int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx); if (shutdown_rc != 0 && run_rc == 0) { run_rc = shutdown_rc; @@ -825,6 +835,64 @@ void AicpuExecutor::deinit(Runtime *runtime) { // ===== Public Entry Point ===== +/** + * init tracr profiler + * + * NOTE: make sure g_TraCR_thread_idx starts at 0 and follows in the positive direction + */ +inline void TRACR_START() { + g_TraCR_thread_idx = g_TraCR_thread_idx_counter.fetch_add(1, std::memory_order_relaxed); + + if (g_TraCR_thread_idx == 0) { + INSTRUMENTATION_START(); + } else { + INSTRUMENTATION_THREAD_INIT(); + } +} + +/** + * finalizing tracr function + * + * NOTE: make sure g_TraCR_thread_idx starts at 0 and follows in the positive direction + */ +inline void TRACR_FINALIZE(Runtime *runtime) { + (void)(runtime); + +#ifdef ENABLE_TRACR + LOG_INFO_V9( + "[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, + runtime->get_tracr_data() + ); + + if 
(g_TraCR_thread_idx >= 0 && g_TraCR_thread_idx < runtime->get_aicpu_thread_num()) { + if (runtime->get_tracr_data() != nullptr && tracrThread->_traceIdx > 0) { + TraCR::Payload *tracrData = reinterpret_cast(runtime->get_tracr_data()); + const size_t payload_size = tracrThread->_traceIdx * sizeof(TraCR::Payload); + + std::memcpy(&tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], tracrThread->_traces.data(), payload_size); + } + + if (runtime->get_tracr_data_sizes() != nullptr) { + size_t *tracrDataSizes = reinterpret_cast(runtime->get_tracr_data_sizes()); + tracrDataSizes[g_TraCR_thread_idx] = tracrThread->_traceIdx; + } + } else { + LOG_ERROR( + "[TraCR] thread index %d out of bounds (max=%d)", g_TraCR_thread_idx, runtime->get_aicpu_thread_num() + ); + } +#endif + + if (g_TraCR_thread_idx == 0) { + INSTRUMENTATION_END(); + g_TraCR_thread_idx_counter.store(0, std::memory_order_relaxed); + } else { + INSTRUMENTATION_THREAD_FINALIZE(); + } + + g_TraCR_thread_idx = -1; +} + // Device orchestration SO registration entry. Exported directly by the runtime // (not via a platform forwarding shell): registration is a TMARB-only ability, // so the symbol lives where the capability does. host_build_graph does not @@ -871,6 +939,12 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution"); + // INIT TraCR all threads coming in + TRACR_START(); + LOG_INFO_V9( + "[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, tracr_getcpu(), INSTRUMENTATION_ACTIVE + ); + // Each phase is bracketed by its own scope so the start/end boundaries are // visible and an early `return` still records the end via the guard dtor. // rc / runtime_rc are declared out here because they outlive their phase. 
@@ -909,6 +983,10 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { g_aicpu_executor.deinit(runtime); } + INSTRUMENTATION_MARK_RESET(g_TraCR_thread_idx); + // Finalize TraCR all threads coming in + TRACR_FINALIZE(runtime); + if (runtime_rc != 0) { LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); return runtime_rc; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 01194134a..584f5dc87 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -63,17 +63,26 @@ // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. // Actual window size is passed at runtime to runtime_create_from_sm(). // Use pto2_task_slot(sched, task_id) for slot calculation. +#ifdef INDEP_ORCH +#define PTO2_TASK_WINDOW_SIZE 65536 // Default per-ring task window size (power of 2) +#else #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) +#endif // Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) // Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) #define PTO2_MAX_RING_DEPTH 4 // Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) +#ifdef INDEP_ORCH +#define PTO2_HEAP_SIZE (256 * 1024 * 1024 * 2) // 512MB per ring (2GB total) +#else #define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) -#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries -#define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool -#define PTO2_TENSORMAP_NUM_BUCKETS 4096 // Power of 2 for fast hash (4096×8B=32KB fits L1) +#endif + +#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries +#define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool +#define PTO2_TENSORMAP_NUM_BUCKETS 4096 
// Power of 2 for fast hash (4096×8B=32KB fits L1) // Scope management #define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth @@ -91,7 +100,11 @@ #define PTO2_EARLY_DISPATCH_QUEUE_SIZE 64 // Wiring queue +#ifdef INDEP_ORCH +#define PTO2_WRIRING_QUEUE_SIZE 65536 // Per-shape queue size +#else #define PTO2_WRIRING_QUEUE_SIZE 1024 // Per-shape queue size +#endif // Fanin storage #define PTO2_FANIN_INLINE_CAP 64 diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 8a41434de..656a98740 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -175,6 +175,11 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // PTO2 integration: kernel_id -> GM function_bin_addr mapping uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; + // TraCR data placeholder + // Those are the pointers with the allocated memory on the device + void *tracrData_; + void *tracrDataSizes_; + // Serial orchestrator -> scheduler start control. // When true, scheduler threads wait until orchestration has fully built the // task graph before entering resolve_and_dispatch(). 
@@ -243,6 +248,10 @@ class Runtime { void set_worker_count(int n) { dev.worker_count = n; } int get_aicpu_thread_num() const { return dev.aicpu_thread_num; } void set_aicpu_thread_num(int n) { dev.aicpu_thread_num = n; } + void *get_tracr_data() const { return dev.tracrData_; } + void set_tracr_data(void *p) { dev.tracrData_ = p; } + void *get_tracr_data_sizes() const { return dev.tracrDataSizes_; } + void set_tracr_data_sizes(void *p) { dev.tracrDataSizes_ = p; } Handshake *get_workers() { return dev.workers; } int32_t get_aicpu_allowed_cpu_count() const { return dev.aicpu_allowed_cpu_count; } void set_aicpu_allowed_cpu_count(int32_t n) { dev.aicpu_allowed_cpu_count = n; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 8e1813367..6823307f4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -13,6 +13,9 @@ #include #include +#include +#include + #include "common/unified_log.h" #include "aicpu/dep_gen_collector_aicpu.h" #include "aicpu/device_phase_aicpu.h" @@ -984,6 +987,7 @@ SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched void SchedulerContext::deinit() { // Reset all per-core execution state for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + i); core_exec_states_[i] = {}; core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp index 774589865..f6409d077 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp +++ 
b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp @@ -12,6 +12,9 @@ #include +#include +#include + #include "common/unified_log.h" #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" @@ -384,6 +387,7 @@ void SchedulerContext::check_running_cores_for_completion( #endif ); cur_thread_completed++; + INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + core_id); } if (t.running_done) { complete_slot_task( @@ -395,6 +399,7 @@ void SchedulerContext::check_running_cores_for_completion( #endif ); cur_thread_completed++; + INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + core_id); } // 2. Update slot data diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index c4a10369d..a5c7696f6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -14,6 +14,9 @@ #include #include +#include +#include + #include "common.h" // debug_assert #include "common/unified_log.h" @@ -167,10 +170,12 @@ SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core( build_payload(payload, slot_state, subslot, async_ctx, block_idx); if (to_pending) { + INSTRUMENTATION_MARK_SET(sched_thread_num_ + 1 + core_id, Running_Task_Pair, 0); core_exec_state.pending_subslot = subslot; core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); } else { + INSTRUMENTATION_MARK_SET(sched_thread_num_ + 1 + core_id, Running_Task_Single, 0); core_exec_state.running_subslot = subslot; core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); @@ -816,6 +821,17 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ constexpr bool pmu_active = false; #endif +#ifdef 
INDEP_ORCH + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Barrier, orchestrator_done_.load(std::memory_order_relaxed)); + LOG_INFO_V9( + "[TraCR] Thread %d: Waiting before the Orch to finish: %d, orchestrator_done_=%d", g_TraCR_thread_idx, + g_TraCR_thread_idx_counter.load(), orchestrator_done_.load(std::memory_order_relaxed) + ); + while (!orchestrator_done_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } +#endif + #if PTO2_PROFILING l2_swimlane.sched_start_ts = get_sys_cnt_aicpu(); #endif @@ -924,6 +940,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #endif // Phase 1: Check running cores for completion + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase1, 0); int32_t completed_this_turn = 0; bool try_completed = tracker.has_any_running_cores(); @@ -1020,6 +1037,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // Phase 2 drain check if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase2, 0); handle_drain_mode(thread_idx); continue; } @@ -1030,6 +1048,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ wired = sched_->drain_wiring_queue(orchestrator_done_.load(std::memory_order_relaxed)); if (wired > 0) { made_progress = true; + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase3, 0); #if PTO2_SCHED_PROFILING l2_swimlane.phase_wiring_count += wired; #endif @@ -1066,6 +1085,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ constexpr int DUMMY_DRAIN_BATCH = 16; PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH]; int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH); + + if (dummy_got > 0) { + (void)(dummy_got); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase3b, 0); + } + #if PTO2_PROFILING // Dummy outer phase: covers handling of all dummies popped this // iter. 
Per-dummy DummyTask markers are emitted to a SEPARATE lane @@ -1075,6 +1100,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ uint64_t dummy_outer_t0 = (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; #endif + for (int di = 0; di < dummy_got; di++) { PTO2TaskSlotState &dummy_slot = *dummy_batch[di]; @@ -1175,6 +1201,8 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // Phase 4: MIX-strict-priority dispatch with phase-split and // cross-thread idle gating. See dispatch_ready_tasks for the policy. + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase4, 0); + #if PTO2_PROFILING uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; #endif @@ -1334,6 +1362,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ 0; #endif while (deferred_release_count > 0) { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Drain, 0); #if PTO2_SCHED_PROFILING (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); #else diff --git a/tools/tracr b/tools/tracr new file mode 160000 index 000000000..f0574cdad --- /dev/null +++ b/tools/tracr @@ -0,0 +1 @@ +Subproject commit f0574cdad14f56da69706533f3fa8e4b75b4f476 diff --git a/tools/tracr.cmake b/tools/tracr.cmake new file mode 100644 index 000000000..55013897c --- /dev/null +++ b/tools/tracr.cmake @@ -0,0 +1,114 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+# Directory that contains this tracr.cmake file
+set(TRACR_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}")
+
+# BUILD_TRACR toggles the TraCR instrumentation. It is a CMake option whose value is
+# overridden by the BUILD_TRACR environment variable whenever that variable is defined.
+# Use: BUILD_TRACR=ON pip install --no-build-isolation -e '.[test]'
+# Default is 'OFF'
+option(BUILD_TRACR "Enable TraCR" OFF)
+if(DEFINED ENV{BUILD_TRACR})
+    set(BUILD_TRACR $ENV{BUILD_TRACR})
+endif()
+
+# tracr_enable(<target>)
+#
+# Adds the TraCR include directories to <target> (always) and, when BUILD_TRACR is
+# true, the TraCR compile definitions:
+#   ENABLE_TRACR, TRACR_DISABLE_FLUSH, USE_HW_COUNTER   - always added
+#   TRACR_CAPACITY=<n>                                   - only if the TRACR_CAPACITY cache entry is non-empty
+#   TRACR_POLICY_PERIODIC / TRACR_POLICY_IGNORE_IF_FULL  - only if selected through the TRACR_POLICY cache entry
+#
+# Fails the configure step if <target> does not exist, if tracr.hpp cannot be found,
+# or if TRACR_CAPACITY / TRACR_POLICY hold an unsupported value.
+function(tracr_enable target)
+    message(STATUS "Enabling TraCR '${BUILD_TRACR}' for target: ${target}")
+
+    if (NOT TARGET ${target})
+        message(FATAL_ERROR "Target '${target}' does not exist.")
+    endif()
+
+    # Create the TraCR include directory path
+    set(TRACR_INCLUDE_DIR
+        ${TRACR_ROOT_DIR}/tracr/include
+    )
+
+    # Check that the TraCR header is actually present before using the directory
+    if (NOT EXISTS "${TRACR_INCLUDE_DIR}/tracr/tracr.hpp")
+        message(FATAL_ERROR
+            "tracr.hpp not found at ${TRACR_INCLUDE_DIR}/tracr/tracr.hpp"
+        )
+    endif()
+
+    # Append the nlohmann json path as well
+    set(TRACR_INCLUDE_DIR
+        ${TRACR_INCLUDE_DIR}
+        ${TRACR_ROOT_DIR}
+        ${TRACR_ROOT_DIR}/tracr/extern
+    )
+
+    # --- include the directories ---
+    # SYSTEM: TraCR and its vendored third-party headers (e.g. extern/nlohmann
+    # json) are external to simpler and don't compile cleanly under the build's
+    # -Wall -Wextra -Werror (modern clang flags nlohmann's deprecated literal
+    # operators). Marking them system suppresses warnings from those headers.
+    target_include_directories(${target} SYSTEM PRIVATE
+        ${TRACR_INCLUDE_DIR}
+    )
+
+    # --- compiler flags of TraCR ---
+    if (BUILD_TRACR)
+        # Flag to enable/disable TraCR calls at compile time
+        target_compile_definitions(${target} PRIVATE ENABLE_TRACR)
+
+        # TraCR threads capacity (default is 1<<20 ~= 1 million traces per thread = ~17MB per thread buffer size)
+        set(TRACR_CAPACITY "" CACHE STRING "Optional TraCR buffer capacity (empty = use internal default)")
+
+        if(NOT "${TRACR_CAPACITY}" STREQUAL "")
+            message(STATUS "TraCR adding capacity: ${TRACR_CAPACITY}")
+
+            # Require at least one non-zero digit so that "0" (and "000") is rejected:
+            # the value must be a strictly positive integer.
+            if(NOT TRACR_CAPACITY MATCHES "^0*[1-9][0-9]*$")
+                message(FATAL_ERROR "TRACR_CAPACITY must be a positive integer")
+            endif()
+
+            target_compile_definitions(${target} PRIVATE
+                TRACR_CAPACITY=${TRACR_CAPACITY}
+            )
+        endif()
+
+        # As the traces are collected on the Ascend device,
+        # there is no need to store them on the device filesystem.
+        target_compile_definitions(${target} PRIVATE TRACR_DISABLE_FLUSH USE_HW_COUNTER)
+
+        # TraCR full size buffer modes:
+        #   default (none): Abort if buffer is full
+        #   TRACR_POLICY_PERIODIC: If buffer is full, overwrite from the beginning
+        #   TRACR_POLICY_IGNORE_IF_FULL: If buffer is full, ignore incoming traces
+        # if (TRACR_POLICY)
+        #     target_compile_definitions(${target} PRIVATE TRACR_POLICY_PERIODIC)
+        # endif()
+        set(TRACR_POLICY "" CACHE STRING "TraCR policy (empty = use C++ default)")
+
+        set_property(CACHE TRACR_POLICY PROPERTY STRINGS
+            ""  # default: abort if full
+            TRACR_POLICY_PERIODIC
+            TRACR_POLICY_IGNORE_IF_FULL
+        )
+
+        if(NOT "${TRACR_POLICY}" STREQUAL "")
+            if(TRACR_POLICY STREQUAL "TRACR_POLICY_PERIODIC")
+                message(STATUS "TraCR adding policy: 'TRACR_POLICY_PERIODIC'")
+                target_compile_definitions(${target} PRIVATE TRACR_POLICY_PERIODIC)
+            elseif(TRACR_POLICY STREQUAL "TRACR_POLICY_IGNORE_IF_FULL")
+                message(STATUS "TraCR adding policy: 'TRACR_POLICY_IGNORE_IF_FULL'")
+                target_compile_definitions(${target} PRIVATE TRACR_POLICY_IGNORE_IF_FULL)
+            else()
+                message(FATAL_ERROR "Unknown TRACR_POLICY: ${TRACR_POLICY}")
+            endif()
+        else()
+            message(STATUS "No TraCR policy given: using C++ default")
+        endif()
+
+        # Flag to enable TraCR debugging prints (TODO: Not yet working!)
+        # if (TRACR_DEBUG)
+        #     target_compile_definitions(${target} PRIVATE ENABLE_TRACR_DEBUG)
+        # endif()
+    endif()
+endfunction()
diff --git a/tools/tracr_postprocessing_script.cmake b/tools/tracr_postprocessing_script.cmake
new file mode 100644
index 000000000..b5cb4e49e
--- /dev/null
+++ b/tools/tracr_postprocessing_script.cmake
@@ -0,0 +1,28 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+# Builds the TraCR post-processing executable (tracr_process) and places it,
+# together with the Paraver state.cfg, in BUILD_DIR.
+
+# Output directory, expressed relative to the directory of this script
+set(BUILD_DIR "${CMAKE_CURRENT_LIST_DIR}/../build/output/bin/")
+
+message(STATUS "TraCR: REAL_SOURCE_DIR: '${CMAKE_CURRENT_LIST_DIR}'")
+
+# Paraver format configuration file (copied verbatim: COPYONLY disables variable substitution)
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/tracr/postprocessing/paraver/state.cfg
+    ${BUILD_DIR}/state.cfg
+    COPYONLY
+)
+
+add_executable(tracr_process ${CMAKE_CURRENT_LIST_DIR}/tracr/postprocessing/tracr_process.cpp)
+
+# Must come after add_executable(): tracr_enable() aborts if the target does not exist yet
+tracr_enable(tracr_process)
+
+# Set the output directory for the compiled executable
+set_target_properties(tracr_process PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${BUILD_DIR}
+)
diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp
new file mode 100644
index 000000000..5b8110f9b
--- /dev/null
+++ b/tools/tracr_simpler_api.hpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * TraCR API functions for Simpler A2A3, A2A3sim, A5, A5sim
+ *
+ * TODO: A5 not yet able to test
+ */
+
+#pragma once
+
+// NOTE(review): the header names below were reconstructed from the identifiers this
+// file uses; confirm them against the upstream commit.
+#include <filesystem>  // C++17 or newer
+#include <cerrno>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+#include <tracr/tracr.hpp>
+
+namespace fs = std::filesystem;
+using json = nlohmann::json;
+
+// TraCR profiling/benchmarking stuff
+
+/**
+ * Read the initial sample id from the PYPTO_RUN_SAMPLE_ID environment variable.
+ *
+ * @return The parsed value, or 0 when the variable is unset, does not start with a
+ *         number, or is out of range (instead of throwing during static initialization).
+ */
+inline size_t getSampleID() {
+    const char *env = std::getenv("PYPTO_RUN_SAMPLE_ID");
+    if (env == nullptr) return 0;
+
+    char *end = nullptr;
+    errno = 0;
+    const unsigned long value = std::strtoul(env, &end, 10);
+    if (end == env || errno == ERANGE) return 0;
+    return value;
+}
+
+// `inline` gives these header-defined globals a single definition across all
+// translation units that include this file.
+inline size_t sampleID = getSampleID();
+
+inline std::string tracr_dir = "~/ascend/tracr/proc.1";
+
+/**
+ * Replace a leading '~' in `path` with the value of the HOME environment variable.
+ *
+ * @param path Path that may start with '~'; any other path is returned unchanged.
+ * @return The expanded path.
+ * @throws std::runtime_error if `path` starts with '~' and HOME is not set.
+ */
+inline fs::path expand_user_path(const std::string &path) {
+    if (!path.empty() && path[0] == '~') {
+        const char *home = std::getenv("HOME");
+        if (!home) throw std::runtime_error("HOME not set");
+
+        std::string sub = path.substr(1);                        // remove ~
+        if (!sub.empty() && sub[0] == '/') sub = sub.substr(1);  // remove leading slash
+
+        return fs::path(home) / sub;
+    }
+    return fs::path(path);
+}
+
+/**
+ * Write the traces of every thread to <tracr_dir>/thread.<t + 1>/traces.bts as raw bytes.
+ *
+ * Threads without any trace are skipped (no directory or file is created for them).
+ *
+ * @param tracrData      Buffer made of `num_threads` consecutive slices of TraCR::CAPACITY payloads.
+ * @param tracrDataSizes Number of valid payloads at the start of each thread's slice.
+ * @param num_threads    Number of slices in `tracrData` / entries in `tracrDataSizes`.
+ * @return 0 on success, -1 if a size exceeds TraCR::CAPACITY or a file cannot be opened or written.
+ */
+inline int TracrData2BTS(const TraCR::Payload *tracrData, const size_t *tracrDataSizes, const size_t num_threads) {
+    fs::path base_dir = expand_user_path(tracr_dir);
+
+    fs::create_directories(base_dir);
+
+    for (uint32_t t = 0; t < num_threads; ++t) {
+        size_t num_traces = tracrDataSizes[t];
+
+        if (num_traces == 0) continue;
+
+        if (num_traces > TraCR::CAPACITY) {
+            LOG_ERROR("Thread %u exceeds CAPACITY", t);
+            return -1;
+        }
+
+        fs::path thread_dir = base_dir / ("thread." + std::to_string(t + 1));
+
+        fs::create_directories(thread_dir);
+
+        fs::path file_path = thread_dir / "traces.bts";
+
+        std::ofstream out(file_path, std::ios::binary);
+        if (!out) {
+            // A fs::path must not be passed through printf-style varargs; hand over a C string.
+            LOG_ERROR("Cannot open %s", file_path.string().c_str());
+            return -1;
+        }
+
+        const TraCR::Payload *thread_ptr = tracrData + t * TraCR::CAPACITY;
+
+        out.write(reinterpret_cast<const char *>(thread_ptr), num_traces * sizeof(TraCR::Payload));
+
+        if (!out) {
+            LOG_ERROR("Write failed for %s", file_path.string().c_str());
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/**
+ * Store the TraCR metadata.json inside the current `tracr_dir`.
+ *
+ * The file lists the channel names (one "AICPU_<i>" per AICPU thread, "AICube_<i>" for
+ * a third of the worker count, "AIVector_<i>" for two thirds of it, and a final
+ * "INVALID" entry), the number of channels, and the marker type names keyed by their
+ * two-digit, 1-based index.
+ *
+ * @param runtime Object providing get_aicpu_thread_num() and get_worker_count().
+ * @return 0 on success, -1 if metadata.json cannot be opened or written.
+ */
+template <typename RuntimeT>
+int StoreTracrMetaData(RuntimeT &runtime) {
+    fs::path base_dir = expand_user_path(tracr_dir);
+
+    // Add the metadata.json
+    nlohmann::json metadata;
+
+    // channel_names
+    nlohmann::json channel_names = nlohmann::json::array();
+    for (int i = 0; i < runtime.get_aicpu_thread_num(); ++i) {
+        channel_names.push_back("AICPU_" + std::to_string(i));
+    }
+    for (int i = 0; i < static_cast<int>(runtime.get_worker_count() / 3); ++i) {
+        channel_names.push_back("AICube_" + std::to_string(i));
+    }
+    for (int i = 0; i < static_cast<int>(2 * runtime.get_worker_count() / 3); ++i) {
+        channel_names.push_back("AIVector_" + std::to_string(i));
+    }
+    channel_names.push_back("INVALID");
+
+    metadata["channel_names"] = channel_names;
+    metadata["num_channels"] = channel_names.size();
+
+    // markerTypes
+    metadata["markerTypes"] = nlohmann::json::object();
+
+    for (int i = 0; i < MARKERTYPE_COUNT; ++i) {
+        std::ostringstream oss;
+        oss << std::setw(2) << std::setfill('0') << (i + 1);
+        metadata["markerTypes"][oss.str()] = MarkerTypeNames[i];
+    }
+
+    metadata["pid"] = 1;
+    metadata["start_time"] = 0;
+    metadata["tid"] = 0;
+
+    const fs::path metadata_path = base_dir / ("metadata.json");
+
+    std::ofstream file(metadata_path);
+    if (!file.is_open()) {
+        LOG_ERROR("Failed to open file for writing.\n");
+        return -1;
+    }
+
+    // Dump JSON into file
+    file << metadata.dump(4);
+
+    // Close the file
+    file.close();
+
+    // Report a failed write/flush the same way TracrData2BTS does
+    if (!file) {
+        LOG_ERROR("Write failed for %s", metadata_path.string().c_str());
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * A function for extracting
the TraCR data from the Device to Host
+ *
+ * Copies the trace buffer and the per-thread trace counts from the device, writes
+ * them below "~/ascend/tracr_<sampleID>/proc.<1000 + device id>" (incrementing the
+ * global sampleID), frees both device buffers and finally stores metadata.json.
+ *
+ * @param device_runner Object providing copy_from_device(), free_tensor() and device_id().
+ * @param runtime       Object holding the device pointers and the AICPU thread count.
+ * @return 0 on success, -1 or the failing call's return code otherwise. The device
+ *         buffers are only freed once the traces have been written successfully.
+ */
+template <typename DeviceRunnerT, typename RuntimeT>
+int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) {
+    static_assert(
+        std::is_trivially_copyable_v<TraCR::Payload>, "TraCR::Payload must be trivially copyable for raw binary dump"
+    );
+
+    if (runtime.get_tracr_data() == nullptr) {
+        LOG_ERROR("runtime.tracrData_ is a nullptr");
+        return -1;
+    }
+
+    if (runtime.get_tracr_data_sizes() == nullptr) {
+        LOG_ERROR("runtime.tracrDataSizes_ is a nullptr");
+        return -1;
+    }
+
+    if (runtime.get_aicpu_thread_num() <= 0) {
+        LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", runtime.get_aicpu_thread_num());
+        return -1;
+    }
+
+    // Known to be positive after the check above
+    const size_t num_threads = static_cast<size_t>(runtime.get_aicpu_thread_num());
+
+    // Download the tracrData_ from Device to Host
+    // NOTE(review): the cast target type is assumed to be `void *` - confirm against
+    // the copy_from_device() signature.
+    size_t size = sizeof(TraCR::Payload) * TraCR::CAPACITY * num_threads;
+    std::vector<TraCR::Payload> tracrData(TraCR::CAPACITY * num_threads);
+    int rc = device_runner->copy_from_device(
+        reinterpret_cast<void *>(tracrData.data()), reinterpret_cast<void *>(runtime.get_tracr_data()), size
+    );
+    if (rc != 0) {
+        LOG_ERROR("device_runner->copy_from_device 'tracrData' failed rc=%d", rc);
+        return rc;
+    }
+
+    // Download the tracrDataSizes_ from Device to Host
+    size = sizeof(size_t) * num_threads;
+    std::vector<size_t> tracrDataSizes(num_threads);
+    rc = device_runner->copy_from_device(
+        reinterpret_cast<void *>(tracrDataSizes.data()), reinterpret_cast<void *>(runtime.get_tracr_data_sizes()),
+        size
+    );
+    if (rc != 0) {
+        LOG_ERROR("device_runner->copy_from_device 'tracrDataSizes' failed rc=%d", rc);
+        return rc;
+    }
+
+    // Now, store the traces into '~/ascend/tracr_<sampleID>/proc.<1000 + device id>/'
+    tracr_dir =
+        "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." + std::to_string(1000 + device_runner->device_id());
+    rc = TracrData2BTS(tracrData.data(), tracrDataSizes.data(), num_threads);
+    if (rc != 0) {
+        LOG_ERROR("TracrData2BTS() failed");
+        return rc;
+    }
+
+    // Free device TraCR memory data placeholder and drop the now-dangling pointers,
+    // so that a later call hits the nullptr checks instead of touching freed memory.
+    device_runner->free_tensor(runtime.get_tracr_data());
+    runtime.set_tracr_data(nullptr);
+    device_runner->free_tensor(runtime.get_tracr_data_sizes());
+    runtime.set_tracr_data_sizes(nullptr);
+
+    rc = StoreTracrMetaData(runtime);
+    if (rc != 0) {
+        LOG_ERROR("StoreTracrMetaData failed: %d", rc);
+        return rc;
+    }
+
+    return 0;
+}
+
+/**
+ * A method for allocating memory on the device
+ *
+ * Allocates one buffer of TraCR::CAPACITY payloads per AICPU thread and one size_t
+ * counter per AICPU thread, and stores both pointers in `runtime`.
+ *
+ * Polymorphic to A2A3 and A5 (should be)
+ *
+ * @param device_runner Object providing allocate_tensor() and free_tensor().
+ * @param runtime       Object receiving the two device pointers.
+ * @return 0 on success, -1 if either allocation fails; in that case no TraCR
+ *         buffer stays allocated and the trace data pointer is reset to nullptr.
+ */
+template <typename DeviceRunnerT, typename RuntimeT>
+int DevAllocTraCR(DeviceRunnerT *device_runner, RuntimeT &runtime) {
+    const size_t size = sizeof(TraCR::Payload) * runtime.get_aicpu_thread_num() * TraCR::CAPACITY;
+    const size_t sizes_bytes = runtime.get_aicpu_thread_num() * sizeof(size_t);
+
+    // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.get_tracr_data());
+    runtime.set_tracr_data(device_runner->allocate_tensor(size));
+    if (runtime.get_tracr_data() == nullptr) {
+        LOG_ERROR("runtime.tracrData_: alloc %zu bytes failed", size);
+        return -1;
+    }
+    // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.get_tracr_data());
+    runtime.set_tracr_data_sizes(device_runner->allocate_tensor(sizes_bytes));
+    if (runtime.get_tracr_data_sizes() == nullptr) {
+        LOG_ERROR("runtime.tracrDataSizes_: alloc %zu bytes failed", sizes_bytes);
+        // Roll back the first allocation so the failure leaves nothing behind
+        device_runner->free_tensor(runtime.get_tracr_data());
+        runtime.set_tracr_data(nullptr);
+        return -1;
+    }
+    return 0;
+}
diff --git a/tools/tracr_simpler_markers.hpp b/tools/tracr_simpler_markers.hpp
new file mode 100644
index 000000000..99fd8cf59
--- /dev/null
+++ b/tools/tracr_simpler_markers.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * TraCR Simpler Marker Types
+ */
+
+#pragma once
+
+#include <atomic>
+#include <string_view>
+
+// sched_getcpu() is a glibc/Linux-only API, but the simulator/host build also
+// compiles on non-Linux targets (e.g. the macOS packaging CI). Route the TraCR
+// call sites through this portable shim instead of calling sched_getcpu directly.
+#if defined(__linux__)
+#include <sched.h>
+/** Return sched_getcpu() on Linux. */
+inline int tracr_getcpu() { return sched_getcpu(); }
+#else
+/** Non-Linux fallback: always returns -1. */
+inline int tracr_getcpu() { return -1; }
+#endif
+
+// Global TraCR thread idx counter
+inline std::atomic<int> g_TraCR_thread_idx_counter{0};
+
+// Global thread local thread idx placeholder (-1 until a value is assigned)
+inline thread_local int g_TraCR_thread_idx{-1};
+
+// X-macro listing every marker type exactly once. Both the MarkerType enum and
+// the MarkerTypeNames table are generated from it, so they always stay in sync.
+#define MARKER_TYPES       \
+    X(Orchestrating)       \
+    X(Read_Dimensions)     \
+    X(Reshape_Kernels)     \
+    X(Pre_Loop_Info)       \
+    X(PTO2_SCOPE_)         \
+    X(Scheduling)          \
+    X(Phase1)              \
+    X(Phase2)              \
+    X(Phase3)              \
+    X(Phase3b)             \
+    X(Phase4)              \
+    X(Drain)               \
+    X(Initializing)        \
+    X(De_Initializing)     \
+    X(DLL_loading)         \
+    X(Allocating)          \
+    X(Running_Task_Single) \
+    X(Running_Task_Pair)   \
+    X(Barrier)
+
+// Marker identifiers, numbered from 0 in MARKER_TYPES order.
+// MARKERTYPE_COUNT is the number of marker types, not a marker itself.
+enum MarkerType {
+#define X(name) name,
+    MARKER_TYPES
+#undef X
+
+        MARKERTYPE_COUNT
+};
+
+// Printable name of each marker, indexed by its MarkerType value.
+// `inline` keeps a single table shared by every translation unit.
+inline constexpr std::string_view MarkerTypeNames[] = {
+#define X(name) #name,
+    MARKER_TYPES
+#undef X
+};
+
+static_assert(
+    sizeof(MarkerTypeNames) / sizeof(MarkerTypeNames[0]) == MARKERTYPE_COUNT,
+    "MarkerTypeNames must have exactly one entry per MarkerType"
+);