From 9f7efd7c70ef1a21c45781711ab3ecb8f5c3014a Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Wed, 10 Jun 2026 17:32:45 +0200 Subject: [PATCH 01/14] adding tracr back in again (working) --- .gitmodules | 3 + simpler_setup/kernel_compiler.py | 12 +- .../platform/onboard/aicpu/CMakeLists.txt | 13 + src/a2a3/platform/onboard/host/CMakeLists.txt | 13 + .../platform/onboard/host/device_runner.cpp | 22 ++ src/a2a3/platform/sim/aicpu/CMakeLists.txt | 16 ++ src/a2a3/platform/sim/host/CMakeLists.txt | 13 + src/a2a3/platform/sim/host/device_runner.cpp | 20 ++ .../host_build_graph/runtime/runtime.h | 5 + .../aicpu/aicpu_executor.cpp | 71 +++++ .../runtime/runtime.h | 5 + tools/tracr | 1 + tools/tracr.cmake | 111 ++++++++ tools/tracr_postprocessing_script.cmake | 29 ++ tools/tracr_simpler_api.hpp | 254 ++++++++++++++++++ tools/tracr_simpler_markers.hpp | 60 +++++ 16 files changed, 647 insertions(+), 1 deletion(-) create mode 100644 .gitmodules create mode 160000 tools/tracr create mode 100644 tools/tracr.cmake create mode 100644 tools/tracr_postprocessing_script.cmake create mode 100644 tools/tracr_simpler_api.hpp create mode 100644 tools/tracr_simpler_markers.hpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..e8945dcab --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "tools/tracr"] + path = tools/tracr + url = https://github.com/huawei-csl/TracR.git diff --git a/simpler_setup/kernel_compiler.py b/simpler_setup/kernel_compiler.py index 2ea17b66e..6d2e10f3f 100644 --- a/simpler_setup/kernel_compiler.py +++ b/simpler_setup/kernel_compiler.py @@ -143,7 +143,10 @@ def get_orchestration_include_dirs(self, runtime_name: str) -> list[str]: runtime_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "runtime") runtime_common_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "common") common_dir = str(self.project_root / "src" / "common" / "task_interface") - return [runtime_dir, runtime_common_dir, 
common_dir] + self.get_platform_include_dirs() + tracr_dir1 = str(self.project_root / "tools") + tracr_dir2 = str(self.project_root / "tools" / "tracr" / "include") + tracr_dir3 = str(self.project_root / "tools" / "tracr" / "extern") + return [runtime_dir, runtime_common_dir, common_dir, tracr_dir1, tracr_dir2, tracr_dir3] + self.get_platform_include_dirs() def get_incore_include_dirs(self) -> list[str]: """ @@ -487,6 +490,13 @@ def _compile_orchestration_shared_lib( if sys.platform != "darwin": cmd.append("-Wl,--build-id=sha1") + if os.getenv("BUILD_TRACR", "OFF") == "ON": + cmd.extend([ + "-DENABLE_TRACR", + "-DTRACR_DISABLE_FLUSH", + "-DUSE_HW_COUNTER", + ]) + if extra_sources: for src in extra_sources: src = os.path.abspath(src) diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index 926a76c9b..bf5a11d27 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -63,6 +63,19 @@ message(STATUS "AICpu kernel: ${NUM_AICPU_KERNEL_SOURCES} source files") message(VERBOSE "AICpu kernel sources: ${AICPU_KERNEL_SOURCES}") add_library(aicpu_kernel SHARED ${AICPU_KERNEL_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(aicpu_kernel) + +# Option to make the Orchestrator run independently (i.e. 
finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH) +endif() + # Compile options (common to both C and C++) target_compile_options(aicpu_kernel PRIVATE diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 37fb7a59f..c5488f344 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -89,6 +89,19 @@ message(STATUS "Host runtime: ${NUM_HOST_RUNTIME_SOURCES} source files") message(VERBOSE "Host runtime sources: ${HOST_RUNTIME_SOURCES}") add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(host_runtime) + +# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(host_runtime PRIVATE INDEP_ORCH) +endif() + # C++ standard (applied only to C++ files) set_target_properties(host_runtime PROPERTIES CXX_STANDARD 17 diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index e0dc7742e..7139356f1 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -28,6 +28,9 @@ #include #include #include + +#include + #include "acl/acl.h" // Include HAL constants from CANN (header only, library loaded dynamically) @@ -263,6 +266,16 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { if (prepare_runtime_for_launch(runtime, block_dim, launch_aicpu_num) != 0) return -1; + //
Initialize TraCR memory on the device +#ifdef ENABLE_TRACR + // LOG_INFO_V9("[TraCR] thread[%d] DevAllocTraCR device_id_=%d", sched_getcpu(), device_id_); + rc = DevAllocTraCR(this, runtime); + if (rc != 0) { + LOG_ERROR("DevAllocTraCR failed rc=%d", rc); + return rc; + } +#endif + auto runtime_args_cleanup = RAIIScopeGuard([this]() { kernel_args_.finalize_device_kernel_args(); kernel_args_.finalize_runtime_args(); @@ -405,6 +418,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { read_device_wall_ns(); + // Download and Free TraCR memory from Device and store in memory (~/ascend/) +#ifdef ENABLE_TRACR + rc = StoreTracrData(this, runtime); + if (rc != 0) { + LOG_ERROR("FreeTraCR failed: %d", rc); + return -1; + } +#endif + // Tear down collectors. stop() joins mgmt then collector in the only safe // order (mgmt's final-drain pass into L2 has poll as its consumer). teardown_shared_collectors_after_run(); diff --git a/src/a2a3/platform/sim/aicpu/CMakeLists.txt b/src/a2a3/platform/sim/aicpu/CMakeLists.txt index b3240c7e3..7829abc8d 100644 --- a/src/a2a3/platform/sim/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/sim/aicpu/CMakeLists.txt @@ -70,6 +70,22 @@ endif() # Create shared library (host-compatible for dlopen) add_library(aicpu_kernel SHARED ${AICPU_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(aicpu_kernel) + +# TODO: move this somewhere such that EVERY platform launches this once. Placing this here is hacky... +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr_postprocessing_script.cmake) + +# Optional: to make the Orchestrator run independently (i.e. 
finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH) +endif() + # Compile options (host g++/gcc) target_compile_options(aicpu_kernel PRIVATE diff --git a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt index c4fac0396..f567690d4 100644 --- a/src/a2a3/platform/sim/host/CMakeLists.txt +++ b/src/a2a3/platform/sim/host/CMakeLists.txt @@ -74,6 +74,19 @@ endif() # Create shared library add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES}) +# TraCR +include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) +tracr_enable(host_runtime) + +# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run) +option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) +if(DEFINED ENV{INDEP_ORCH}) + set(INDEP_ORCH $ENV{INDEP_ORCH}) +endif() +if(INDEP_ORCH) + target_compile_definitions(host_runtime PRIVATE INDEP_ORCH) +endif() + # C++ standard (applied only to C++ files) set_target_properties(host_runtime PROPERTIES CXX_STANDARD 17 diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 65dffc52f..7ad56c04a 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include "aicpu/platform_aicpu_affinity.h" #include "callable_protocol.h" #include "common/memory_barrier.h" @@ -214,6 +216,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { worker_count_ = num_aicore; runtime.aicpu_thread_num = launch_aicpu_num; + // Initialize TraCR memory on the device +#ifdef ENABLE_TRACR + rc = DevAllocTraCR(this, runtime); + if (rc != 0) { + LOG_ERROR("DevAllocTraCR failed rc=%d", rc); + return 
rc; + } +#endif + int num_aic = block_dim; uint32_t enable_profiling_flag = PROFILING_FLAG_NONE; if (enable_dump_tensor_) { @@ -490,6 +501,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return runtime_rc; } + // Download and Free TraCR memory from Device and store in memory (~/ascend/) +#ifdef ENABLE_TRACR + rc = StoreTracrData(this, runtime); + if (rc != 0) { + LOG_ERROR("FreeTraCR failed: %d", rc); + return -1; + } +#endif + // Tear down collectors. stop() joins mgmt then collector in the only safe // order (mgmt's final-drain pass into L2 has poll as its consumer). if (enable_l2_swimlane_) { diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 44473ee0d..733d517db 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -223,6 +223,11 @@ class Runtime { // Task storage Task tasks[RUNTIME_MAX_TASKS]; // Fixed-size task array + // TraCR data placeholder + // Those are the pointers with the allocated memory on the device + void *tracrData_; + void *tracrDataSizes_; + private: int next_task_id; // Next available task ID diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 7a7b5378a..f57a92f42 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -22,6 +22,9 @@ #include #endif +#include +#include + #include "aicpu/device_time.h" #include "aicpu/orch_so_file.h" #include "callable_protocol.h" @@ -796,6 +799,59 @@ void AicpuExecutor::deinit(Runtime *runtime) { // ===== Public Entry Point ===== + +/** + * init tracr profiler + * + * NOTE: make sure g_TraCR_thread_idx starts at 0 and follows in the positive direction + */ +inline void TRACR_START() { + g_TraCR_thread_idx = 
g_TraCR_thread_idx_counter.fetch_add(1, std::memory_order_relaxed); + + if (g_TraCR_thread_idx == 0) { + INSTRUMENTATION_START(); + } else { + INSTRUMENTATION_THREAD_INIT(); + } +} + +/** + * finalizing tracr function + * + * NOTE: make shure g_TraCR_thread_idx starts at 0 and follows in the positive direction + */ +inline void TRACR_FINALIZE(Runtime *runtime) { + (void)(runtime); + +#ifdef ENABLE_TRACR + LOG_INFO_V9("[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, runtime->tracrData_); + + if (tracrThread->_traceIdx > 0) { + TraCR::Payload* tracrData = reinterpret_cast(runtime->tracrData_); + const size_t payload_size = tracrThread->_traceIdx * sizeof(TraCR::Payload); + + std::memcpy( + &tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], + tracrThread->_traces.data(), + payload_size + ); + } + + size_t* tracrDataSizes = reinterpret_cast(runtime->tracrDataSizes_); + tracrDataSizes[g_TraCR_thread_idx] = tracrThread->_traceIdx; +#endif + + if (g_TraCR_thread_idx == 0) { + INSTRUMENTATION_END(); + g_TraCR_thread_idx_counter.store(0, std::memory_order_relaxed); + } else { + INSTRUMENTATION_THREAD_FINALIZE(); + } + + g_TraCR_thread_idx = -1; +} + + /** * aicpu_execute - Main AICPU kernel execution entry point * @@ -817,6 +873,17 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution"); + // Watch out, basicly hardcoded, we assume the all four threads to lie on each CPU on the same NUMA domain + // if (sched_getcpu() <= 3) { + // LOG_ERROR("DynTileFwkBackendKernelServer: Scheduling thread is in the wrong NUMA domain! 
aicpu_thread_num=%d sched_getcpu=%d", runtime->aicpu_thread_num, sched_getcpu()); + // return -1; + // } + + // INIT TraCR all threads coming in + TRACR_START(); + LOG_INFO_V9("[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, sched_getcpu(), INSTRUMENTATION_ACTIVE); + + g_aicpu_executor.init(runtime); while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) { @@ -839,6 +906,10 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { g_aicpu_executor.deinit(runtime); } + INSTRUMENTATION_MARK_RESET(g_TraCR_thread_idx); + // Finalize TraCR all threads coming in + TRACR_FINALIZE(runtime); + if (runtime_rc != 0) { LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); return runtime_rc; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index c6bbd0395..abe289262 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -207,6 +207,11 @@ class Runtime { // Controlled via PTO2_ORCH_TO_SCHED environment variable. bool orch_to_sched; + // TraCR data placeholder + // Those are the pointers with the allocated memory on the device + void *tracrData_; + void *tracrDataSizes_; + private: // Kernel binary tracking for cleanup int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; diff --git a/tools/tracr b/tools/tracr new file mode 160000 index 000000000..806fa6a84 --- /dev/null +++ b/tools/tracr @@ -0,0 +1 @@ +Subproject commit 806fa6a849ddd4a1eb544080bc7ec1423a834287 diff --git a/tools/tracr.cmake b/tools/tracr.cmake new file mode 100644 index 000000000..fdd6234af --- /dev/null +++ b/tools/tracr.cmake @@ -0,0 +1,111 @@ +# ----------------------------------------------------------------------------------------------------------- +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
# tracr_enable(<target>)
#
# Makes the TraCR headers visible to <target> and, when BUILD_TRACR is true,
# adds the compile definitions that switch tracing on.
#
# Always done:
#   - fails if <target> does not exist or tracr/include/tracr/tracr.hpp is
#     missing under TRACR_ROOT_DIR
#   - adds tracr/include, TRACR_ROOT_DIR and tracr/extern (nlohmann json) as
#     PRIVATE include directories
#
# Only when BUILD_TRACR is true:
#   - ENABLE_TRACR, TRACR_DISABLE_FLUSH, USE_HW_COUNTER
#   - TRACR_CAPACITY=<n>   if the TRACR_CAPACITY cache variable is non-empty
#   - TRACR_POLICY_PERIODIC or TRACR_POLICY_IGNORE_IF_FULL if TRACR_POLICY is set
function(tracr_enable target)
  message(STATUS "Enabling TraCR '${BUILD_TRACR}' for target: ${target}")

  if(NOT TARGET ${target})
    message(FATAL_ERROR "Target '${target}' does not exist.")
  endif()

  # Create the TraCR include directory path
  set(TRACR_INCLUDE_DIR ${TRACR_ROOT_DIR}/tracr/include)

  # Check if it even exists
  if(NOT EXISTS "${TRACR_INCLUDE_DIR}/tracr/tracr.hpp")
    message(FATAL_ERROR "tracr.hpp not found at ${TRACR_INCLUDE_DIR}/tracr/tracr.hpp")
  endif()

  # Append the wrapper headers (tools/) and the nlohmann json path as well
  set(TRACR_INCLUDE_DIR
    ${TRACR_INCLUDE_DIR}
    ${TRACR_ROOT_DIR}
    ${TRACR_ROOT_DIR}/tracr/extern
  )

  # --- include the directories ---
  target_include_directories(${target} PRIVATE ${TRACR_INCLUDE_DIR})

  # --- compiler flags of TraCR ---
  if(BUILD_TRACR)
    # Flag to enable/disable TraCR calls at compile time
    target_compile_definitions(${target} PRIVATE ENABLE_TRACR)

    # TraCR threads capacity (default is 1<<20 ~= 1 million traces per thread = ~17MB per thread buffer size)
    set(TRACR_CAPACITY "" CACHE STRING "Optional TraCR buffer capacity (empty = use internal default)")

    if(NOT TRACR_CAPACITY STREQUAL "")
      message(STATUS "TraCR adding capacity: ${TRACR_CAPACITY}")

      # Reject 0 and leading zeros: the value is pasted into C++ source, where
      # a leading 0 would silently turn it into an octal literal.
      if(NOT TRACR_CAPACITY MATCHES "^[1-9][0-9]*$")
        message(FATAL_ERROR "TRACR_CAPACITY must be a positive decimal integer without leading zeros")
      endif()

      target_compile_definitions(${target} PRIVATE TRACR_CAPACITY=${TRACR_CAPACITY})
    endif()

    # As the traces are collected on the Ascend device,
    # there is no need to store them on the device filesystem.
    target_compile_definitions(${target} PRIVATE TRACR_DISABLE_FLUSH USE_HW_COUNTER)

    # TraCR full size buffer modes:
    #   default (none):              Abort if buffer is full
    #   TRACR_POLICY_PERIODIC:       If buffer is full, overwrite from the beginning
    #   TRACR_POLICY_IGNORE_IF_FULL: If buffer is full, ignore incoming traces
    set(TRACR_POLICY "" CACHE STRING "TraCR policy (empty = use C++ default)")

    set_property(CACHE TRACR_POLICY PROPERTY STRINGS
      ""  # default: abort if full
      TRACR_POLICY_PERIODIC
      TRACR_POLICY_IGNORE_IF_FULL
    )

    if(NOT TRACR_POLICY STREQUAL "")
      if(TRACR_POLICY STREQUAL "TRACR_POLICY_PERIODIC")
        message(STATUS "TraCR adding policy: 'TRACR_POLICY_PERIODIC'")
        target_compile_definitions(${target} PRIVATE TRACR_POLICY_PERIODIC)
      elseif(TRACR_POLICY STREQUAL "TRACR_POLICY_IGNORE_IF_FULL")
        message(STATUS "TraCR adding policy: 'TRACR_POLICY_IGNORE_IF_FULL'")
        target_compile_definitions(${target} PRIVATE TRACR_POLICY_IGNORE_IF_FULL)
      else()
        message(FATAL_ERROR "Unknown TRACR_POLICY: ${TRACR_POLICY}")
      endif()
    else()
      message(STATUS "No TraCR policy given: using C++ default")
    endif()

    # Flag to enable TraCR debugging prints (TODO: Not yet working!)
    # if (TRACR_DEBUG)
    #   target_compile_definitions(${target} PRIVATE ENABLE_TRACR_DEBUG)
    # endif()
  endif()
endfunction()
# Builds the TraCR post-processing tool (tracr_process) and copies the Paraver
# configuration next to it.
#
# The script may be included from several CMakeLists.txt files: the target is
# created only once, and tracr.cmake is pulled in on demand so the include
# order of the two files does not matter.

# tracr_enable() lives in tracr.cmake, which sits next to this script.
if(NOT COMMAND tracr_enable)
  include(${CMAKE_CURRENT_LIST_DIR}/tracr.cmake)
endif()

set(BUILD_DIR "${CMAKE_CURRENT_LIST_DIR}/../build/output/bin/")

message(STATUS "TraCR: REAL_SOURCE_DIR: '${CMAKE_CURRENT_LIST_DIR}'")

# Paraver format configuration file
configure_file(
  ${CMAKE_CURRENT_LIST_DIR}/tracr/postprocessing/paraver/state.cfg
  ${BUILD_DIR}/state.cfg
  COPYONLY
)

# A second add_executable() for the same name is a hard CMake error, so guard it.
if(NOT TARGET tracr_process)
  add_executable(tracr_process ${CMAKE_CURRENT_LIST_DIR}/tracr/postprocessing/tracr_process.cpp)

  tracr_enable(tracr_process)

  # Set the output directory for the compiled executable
  set_target_properties(tracr_process PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY ${BUILD_DIR}
  )
endif()
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * TraCR API functions for Simpler A2A3, A2A3sim, A5, A5sim + * + * TODO: A5 not yet able to test + */ + +#pragma once + +#include // C++17 or newer +#include +#include +#include +#include + +#include +#include + +namespace fs = std::filesystem; +using json = nlohmann::json; + +// TraCR profiling/benchmarking stuff +size_t getSampleID() { + const auto env = std::getenv("PYPTO_RUN_SAMPLE_ID"); + return env ? std::stoul(env) : 0; +} +size_t sampleID = getSampleID(); + +std::string tracr_dir = "~/ascend/tracr/proc.1"; + +/** + * A function for defining the path of the TraCR traces in home + */ +fs::path expand_user_path(const std::string& path) +{ + if (!path.empty() && path[0] == '~') { + const char* home = std::getenv("HOME"); + if (!home) + throw std::runtime_error("HOME not set"); + + std::string sub = path.substr(1); // remove ~ + if (!sub.empty() && sub[0] == '/') + sub = sub.substr(1); // remove leading slash + + return fs::path(home) / sub; + } + return fs::path(path); +} + +/** + * + */ +inline int TracrData2BTS(const TraCR::Payload* tracrData, const size_t* tracrDataSizes, const size_t num_threads) { + fs::path base_dir = expand_user_path(tracr_dir); + + fs::create_directories(base_dir); + + for (uint32_t t = 0; t < num_threads; ++t) { + size_t num_traces = tracrDataSizes[t]; + + if (num_traces == 0) + continue; + + if (num_traces > TraCR::CAPACITY) { + LOG_ERROR("Thread %u exceeds CAPACITY", t); + return -1; + } + + fs::path thread_dir = + base_dir / ("thread." 
+ std::to_string(t + 1)); + + fs::create_directories(thread_dir); + + fs::path file_path = thread_dir / "traces.bts"; + + std::ofstream out(file_path, std::ios::binary); + if (!out) { + LOG_ERROR("Cannot open %s", file_path); + return -1; + } + + const TraCR::Payload* thread_ptr = + tracrData + t * TraCR::CAPACITY; + + out.write( + reinterpret_cast(thread_ptr), + num_traces * sizeof(TraCR::Payload) + ); + + if (!out) { + LOG_ERROR("Write failed for %s", file_path); + return -1; + } + } + return 0; +} + +/** + * A method for storing the TraCR metadata.json + */ +template +int StoreTracrMetaData(RuntimeT &runtime) { + fs::path base_dir = expand_user_path(tracr_dir); + + // Add the metadata.json + nlohmann::json metadata; + + // channel_names + nlohmann::json channel_names = nlohmann::json::array(); + for(int i = 0; i < runtime.aicpu_thread_num; ++i) { + channel_names.push_back("AICPU_" + std::to_string(i)); + } + for(int i = 0; i < int(RUNTIME_MAX_WORKER/3); ++i) { + channel_names.push_back("AICube_" + std::to_string(i)); + } + for(int i = 0; i < int(2*RUNTIME_MAX_WORKER/3); ++i) { + channel_names.push_back("AIVector_" + std::to_string(i)); + } + channel_names.push_back("INVALID"); + + metadata["channel_names"] = channel_names; + metadata["num_channels"] = channel_names.size(); + + // markerTypes + metadata["markerTypes"] = nlohmann::json::object(); + + for (int i = 0; i < MARKERTYPE_COUNT; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << (i + 1); + metadata["markerTypes"][oss.str()] = MarkerTypeNames[i]; + } + + metadata["pid"] = 1; + metadata["start_time"] = 0; + metadata["tid"] = 0; + + fs::path metadata_dir = base_dir / ("metadata.json"); + + std::ofstream file(metadata_dir); + if (!file.is_open()) { + LOG_ERROR("Failed to open file for writing.\n"); + return -1; + } + + // Dump JSON into file + file << metadata.dump(4); + + // Close the file + file.close(); + + return 0; +} + +/** + * A function for extracting the TraCR data from 
the Device to Host + */ +template +int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { + static_assert(std::is_trivially_copyable_v, + "TraCR::Payload must be trivially copyable for raw binary dump"); + + if (runtime.tracrData_ == nullptr) { + LOG_ERROR("runtime.tracrData_ is a nullptr"); + return -1; + } + + if (runtime.tracrDataSizes_ == nullptr) { + LOG_ERROR("runtime.tracrDataSizes_ is a nullptr"); + return -1; + } + + LOG_INFO_V9("[TraCR] runtime.aicpu_thread_num is zero or negative: %d", runtime.aicpu_thread_num); + if (runtime.aicpu_thread_num <= 0) { + LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", runtime.aicpu_thread_num); + return -1; + } + + // Download the tracrData_ from Device to Host + size_t size = sizeof(TraCR::Payload) * TraCR::CAPACITY * runtime.aicpu_thread_num; + TraCR::Payload* tracrData = reinterpret_cast(std::malloc(size)); + int rc = device_runner->copy_from_device(reinterpret_cast(tracrData), + reinterpret_cast(runtime.tracrData_), size); + if (rc != 0) { + LOG_ERROR("device_runner->copy_from_device 'tracrData' failed rc=%d", rc); + return rc; + } + + // Download the tracrDataSizes_ from Device to Host + size = sizeof(size_t) * runtime.aicpu_thread_num; + size_t* tracrDataSizes = reinterpret_cast(std::malloc(size)); + rc = device_runner->copy_from_device(reinterpret_cast(tracrDataSizes), + reinterpret_cast(runtime.tracrDataSizes_), size); + if (rc != 0) { + LOG_ERROR("device_runner->copy_from_device 'tracrDataSizes' failed rc=%d", rc); + return rc; + } + + // Now, store the traces into '~/ascend/tracr/' + tracr_dir = "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." 
+ std::to_string(200 + device_runner->device_id()); + rc = TracrData2BTS(tracrData, tracrDataSizes, runtime.aicpu_thread_num); + if (rc != 0) { + LOG_ERROR("TracrData2BTS() failed"); + return rc; + } + + // Free tmp Host TraCR data placeholders + std::free(reinterpret_cast(tracrData)); + std::free(reinterpret_cast(tracrDataSizes)); + + // Free device TraCR memory data placeholder + device_runner->free_tensor(runtime.tracrData_); + device_runner->free_tensor(runtime.tracrDataSizes_); + + rc = StoreTracrMetaData(runtime); + if (rc != 0) { + LOG_ERROR("StoreTracrMetaData failed: %d", rc); + return rc; + } + + return 0; +} + +/** + * A method for allocating memory on the device + * + * Polymorphic to A2A3 and A5 (should be) + */ +template +int DevAllocTraCR(DeviceRunnerT *device_runner, RuntimeT &runtime) { + const size_t size = sizeof(TraCR::Payload) * runtime.aicpu_thread_num * TraCR::CAPACITY; + // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.tracrData_); + runtime.tracrData_ = device_runner->allocate_tensor(size); + if (runtime.tracrData_ == nullptr) { + LOG_ERROR("runtime.tracrData_: alloc %zu bytes failed", size); + return -1; + } + // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.tracrData_); + runtime.tracrDataSizes_ = device_runner->allocate_tensor(runtime.aicpu_thread_num * sizeof(size_t)); + if (runtime.tracrDataSizes_ == nullptr) { + LOG_ERROR("runtime.tracrDataSizes_: alloc %zu bytes failed", size); + return -1; + } + return 0; +} \ No newline at end of file diff --git a/tools/tracr_simpler_markers.hpp b/tools/tracr_simpler_markers.hpp new file mode 100644 index 000000000..d30abfa6c --- /dev/null +++ b/tools/tracr_simpler_markers.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
// Global TraCR thread idx counter.
// Hands out consecutive indices (0, 1, 2, ...) to the threads that register.
inline std::atomic<int> g_TraCR_thread_idx_counter{0};

// Global thread local thread idx placeholder.
// -1 means "this thread is not registered".
inline thread_local int g_TraCR_thread_idx{-1};

// Single source of truth for the marker list: the enum and the name table
// below are both generated from it, so they cannot drift apart.
// Append new markers at the end to keep the existing numeric ids stable.
#define MARKER_TYPES            \
    X(Orchestrating)            \
    X(Read_Dimensions)          \
    X(Reshape_Kernels)          \
    X(Pre_Loop_Info)            \
    X(PTO2_SCOPE_)              \
    X(Scheduling)               \
    X(Phase1)                   \
    X(Phase2)                   \
    X(Phase3)                   \
    X(Phase3b)                  \
    X(Phase4)                   \
    X(Drain)                    \
    X(Initializing)             \
    X(De_Initializing)          \
    X(DLL_loading)              \
    X(Allocating)               \
    X(Running_Task_Single)      \
    X(Running_Task_Pair)        \
    X(Barrier)

// Marker ids, numbered from 0 in MARKER_TYPES order.
// Kept as an unscoped enum because the instrumentation call sites use the
// bare enumerator names.
enum MarkerType {
#define X(name) name,
    MARKER_TYPES
#undef X

    MARKERTYPE_COUNT  // number of markers, not a marker itself
};

// Marker names, indexed by MarkerType.
inline constexpr std::string_view MarkerTypeNames[] = {
#define X(name) #name,
    MARKER_TYPES
#undef X
};

static_assert(sizeof(MarkerTypeNames) / sizeof(MarkerTypeNames[0]) == MARKERTYPE_COUNT,
              "MarkerTypeNames must have exactly one entry per MarkerType");
--git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 4b11d437f..09f9245e3 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -25,6 +25,9 @@ #include #include +#include +#include + #include "pto_orchestration_api.h" #define FUNC_QK_MATMUL 0 @@ -90,6 +93,9 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip CYCLE_COUNT_START(); + + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Read_Dimensions, 0); + // Read dimensions from tensor metadata uint64_t batch = orch_args.tensor(0).shapes[0]; uint64_t num_heads = orch_args.tensor(0).shapes[1]; @@ -108,6 +114,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Reshape_Kernels, 0); + // Reshape tensors for kernel consumption (2D flattened) void *query_ptr = orch_args.tensor(0).data_as(); void *kc_ptr = orch_args.tensor(1).data_as(); @@ -137,6 +145,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip Tensor context_lens = make_tensor_external(orch_args.tensor(4).data_as(), cl_shapes, 1, DataType::INT32, false); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Pre_Loop_Info, uint32_t(batch)); + // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size uint32_t tile2d_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; uint32_t scalar_shapes[1] = {static_cast(q_tile)}; @@ -149,11 +159,14 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip PROF_INC(prof_make_count, 4); CYCLE_COUNT_LAP(prof_make_tensor); + LOG_INFO_V0("Thread %d: Orch 
PTO2_SCOPE loop: #batch=%u, q_loop=%u", g_TraCR_thread_idx, batch, q_loop); + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { uint32_t cl_idx[1] = {static_cast(b_idx)}; uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, PTO2_SCOPE_, uint32_t(b_idx + batch * q_idx)); PTO2_SCOPE() { CYCLE_COUNT_LAP(prof_scope); uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; @@ -251,7 +264,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip CYCLE_COUNT_LAP(prof_scope); } } - #ifdef ENABLE_PROFILING uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + prof_submit_task + prof_scope; diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h index 4d4bb9313..ff45b94ba 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -26,7 +26,12 @@ // Tensor dump uses these defaults to size its selective mask table so task-id // ring/slot lookup stays aligned with PTO2 task id layout. 
+#ifdef INDEP_ORCH +#define PTO2_TASK_WINDOW_SIZE 65536 // Default per-ring task window size (power of 2) +#else #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) +#endif + #define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers #endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f57a92f42..4f9846b0b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -175,6 +175,8 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return 0; } + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Initializing, uint32_t(sched_getcpu())); + LOG_INFO_V0("AicpuExecutor: Initializing"); if (runtime == nullptr) { @@ -244,6 +246,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; const bool reload_so = runtime->register_new_callable_id(); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, DLL_loading, 0); + if (reload_so) { LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); if (*p_handle != nullptr) { @@ -435,6 +439,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx); } + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Allocating, 0); + // sm_handle / rt are bound to *this* run's memory and must be // (re)created every run, regardless of whether the SO itself was // reused above. 
@@ -576,6 +582,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING orch_cycle_start = get_sys_cnt_aicpu(); #endif + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Orchestrating, thread_idx); framework_bind_runtime(rt); if (*p_bind != nullptr) { (*p_bind)(rt); @@ -713,6 +720,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (rt == nullptr) { LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); } else { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Scheduling, thread_idx); sched_ctx_.bind_runtime(rt); int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx); if (completed < 0) { @@ -727,6 +735,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // Always shutdown AICore — even if sched_ctx_.completed_ was already true. // platform_deinit_aicore_regs is idempotent; orchestrator threads have // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly. + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, De_Initializing, 0); int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx); if (shutdown_rc != 0 && run_rc == 0) { run_rc = shutdown_rc; @@ -873,12 +882,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution"); - // Watch out, basicly hardcoded, we assume the all four threads to lie on each CPU on the same NUMA domain - // if (sched_getcpu() <= 3) { - // LOG_ERROR("DynTileFwkBackendKernelServer: Scheduling thread is in the wrong NUMA domain! 
aicpu_thread_num=%d sched_getcpu=%d", runtime->aicpu_thread_num, sched_getcpu()); - // return -1; - // } - // INIT TraCR all threads coming in TRACR_START(); LOG_INFO_V9("[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, sched_getcpu(), INSTRUMENTATION_ACTIVE); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 52bee1878..d8db2faef 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -94,14 +94,23 @@ // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. // Actual window size is passed at runtime to runtime_create_from_sm(). // Use pto2_task_slot(sched, task_id) for slot calculation. +#ifdef INDEP_ORCH +#define PTO2_TASK_WINDOW_SIZE 65536 // Default per-ring task window size (power of 2) +#else #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) +#endif // Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) // Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) #define PTO2_MAX_RING_DEPTH 4 // Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) +#ifdef INDEP_ORCH +#define PTO2_HEAP_SIZE (256 * 1024 * 1024 * 2) // 512MB per ring (2GB total) +#else #define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) +#endif + #define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries #define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool #define PTO2_TENSORMAP_NUM_BUCKETS 4096 // Power of 2 for fast hash (4096×8B=32KB fits L1) @@ -119,7 +128,12 @@ #define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size // Wiring queue +#ifdef INDEP_ORCH +#define PTO2_WRIRING_QUEUE_SIZE 65536 // Per-shape queue size +#else #define PTO2_WRIRING_QUEUE_SIZE 1024 // Per-shape queue size +#endif + 
// Fanin storage #define PTO2_FANIN_INLINE_CAP 64 diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index af62fd9e7..aefeaf339 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -13,6 +13,9 @@ #include #include +#include +#include + #include "common/unified_log.h" #include "aicpu/device_time.h" #include "aicpu/l2_swimlane_collector_aicpu.h" @@ -930,6 +933,7 @@ int32_t SchedulerContext::init( void SchedulerContext::deinit() { // Reset all per-core execution state for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + i); core_exec_states_[i] = {}; core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp index 488ced5fc..83d6ee269 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp @@ -12,6 +12,9 @@ #include +#include +#include + #include "common/unified_log.h" #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" @@ -312,6 +315,7 @@ void SchedulerContext::check_running_cores_for_completion( #endif ); cur_thread_completed++; + INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + core_id); } if (t.running_done) { complete_slot_task( @@ -323,6 +327,7 @@ void SchedulerContext::check_running_cores_for_completion( #endif ); cur_thread_completed++; + INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + core_id); } // 2. 
Update slot data diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index a4e3cfa01..158375cff 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -14,6 +14,9 @@ #include #include +#include +#include + #include "common.h" // debug_assert #include "common/unified_log.h" @@ -155,10 +158,12 @@ SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core( build_payload(payload, slot_state, subslot, async_ctx, block_idx); if (to_pending) { + INSTRUMENTATION_MARK_SET(sched_thread_num_ + 1 + core_id, Running_Task_Pair, 0); core_exec_state.pending_subslot = subslot; core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); } else { + INSTRUMENTATION_MARK_SET(sched_thread_num_ + 1 + core_id, Running_Task_Single, 0); core_exec_state.running_subslot = subslot; core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); @@ -638,6 +643,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // compiler cannot hoist it across the dispatch loop on its own. 
const bool pmu_active = is_pmu_enabled(); +#ifdef INDEP_ORCH + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Barrier, orchestrator_done_); + LOG_INFO_V9("[TraCR] Thread %d: Waiting before the Orch to finish: %d, orchestrator_done_=%d", g_TraCR_thread_idx, g_TraCR_thread_idx_counter.load(), orchestrator_done_); + while (!orchestrator_done_){}; +#endif + #if PTO2_PROFILING l2_swimlane.sched_start_ts = get_sys_cnt_aicpu(); #endif @@ -737,6 +748,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #endif // Phase 1: Check running cores for completion + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase1, 0); int32_t completed_this_turn = 0; bool try_completed = tracker.has_any_running_cores(); @@ -827,6 +839,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // Phase 2 drain check if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase2, 0); handle_drain_mode(thread_idx); continue; } @@ -836,6 +849,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ int wired = sched_->drain_wiring_queue(orchestrator_done_); if (wired > 0) { made_progress = true; + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase3, 0); #if PTO2_SCHED_PROFILING l2_swimlane.phase_wiring_count += wired; #endif @@ -855,6 +869,10 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ constexpr int DUMMY_DRAIN_BATCH = 16; PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH]; int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH); + if (dummy_got > 0) { + (void)(dummy_got); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase3b, 0); + } for (int di = 0; di < dummy_got; di++) { PTO2TaskSlotState &dummy_slot = *dummy_batch[di]; #if PTO2_SCHED_PROFILING @@ -888,6 +906,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // Phase 4: MIX-strict-priority 
dispatch with phase-split and // cross-thread idle gating. See dispatch_ready_tasks for the policy. + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase4, 0); dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed); #if PTO2_PROFILING @@ -936,6 +955,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ last_progress_ts = get_sys_cnt_aicpu(); } else { while (deferred_release_count > 0) { + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Drain, 0); #if PTO2_SCHED_PROFILING (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); #else From 2ca19549fd1d10951b49964da4e7f487578d6cb3 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Fri, 19 Jun 2026 11:31:47 +0200 Subject: [PATCH 03/14] remove this redundant print --- tools/tracr_simpler_api.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp index 8a62540b3..f8fd2bdad 100644 --- a/tools/tracr_simpler_api.hpp +++ b/tools/tracr_simpler_api.hpp @@ -179,7 +179,6 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { return -1; } - LOG_INFO_V9("[TraCR] runtime.aicpu_thread_num is zero or negative: %d", runtime.aicpu_thread_num); if (runtime.aicpu_thread_num <= 0) { LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", runtime.aicpu_thread_num); return -1; From 22d49f7337881dc899271841cece129f480a9b69 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Fri, 26 Jun 2026 11:08:17 +0200 Subject: [PATCH 04/14] 200 -> 1000 --- tools/tracr_simpler_api.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp index f8fd2bdad..326d1ac5e 100644 --- a/tools/tracr_simpler_api.hpp +++ b/tools/tracr_simpler_api.hpp @@ -205,7 +205,7 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { } // Now, store the traces into '~/ascend/tracr/' - tracr_dir = 
"~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." + std::to_string(200 + device_runner->device_id()); + tracr_dir = "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." + std::to_string(1000 + device_runner->device_id()); rc = TracrData2BTS(tracrData, tracrDataSizes, runtime.aicpu_thread_num); if (rc != 0) { LOG_ERROR("TracrData2BTS() failed"); From 9d8699f2ac2c02434359d0e3b162a6121a3dde27 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Fri, 26 Jun 2026 13:17:20 +0200 Subject: [PATCH 05/14] adding corrections gemini wanted --- .../orchestration/paged_attention_orch.cpp | 2 +- .../aicpu/aicpu_executor.cpp | 30 +++++++++++-------- .../runtime/scheduler/scheduler_dispatch.cpp | 4 ++- tools/tracr.cmake | 4 +-- tools/tracr_simpler_api.hpp | 24 +++++++-------- 5 files changed, 36 insertions(+), 28 deletions(-) diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index dec6be021..f92202134 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -158,7 +158,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta PROF_INC(prof_make_count, 4); CYCLE_COUNT_LAP(prof_make_tensor); - LOG_INFO_V0("Thread %d: Orch PTO2_SCOPE loop: #batch=%u, q_loop=%u", g_TraCR_thread_idx, batch, q_loop); + LOG_INFO_V0("Thread %d: Orch PTO2_SCOPE loop: #batch=%" PRIu64 ", q_loop=%" PRIu64, g_TraCR_thread_idx, batch, q_loop); for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { uint32_t cl_idx[1] = {static_cast(b_idx)}; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 6b43902ab..e4ce0946e 100644 --- 
a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -817,19 +817,25 @@ inline void TRACR_FINALIZE(Runtime *runtime) { #ifdef ENABLE_TRACR LOG_INFO_V9("[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, runtime->tracrData_); - if (tracrThread->_traceIdx > 0) { - TraCR::Payload* tracrData = reinterpret_cast(runtime->tracrData_); - const size_t payload_size = tracrThread->_traceIdx * sizeof(TraCR::Payload); - - std::memcpy( - &tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], - tracrThread->_traces.data(), - payload_size - ); - } + if (g_TraCR_thread_idx >= 0 && g_TraCR_thread_idx < runtime->aicpu_thread_num) { + if (runtime->tracrData_ != nullptr && tracrThread->_traceIdx > 0) { + TraCR::Payload* tracrData = reinterpret_cast(runtime->tracrData_); + const size_t payload_size = tracrThread->_traceIdx * sizeof(TraCR::Payload); + + std::memcpy( + &tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], + tracrThread->_traces.data(), + payload_size + ); + } - size_t* tracrDataSizes = reinterpret_cast(runtime->tracrDataSizes_); - tracrDataSizes[g_TraCR_thread_idx] = tracrThread->_traceIdx; + if (runtime->tracrDataSizes_ != nullptr) { + size_t* tracrDataSizes = reinterpret_cast(runtime->tracrDataSizes_); + tracrDataSizes[g_TraCR_thread_idx] = tracrThread->_traceIdx; + } + } else { + LOG_ERROR("[TraCR] thread index %d out of bounds (max=%d)", g_TraCR_thread_idx, runtime->aicpu_thread_num); + } #endif if (g_TraCR_thread_idx == 0) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index b6e00da96..68a1499c1 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -819,7 +819,9 @@ int32_t 
SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #ifdef INDEP_ORCH INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Barrier, orchestrator_done_); LOG_INFO_V9("[TraCR] Thread %d: Waiting before the Orch to finish: %d, orchestrator_done_=%d", g_TraCR_thread_idx, g_TraCR_thread_idx_counter.load(), orchestrator_done_); - while (!orchestrator_done_){}; + while (!orchestrator_done_) { + SPIN_WAIT_HINT(); + } #endif #if PTO2_PROFILING diff --git a/tools/tracr.cmake b/tools/tracr.cmake index fdd6234af..df9f31411 100644 --- a/tools/tracr.cmake +++ b/tools/tracr.cmake @@ -58,7 +58,7 @@ function(tracr_enable target) # TraCR threads capacity (default is 1<<20 ~= 1 million traces per thread = ~17MB per thread buffer size) set(TRACR_CAPACITY "" CACHE STRING "Optional TraCR buffer capacity (empty = use internal default)") - if(NOT TRACR_CAPACITY STREQUAL "") + if(NOT "${TRACR_CAPACITY}" STREQUAL "") message(STATUS "TraCR adding capacity: ${TRACR_CAPACITY}") if(NOT TRACR_CAPACITY MATCHES "^[0-9]+$") @@ -89,7 +89,7 @@ function(tracr_enable target) TRACR_POLICY_IGNORE_IF_FULL ) - if(NOT TRACR_POLICY STREQUAL "") + if(NOT "${TRACR_POLICY}" STREQUAL "") if(TRACR_POLICY STREQUAL "TRACR_POLICY_PERIODIC") message(STATUS "TraCR adding policy: 'TRACR_POLICY_PERIODIC'") target_compile_definitions(${target} PRIVATE TRACR_POLICY_PERIODIC) diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp index 326d1ac5e..97e0f417e 100644 --- a/tools/tracr_simpler_api.hpp +++ b/tools/tracr_simpler_api.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -120,10 +121,10 @@ int StoreTracrMetaData(RuntimeT &runtime) { for(int i = 0; i < runtime.aicpu_thread_num; ++i) { channel_names.push_back("AICPU_" + std::to_string(i)); } - for(int i = 0; i < int(RUNTIME_MAX_WORKER/3); ++i) { + for(int i = 0; i < int(runtime.worker_count/3); ++i) { channel_names.push_back("AICube_" + std::to_string(i)); } - for(int i = 0; i < int(2*RUNTIME_MAX_WORKER/3); ++i) 
{ + for(int i = 0; i < int(2*runtime.worker_count/3); ++i) { channel_names.push_back("AIVector_" + std::to_string(i)); } channel_names.push_back("INVALID"); @@ -186,8 +187,8 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { // Download the tracrData_ from Device to Host size_t size = sizeof(TraCR::Payload) * TraCR::CAPACITY * runtime.aicpu_thread_num; - TraCR::Payload* tracrData = reinterpret_cast(std::malloc(size)); - int rc = device_runner->copy_from_device(reinterpret_cast(tracrData), + std::vector tracrData(TraCR::CAPACITY * runtime.aicpu_thread_num); + int rc = device_runner->copy_from_device(reinterpret_cast(tracrData.data()), reinterpret_cast(runtime.tracrData_), size); if (rc != 0) { LOG_ERROR("device_runner->copy_from_device 'tracrData' failed rc=%d", rc); @@ -196,8 +197,8 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { // Download the tracrDataSizes_ from Device to Host size = sizeof(size_t) * runtime.aicpu_thread_num; - size_t* tracrDataSizes = reinterpret_cast(std::malloc(size)); - rc = device_runner->copy_from_device(reinterpret_cast(tracrDataSizes), + std::vector tracrDataSizes(runtime.aicpu_thread_num); + rc = device_runner->copy_from_device(reinterpret_cast(tracrDataSizes.data()), reinterpret_cast(runtime.tracrDataSizes_), size); if (rc != 0) { LOG_ERROR("device_runner->copy_from_device 'tracrDataSizes' failed rc=%d", rc); @@ -206,16 +207,12 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { // Now, store the traces into '~/ascend/tracr/' tracr_dir = "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." 
+ std::to_string(1000 + device_runner->device_id()); - rc = TracrData2BTS(tracrData, tracrDataSizes, runtime.aicpu_thread_num); + rc = TracrData2BTS(tracrData.data(), tracrDataSizes.data(), runtime.aicpu_thread_num); if (rc != 0) { LOG_ERROR("TracrData2BTS() failed"); return rc; } - // Free tmp Host TraCR data placeholders - std::free(reinterpret_cast(tracrData)); - std::free(reinterpret_cast(tracrDataSizes)); - // Free device TraCR memory data placeholder device_runner->free_tensor(runtime.tracrData_); device_runner->free_tensor(runtime.tracrDataSizes_); @@ -246,7 +243,10 @@ int DevAllocTraCR(DeviceRunnerT *device_runner, RuntimeT &runtime) { // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.tracrData_); runtime.tracrDataSizes_ = device_runner->allocate_tensor(runtime.aicpu_thread_num * sizeof(size_t)); if (runtime.tracrDataSizes_ == nullptr) { - LOG_ERROR("runtime.tracrDataSizes_: alloc %zu bytes failed", size); + const size_t sizes_bytes = runtime.aicpu_thread_num * sizeof(size_t); + LOG_ERROR("runtime.tracrDataSizes_: alloc %zu bytes failed", sizes_bytes); + device_runner->free_tensor(runtime.tracrData_); + runtime.tracrData_ = nullptr; return -1; } return 0; From 53a28aee3b0a0e5206320415ef3ef7a1f99b9b31 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Fri, 26 Jun 2026 15:09:09 +0200 Subject: [PATCH 06/14] updating the CI --- .github/workflows/ci.yml | 20 ++++++++++++++++++++ .github/workflows/sanitizers.yml | 2 ++ 2 files changed, 22 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f3e39a3e3..ce95cb450 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,7 @@ jobs: - name: Checkout uses: actions/checkout@v5 with: + submodules: recursive fetch-depth: 0 - name: Install build and lint tools @@ -71,6 +72,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Set up C++ compiler (Linux) if: runner.os == 'Linux' @@ -113,6 +116,8 
@@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 @@ -173,6 +178,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -260,6 +267,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -350,6 +359,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Set up C++ compiler run: | @@ -446,6 +457,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -534,6 +547,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -610,6 +625,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v5 with: + submodules: recursive fetch-depth: 0 - name: Check file changes id: check @@ -652,6 +668,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa @@ -723,6 +741,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: ./.github/actions/read-pto-isa diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml index 2444fab32..7661e19f0 100644 --- a/.github/workflows/sanitizers.yml +++ b/.github/workflows/sanitizers.yml @@ -38,6 +38,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v5 + with: + submodules: recursive - name: Load pinned pto-isa commit uses: 
./.github/actions/read-pto-isa From ea0b944a71316b86d21acd14d6d23d68345729b0 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Fri, 26 Jun 2026 15:53:42 +0200 Subject: [PATCH 07/14] adding more ci fixes --- src/a2a3/platform/sim/aicpu/CMakeLists.txt | 7 ++++++- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 4 ++-- tools/tracr_simpler_markers.hpp | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/a2a3/platform/sim/aicpu/CMakeLists.txt b/src/a2a3/platform/sim/aicpu/CMakeLists.txt index 71c5d5ec9..11fa4349d 100644 --- a/src/a2a3/platform/sim/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/sim/aicpu/CMakeLists.txt @@ -75,7 +75,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake) tracr_enable(aicpu_kernel) # TODO: move this somewhere such that EVERY platform launches this once. Placing this here is hacky... -include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr_postprocessing_script.cmake) +# Only build the host-side trace post-processor when TraCR is enabled: it is an +# offline analysis tool, and it pulls in Linux-only APIs (sched_getcpu) that do +# not compile on the macOS packaging build. +if(BUILD_TRACR) + include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr_postprocessing_script.cmake) +endif() # Optional: to make the Orchestrator run independently (i.e. 
finalize before letting the schedulers run) option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index e4ce0946e..95b35374b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -177,7 +177,7 @@ int32_t AicpuExecutor::init(Runtime *runtime) { return 0; } - INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Initializing, uint32_t(sched_getcpu())); + INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Initializing, uint32_t(tracr_getcpu())); LOG_INFO_V0("AicpuExecutor: Initializing"); @@ -887,7 +887,7 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { // INIT TraCR all threads coming in TRACR_START(); - LOG_INFO_V9("[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, sched_getcpu(), INSTRUMENTATION_ACTIVE); + LOG_INFO_V9("[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, tracr_getcpu(), INSTRUMENTATION_ACTIVE); g_aicpu_executor.init(runtime); diff --git a/tools/tracr_simpler_markers.hpp b/tools/tracr_simpler_markers.hpp index d30abfa6c..36d0b5290 100644 --- a/tools/tracr_simpler_markers.hpp +++ b/tools/tracr_simpler_markers.hpp @@ -17,6 +17,16 @@ #include #include +// sched_getcpu() is a glibc/Linux-only API, but the simulator/host build also +// compiles on non-Linux targets (e.g. the macOS packaging CI). Route the TraCR +// call sites through this portable shim instead of calling sched_getcpu directly. 
+#if defined(__linux__) +#include +inline int tracr_getcpu() { return sched_getcpu(); } +#else +inline int tracr_getcpu() { return -1; } +#endif + // Global TraCR thread idx counter inline std::atomic g_TraCR_thread_idx_counter{0}; From 079f147caed69f8bfce1b55bb4dd49e5910b9f54 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Fri, 26 Jun 2026 16:53:05 +0200 Subject: [PATCH 08/14] minor fix for the ci --- tools/tracr.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/tracr.cmake b/tools/tracr.cmake index df9f31411..12da91dda 100644 --- a/tools/tracr.cmake +++ b/tools/tracr.cmake @@ -46,7 +46,11 @@ function(tracr_enable target) ) # --- include the directories --- - target_include_directories(${target} PRIVATE + # SYSTEM: TraCR and its vendored third-party headers (e.g. extern/nlohmann + # json) are external to simpler and don't compile cleanly under the build's + # -Wall -Wextra -Werror (modern clang flags nlohmann's deprecated literal + # operators). Marking them system suppresses warnings from those headers. + target_include_directories(${target} SYSTEM PRIVATE ${TRACR_INCLUDE_DIR} ) From 83eef5a517e21ee3f82daf3e90289a1cbd48066c Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Tue, 30 Jun 2026 16:55:29 +0200 Subject: [PATCH 09/14] Fix(tracr): adapt to refactored Runtime dev-split API (#1216) main #1216 split the trb Runtime into a device-copied DeviceRuntimeLaunchDesc (`dev`) + host-only tail, so aicpu_thread_num / worker_count / the tracr pointers are no longer direct `Runtime` members on trb (they live in `dev`, reached via accessors); host_build_graph keeps them flat. The templated shared TraCR code (tracr_simpler_api.hpp) and the trb TRACR_FINALIZE used the old direct-field API and failed to compile under BUILD_TRACR=ON. - Add get/set_tracr_data + get/set_tracr_data_sizes accessors to both Runtime variants (mirroring get_aicpu_thread_num), identical signatures so the shared header compiles against either. 
- Switch tracr_simpler_api.hpp (DevAllocTraCR / StoreTracrData / StoreTracrMetaData) and aicpu_executor.cpp TRACR_FINALIZE to the accessors (get_aicpu_thread_num / get_worker_count / get/set_tracr_data*). Verified: BUILD_TRACR=ON build is clean (simpler-cann9 docker). Co-Authored-By: Claude Opus 4.8 --- .../host_build_graph/runtime/runtime.h | 4 ++ .../aicpu/aicpu_executor.cpp | 14 ++--- .../runtime/runtime.h | 4 ++ tools/tracr_simpler_api.hpp | 54 +++++++++---------- 4 files changed, 42 insertions(+), 34 deletions(-) diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 88ebdd940..dea4be4ed 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -281,6 +281,10 @@ class Runtime { void set_worker_count(int n) { worker_count = n; } int get_aicpu_thread_num() const { return aicpu_thread_num; } void set_aicpu_thread_num(int n) { aicpu_thread_num = n; } + void *get_tracr_data() const { return tracrData_; } + void set_tracr_data(void *p) { tracrData_ = p; } + void *get_tracr_data_sizes() const { return tracrDataSizes_; } + void set_tracr_data_sizes(void *p) { tracrDataSizes_ = p; } Handshake *get_workers() { return workers; } int32_t get_aicpu_allowed_cpu_count() const { return aicpu_allowed_cpu_count; } void set_aicpu_allowed_cpu_count(int32_t n) { aicpu_allowed_cpu_count = n; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 9d898bb26..2e7db4d96 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -864,11 +864,11 @@ inline void TRACR_FINALIZE(Runtime *runtime) { (void)(runtime); #ifdef ENABLE_TRACR - LOG_INFO_V9("[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, 
runtime->tracrData_); + LOG_INFO_V9("[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, runtime->get_tracr_data()); - if (g_TraCR_thread_idx >= 0 && g_TraCR_thread_idx < runtime->aicpu_thread_num) { - if (runtime->tracrData_ != nullptr && tracrThread->_traceIdx > 0) { - TraCR::Payload* tracrData = reinterpret_cast(runtime->tracrData_); + if (g_TraCR_thread_idx >= 0 && g_TraCR_thread_idx < runtime->get_aicpu_thread_num()) { + if (runtime->get_tracr_data() != nullptr && tracrThread->_traceIdx > 0) { + TraCR::Payload* tracrData = reinterpret_cast(runtime->get_tracr_data()); const size_t payload_size = tracrThread->_traceIdx * sizeof(TraCR::Payload); std::memcpy( @@ -878,12 +878,12 @@ inline void TRACR_FINALIZE(Runtime *runtime) { ); } - if (runtime->tracrDataSizes_ != nullptr) { - size_t* tracrDataSizes = reinterpret_cast(runtime->tracrDataSizes_); + if (runtime->get_tracr_data_sizes() != nullptr) { + size_t* tracrDataSizes = reinterpret_cast(runtime->get_tracr_data_sizes()); tracrDataSizes[g_TraCR_thread_idx] = tracrThread->_traceIdx; } } else { - LOG_ERROR("[TraCR] thread index %d out of bounds (max=%d)", g_TraCR_thread_idx, runtime->aicpu_thread_num); + LOG_ERROR("[TraCR] thread index %d out of bounds (max=%d)", g_TraCR_thread_idx, runtime->get_aicpu_thread_num()); } #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index fb23fed92..4f8700c80 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -305,6 +305,10 @@ class Runtime { void set_worker_count(int n) { dev.worker_count = n; } int get_aicpu_thread_num() const { return dev.aicpu_thread_num; } void set_aicpu_thread_num(int n) { dev.aicpu_thread_num = n; } + void *get_tracr_data() const { return dev.tracrData_; } + void set_tracr_data(void *p) { dev.tracrData_ = p; } + void 
*get_tracr_data_sizes() const { return dev.tracrDataSizes_; } + void set_tracr_data_sizes(void *p) { dev.tracrDataSizes_ = p; } Handshake *get_workers() { return dev.workers; } int32_t get_aicpu_allowed_cpu_count() const { return dev.aicpu_allowed_cpu_count; } void set_aicpu_allowed_cpu_count(int32_t n) { dev.aicpu_allowed_cpu_count = n; } diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp index 97e0f417e..8d325439b 100644 --- a/tools/tracr_simpler_api.hpp +++ b/tools/tracr_simpler_api.hpp @@ -118,13 +118,13 @@ int StoreTracrMetaData(RuntimeT &runtime) { // channel_names nlohmann::json channel_names = nlohmann::json::array(); - for(int i = 0; i < runtime.aicpu_thread_num; ++i) { + for(int i = 0; i < runtime.get_aicpu_thread_num(); ++i) { channel_names.push_back("AICPU_" + std::to_string(i)); } - for(int i = 0; i < int(runtime.worker_count/3); ++i) { + for(int i = 0; i < int(runtime.get_worker_count()/3); ++i) { channel_names.push_back("AICube_" + std::to_string(i)); } - for(int i = 0; i < int(2*runtime.worker_count/3); ++i) { + for(int i = 0; i < int(2*runtime.get_worker_count()/3); ++i) { channel_names.push_back("AIVector_" + std::to_string(i)); } channel_names.push_back("INVALID"); @@ -170,36 +170,36 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { static_assert(std::is_trivially_copyable_v, "TraCR::Payload must be trivially copyable for raw binary dump"); - if (runtime.tracrData_ == nullptr) { + if (runtime.get_tracr_data() == nullptr) { LOG_ERROR("runtime.tracrData_ is a nullptr"); return -1; } - if (runtime.tracrDataSizes_ == nullptr) { + if (runtime.get_tracr_data_sizes() == nullptr) { LOG_ERROR("runtime.tracrDataSizes_ is a nullptr"); return -1; } - if (runtime.aicpu_thread_num <= 0) { - LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", runtime.aicpu_thread_num); + if (runtime.get_aicpu_thread_num() <= 0) { + LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", runtime.get_aicpu_thread_num()); 
return -1; } - + // Download the tracrData_ from Device to Host - size_t size = sizeof(TraCR::Payload) * TraCR::CAPACITY * runtime.aicpu_thread_num; - std::vector tracrData(TraCR::CAPACITY * runtime.aicpu_thread_num); + size_t size = sizeof(TraCR::Payload) * TraCR::CAPACITY * runtime.get_aicpu_thread_num(); + std::vector tracrData(TraCR::CAPACITY * runtime.get_aicpu_thread_num()); int rc = device_runner->copy_from_device(reinterpret_cast(tracrData.data()), - reinterpret_cast(runtime.tracrData_), size); + reinterpret_cast(runtime.get_tracr_data()), size); if (rc != 0) { LOG_ERROR("device_runner->copy_from_device 'tracrData' failed rc=%d", rc); return rc; } // Download the tracrDataSizes_ from Device to Host - size = sizeof(size_t) * runtime.aicpu_thread_num; - std::vector tracrDataSizes(runtime.aicpu_thread_num); + size = sizeof(size_t) * runtime.get_aicpu_thread_num(); + std::vector tracrDataSizes(runtime.get_aicpu_thread_num()); rc = device_runner->copy_from_device(reinterpret_cast(tracrDataSizes.data()), - reinterpret_cast(runtime.tracrDataSizes_), size); + reinterpret_cast(runtime.get_tracr_data_sizes()), size); if (rc != 0) { LOG_ERROR("device_runner->copy_from_device 'tracrDataSizes' failed rc=%d", rc); return rc; @@ -207,15 +207,15 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { // Now, store the traces into '~/ascend/tracr/' tracr_dir = "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." 
+ std::to_string(1000 + device_runner->device_id()); - rc = TracrData2BTS(tracrData.data(), tracrDataSizes.data(), runtime.aicpu_thread_num); + rc = TracrData2BTS(tracrData.data(), tracrDataSizes.data(), runtime.get_aicpu_thread_num()); if (rc != 0) { LOG_ERROR("TracrData2BTS() failed"); return rc; } // Free device TraCR memory data placeholder - device_runner->free_tensor(runtime.tracrData_); - device_runner->free_tensor(runtime.tracrDataSizes_); + device_runner->free_tensor(runtime.get_tracr_data()); + device_runner->free_tensor(runtime.get_tracr_data_sizes()); rc = StoreTracrMetaData(runtime); if (rc != 0) { @@ -233,20 +233,20 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { */ template int DevAllocTraCR(DeviceRunnerT *device_runner, RuntimeT &runtime) { - const size_t size = sizeof(TraCR::Payload) * runtime.aicpu_thread_num * TraCR::CAPACITY; - // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.tracrData_); - runtime.tracrData_ = device_runner->allocate_tensor(size); - if (runtime.tracrData_ == nullptr) { + const size_t size = sizeof(TraCR::Payload) * runtime.get_aicpu_thread_num() * TraCR::CAPACITY; + // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.get_tracr_data()); + runtime.set_tracr_data(device_runner->allocate_tensor(size)); + if (runtime.get_tracr_data() == nullptr) { LOG_ERROR("runtime.tracrData_: alloc %zu bytes failed", size); return -1; } - // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.tracrData_); - runtime.tracrDataSizes_ = device_runner->allocate_tensor(runtime.aicpu_thread_num * sizeof(size_t)); - if (runtime.tracrDataSizes_ == nullptr) { - const size_t sizes_bytes = runtime.aicpu_thread_num * sizeof(size_t); + // LOG_INFO_V9("Device alloc start of size=%u, %p", size, runtime.get_tracr_data()); + runtime.set_tracr_data_sizes(device_runner->allocate_tensor(runtime.get_aicpu_thread_num() * sizeof(size_t))); + if (runtime.get_tracr_data_sizes() == nullptr) { + const size_t 
sizes_bytes = runtime.get_aicpu_thread_num() * sizeof(size_t); LOG_ERROR("runtime.tracrDataSizes_: alloc %zu bytes failed", sizes_bytes); - device_runner->free_tensor(runtime.tracrData_); - runtime.tracrData_ = nullptr; + device_runner->free_tensor(runtime.get_tracr_data()); + runtime.set_tracr_data(nullptr); return -1; } return 0; From 279ef8c7f7d854dfd59e8bea8ae25dd6d75c91b8 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Wed, 1 Jul 2026 10:34:30 +0200 Subject: [PATCH 10/14] style: clang-format TraCR changes (clang-format v21.1.0) Whitespace/brace-style only, applied via the repo's pinned pre-commit clang-format hook (v21.1.0) scoped to the TraCR diff vs main. No semantic changes. Co-Authored-By: Claude Opus 4.8 --- .../orchestration/paged_attention_orch.cpp | 5 +- .../runtime/pto_runtime2_types.h | 2 +- .../host_build_graph/runtime/runtime.h | 2 +- .../aicpu/aicpu_executor.cpp | 26 ++++---- .../runtime/pto_runtime2_types.h | 7 +- .../runtime/runtime.h | 2 +- .../runtime/scheduler/scheduler_dispatch.cpp | 7 +- tools/tracr_simpler_api.hpp | 65 +++++++++---------- tools/tracr_simpler_markers.hpp | 41 ++++++------ 9 files changed, 77 insertions(+), 80 deletions(-) diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index f92202134..7220228d3 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -92,7 +92,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta CYCLE_COUNT_START(); - INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Read_Dimensions, 0); // Read dimensions from tensor metadata @@ -158,7 +157,9 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta 
PROF_INC(prof_make_count, 4); CYCLE_COUNT_LAP(prof_make_tensor); - LOG_INFO_V0("Thread %d: Orch PTO2_SCOPE loop: #batch=%" PRIu64 ", q_loop=%" PRIu64, g_TraCR_thread_idx, batch, q_loop); + LOG_INFO_V0( + "Thread %d: Orch PTO2_SCOPE loop: #batch=%" PRIu64 ", q_loop=%" PRIu64, g_TraCR_thread_idx, batch, q_loop + ); for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { uint32_t cl_idx[1] = {static_cast(b_idx)}; diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h index e4d3b1126..39ddabdc3 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -26,6 +26,6 @@ #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) #endif -#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers +#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers #endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 8e28b44f4..d22d20713 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -190,7 +190,7 @@ class Runtime { // Those are the pointers with the allocated memory on the device void *tracrData_; void *tracrDataSizes_; - + // Filter-style affinity gate input (a2a3 onboard). Placed AFTER `tasks` // because AICore reads runtime->tasks[] by offset. 
Host fills these before // launch from AICPU OCCUPY; the device gate keeps threads whose diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 0e27f092c..f43f79894 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -835,7 +835,6 @@ void AicpuExecutor::deinit(Runtime *runtime) { // ===== Public Entry Point ===== - /** * init tracr profiler * @@ -860,26 +859,27 @@ inline void TRACR_FINALIZE(Runtime *runtime) { (void)(runtime); #ifdef ENABLE_TRACR - LOG_INFO_V9("[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, runtime->get_tracr_data()); + LOG_INFO_V9( + "[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx, + runtime->get_tracr_data() + ); if (g_TraCR_thread_idx >= 0 && g_TraCR_thread_idx < runtime->get_aicpu_thread_num()) { if (runtime->get_tracr_data() != nullptr && tracrThread->_traceIdx > 0) { - TraCR::Payload* tracrData = reinterpret_cast(runtime->get_tracr_data()); + TraCR::Payload *tracrData = reinterpret_cast(runtime->get_tracr_data()); const size_t payload_size = tracrThread->_traceIdx * sizeof(TraCR::Payload); - std::memcpy( - &tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], - tracrThread->_traces.data(), - payload_size - ); + std::memcpy(&tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], tracrThread->_traces.data(), payload_size); } if (runtime->get_tracr_data_sizes() != nullptr) { - size_t* tracrDataSizes = reinterpret_cast(runtime->get_tracr_data_sizes()); + size_t *tracrDataSizes = reinterpret_cast(runtime->get_tracr_data_sizes()); tracrDataSizes[g_TraCR_thread_idx] = tracrThread->_traceIdx; } } else { - LOG_ERROR("[TraCR] thread index %d out of bounds (max=%d)", g_TraCR_thread_idx, runtime->get_aicpu_thread_num()); + LOG_ERROR( + "[TraCR] thread index %d out of bounds 
(max=%d)", g_TraCR_thread_idx, runtime->get_aicpu_thread_num() + ); } #endif @@ -893,7 +893,6 @@ inline void TRACR_FINALIZE(Runtime *runtime) { g_TraCR_thread_idx = -1; } - // Device orchestration SO registration entry. Exported directly by the runtime // (not via a platform forwarding shell): registration is a TMARB-only ability, // so the symbol lives where the capability does. host_build_graph does not @@ -942,8 +941,9 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { // INIT TraCR all threads coming in TRACR_START(); - LOG_INFO_V9("[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, tracr_getcpu(), INSTRUMENTATION_ACTIVE); - + LOG_INFO_V9( + "[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, tracr_getcpu(), INSTRUMENTATION_ACTIVE + ); // Each phase is bracketed by its own scope so the start/end boundaries are // visible and an early `return` still records the end via the guard dtor. diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 681b9632e..584f5dc87 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -80,9 +80,9 @@ #define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) #endif -#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries -#define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool -#define PTO2_TENSORMAP_NUM_BUCKETS 4096 // Power of 2 for fast hash (4096×8B=32KB fits L1) +#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries +#define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool +#define PTO2_TENSORMAP_NUM_BUCKETS 4096 // Power of 2 for fast hash (4096×8B=32KB fits L1) // Scope management #define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth @@ -106,7 +106,6 @@ #define PTO2_WRIRING_QUEUE_SIZE 1024 // 
Per-shape queue size #endif - // Fanin storage #define PTO2_FANIN_INLINE_CAP 64 diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 43db09fc5..656a98740 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -179,7 +179,7 @@ struct alignas(64) DeviceRuntimeLaunchDesc { // Those are the pointers with the allocated memory on the device void *tracrData_; void *tracrDataSizes_; - + // Serial orchestrator -> scheduler start control. // When true, scheduler threads wait until orchestration has fully built the // task graph before entering resolve_and_dispatch(). diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 50fd07531..a5c7696f6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -823,7 +823,10 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #ifdef INDEP_ORCH INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Barrier, orchestrator_done_.load(std::memory_order_relaxed)); - LOG_INFO_V9("[TraCR] Thread %d: Waiting before the Orch to finish: %d, orchestrator_done_=%d", g_TraCR_thread_idx, g_TraCR_thread_idx_counter.load(), orchestrator_done_.load(std::memory_order_relaxed)); + LOG_INFO_V9( + "[TraCR] Thread %d: Waiting before the Orch to finish: %d, orchestrator_done_=%d", g_TraCR_thread_idx, + g_TraCR_thread_idx_counter.load(), orchestrator_done_.load(std::memory_order_relaxed) + ); while (!orchestrator_done_.load(std::memory_order_acquire)) { SPIN_WAIT_HINT(); } @@ -1199,7 +1202,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // Phase 4: MIX-strict-priority 
dispatch with phase-split and // cross-thread idle gating. See dispatch_ready_tasks for the policy. INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase4, 0); - + #if PTO2_PROFILING uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; #endif diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp index 8d325439b..7d6593e96 100644 --- a/tools/tracr_simpler_api.hpp +++ b/tools/tracr_simpler_api.hpp @@ -11,13 +11,13 @@ /** * TraCR API functions for Simpler A2A3, A2A3sim, A5, A5sim - * + * * TODO: A5 not yet able to test */ #pragma once -#include // C++17 or newer +#include // C++17 or newer #include #include #include @@ -42,16 +42,13 @@ std::string tracr_dir = "~/ascend/tracr/proc.1"; /** * A function for defining the path of the TraCR traces in home */ -fs::path expand_user_path(const std::string& path) -{ +fs::path expand_user_path(const std::string &path) { if (!path.empty() && path[0] == '~') { - const char* home = std::getenv("HOME"); - if (!home) - throw std::runtime_error("HOME not set"); + const char *home = std::getenv("HOME"); + if (!home) throw std::runtime_error("HOME not set"); - std::string sub = path.substr(1); // remove ~ - if (!sub.empty() && sub[0] == '/') - sub = sub.substr(1); // remove leading slash + std::string sub = path.substr(1); // remove ~ + if (!sub.empty() && sub[0] == '/') sub = sub.substr(1); // remove leading slash return fs::path(home) / sub; } @@ -59,26 +56,24 @@ fs::path expand_user_path(const std::string& path) } /** - * + * */ -inline int TracrData2BTS(const TraCR::Payload* tracrData, const size_t* tracrDataSizes, const size_t num_threads) { +inline int TracrData2BTS(const TraCR::Payload *tracrData, const size_t *tracrDataSizes, const size_t num_threads) { fs::path base_dir = expand_user_path(tracr_dir); fs::create_directories(base_dir); for (uint32_t t = 0; t < num_threads; ++t) { - size_t num_traces = tracrDataSizes[t]; + size_t num_traces = tracrDataSizes[t]; - if 
(num_traces == 0) - continue; + if (num_traces == 0) continue; if (num_traces > TraCR::CAPACITY) { LOG_ERROR("Thread %u exceeds CAPACITY", t); return -1; } - fs::path thread_dir = - base_dir / ("thread." + std::to_string(t + 1)); + fs::path thread_dir = base_dir / ("thread." + std::to_string(t + 1)); fs::create_directories(thread_dir); @@ -90,13 +85,9 @@ inline int TracrData2BTS(const TraCR::Payload* tracrData, const size_t* tracrDat return -1; } - const TraCR::Payload* thread_ptr = - tracrData + t * TraCR::CAPACITY; + const TraCR::Payload *thread_ptr = tracrData + t * TraCR::CAPACITY; - out.write( - reinterpret_cast(thread_ptr), - num_traces * sizeof(TraCR::Payload) - ); + out.write(reinterpret_cast(thread_ptr), num_traces * sizeof(TraCR::Payload)); if (!out) { LOG_ERROR("Write failed for %s", file_path); @@ -114,17 +105,17 @@ int StoreTracrMetaData(RuntimeT &runtime) { fs::path base_dir = expand_user_path(tracr_dir); // Add the metadata.json - nlohmann::json metadata; + nlohmann::json metadata; // channel_names nlohmann::json channel_names = nlohmann::json::array(); - for(int i = 0; i < runtime.get_aicpu_thread_num(); ++i) { + for (int i = 0; i < runtime.get_aicpu_thread_num(); ++i) { channel_names.push_back("AICPU_" + std::to_string(i)); } - for(int i = 0; i < int(runtime.get_worker_count()/3); ++i) { + for (int i = 0; i < int(runtime.get_worker_count() / 3); ++i) { channel_names.push_back("AICube_" + std::to_string(i)); } - for(int i = 0; i < int(2*runtime.get_worker_count()/3); ++i) { + for (int i = 0; i < int(2 * runtime.get_worker_count() / 3); ++i) { channel_names.push_back("AIVector_" + std::to_string(i)); } channel_names.push_back("INVALID"); @@ -167,8 +158,9 @@ int StoreTracrMetaData(RuntimeT &runtime) { */ template int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { - static_assert(std::is_trivially_copyable_v, - "TraCR::Payload must be trivially copyable for raw binary dump"); + static_assert( + std::is_trivially_copyable_v, 
"TraCR::Payload must be trivially copyable for raw binary dump" + ); if (runtime.get_tracr_data() == nullptr) { LOG_ERROR("runtime.tracrData_ is a nullptr"); @@ -188,8 +180,9 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { // Download the tracrData_ from Device to Host size_t size = sizeof(TraCR::Payload) * TraCR::CAPACITY * runtime.get_aicpu_thread_num(); std::vector tracrData(TraCR::CAPACITY * runtime.get_aicpu_thread_num()); - int rc = device_runner->copy_from_device(reinterpret_cast(tracrData.data()), - reinterpret_cast(runtime.get_tracr_data()), size); + int rc = device_runner->copy_from_device( + reinterpret_cast(tracrData.data()), reinterpret_cast(runtime.get_tracr_data()), size + ); if (rc != 0) { LOG_ERROR("device_runner->copy_from_device 'tracrData' failed rc=%d", rc); return rc; @@ -198,15 +191,17 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { // Download the tracrDataSizes_ from Device to Host size = sizeof(size_t) * runtime.get_aicpu_thread_num(); std::vector tracrDataSizes(runtime.get_aicpu_thread_num()); - rc = device_runner->copy_from_device(reinterpret_cast(tracrDataSizes.data()), - reinterpret_cast(runtime.get_tracr_data_sizes()), size); + rc = device_runner->copy_from_device( + reinterpret_cast(tracrDataSizes.data()), reinterpret_cast(runtime.get_tracr_data_sizes()), size + ); if (rc != 0) { LOG_ERROR("device_runner->copy_from_device 'tracrDataSizes' failed rc=%d", rc); return rc; } // Now, store the traces into '~/ascend/tracr/' - tracr_dir = "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." + std::to_string(1000 + device_runner->device_id()); + tracr_dir = + "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." 
+ std::to_string(1000 + device_runner->device_id()); rc = TracrData2BTS(tracrData.data(), tracrDataSizes.data(), runtime.get_aicpu_thread_num()); if (rc != 0) { LOG_ERROR("TracrData2BTS() failed"); @@ -228,7 +223,7 @@ int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) { /** * A method for allocating memory on the device - * + * * Polymorphic to A2A3 and A5 (should be) */ template diff --git a/tools/tracr_simpler_markers.hpp b/tools/tracr_simpler_markers.hpp index 36d0b5290..b965eed25 100644 --- a/tools/tracr_simpler_markers.hpp +++ b/tools/tracr_simpler_markers.hpp @@ -33,34 +33,33 @@ inline std::atomic g_TraCR_thread_idx_counter{0}; // Global thread local thread idx placeholder inline thread_local int g_TraCR_thread_idx{-1}; -#define MARKER_TYPES \ - X(Orchestrating) \ - X(Read_Dimensions) \ - X(Reshape_Kernels) \ - X(Pre_Loop_Info) \ - X(PTO2_SCOPE_) \ - X(Scheduling) \ - X(Phase1) \ - X(Phase2) \ - X(Phase3) \ - X(Phase3b) \ - X(Phase4) \ - X(Drain) \ - X(Initializing) \ - X(De_Initializing) \ - X(DLL_loading) \ - X(Allocating) \ - X(Running_Task_Single) \ - X(Running_Task_Pair) \ +#define MARKER_TYPES \ + X(Orchestrating) \ + X(Read_Dimensions) \ + X(Reshape_Kernels) \ + X(Pre_Loop_Info) \ + X(PTO2_SCOPE_) \ + X(Scheduling) \ + X(Phase1) \ + X(Phase2) \ + X(Phase3) \ + X(Phase3b) \ + X(Phase4) \ + X(Drain) \ + X(Initializing) \ + X(De_Initializing) \ + X(DLL_loading) \ + X(Allocating) \ + X(Running_Task_Single) \ + X(Running_Task_Pair) \ X(Barrier) - enum MarkerType { #define X(name) name, MARKER_TYPES #undef X - MARKERTYPE_COUNT + MARKERTYPE_COUNT }; constexpr std::string_view MarkerTypeNames[] = { From 3b729e5b016ece5faa09d10a9a64c14879ffb704 Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Wed, 1 Jul 2026 15:24:35 +0200 Subject: [PATCH 11/14] updating tracr version --- tools/tracr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracr b/tools/tracr index 806fa6a84..f0574cdad 160000 --- a/tools/tracr +++ b/tools/tracr 
@@ -1 +1 @@ -Subproject commit 806fa6a849ddd4a1eb544080bc7ec1423a834287 +Subproject commit f0574cdad14f56da69706533f3fa8e4b75b4f476 From d03630000c64eb1f114a79a6080efa6269ae83cf Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Wed, 1 Jul 2026 14:41:44 +0200 Subject: [PATCH 12/14] style: ruff-format kernel_compiler.py TraCR changes Formatting only, via the repo's pinned ruff hooks (v0.14.8). Co-Authored-By: Claude Opus 4.8 --- simpler_setup/kernel_compiler.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/simpler_setup/kernel_compiler.py b/simpler_setup/kernel_compiler.py index 6d2e10f3f..85ac7aed5 100644 --- a/simpler_setup/kernel_compiler.py +++ b/simpler_setup/kernel_compiler.py @@ -146,7 +146,14 @@ def get_orchestration_include_dirs(self, runtime_name: str) -> list[str]: tracr_dir1 = str(self.project_root / "tools") tracr_dir2 = str(self.project_root / "tools" / "tracr" / "include") tracr_dir3 = str(self.project_root / "tools" / "tracr" / "extern") - return [runtime_dir, runtime_common_dir, common_dir, tracr_dir1, tracr_dir2, tracr_dir3] + self.get_platform_include_dirs() + return [ + runtime_dir, + runtime_common_dir, + common_dir, + tracr_dir1, + tracr_dir2, + tracr_dir3, + ] + self.get_platform_include_dirs() def get_incore_include_dirs(self) -> list[str]: """ @@ -491,11 +498,13 @@ def _compile_orchestration_shared_lib( cmd.append("-Wl,--build-id=sha1") if os.getenv("BUILD_TRACR", "OFF") == "ON": - cmd.extend([ - "-DENABLE_TRACR", - "-DTRACR_DISABLE_FLUSH", - "-DUSE_HW_COUNTER", - ]) + cmd.extend( + [ + "-DENABLE_TRACR", + "-DTRACR_DISABLE_FLUSH", + "-DUSE_HW_COUNTER", + ] + ) if extra_sources: for src in extra_sources: From 6c6c5097cff7bcc735bf7bd582a985e4c76a021e Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Wed, 1 Jul 2026 15:50:59 +0200 Subject: [PATCH 13/14] style: fix license headers on TraCR .cmake files check-headers requires the standard "# Copyright (c) PyPTO Contributors." 
header (PY_HEADER) for .cmake files; tracr.cmake and tracr_postprocessing_script.cmake carried a different Huawei header with an extra leading rule line. Also picks up trailing-whitespace / end-of-file fixes on tracr.cmake. Co-Authored-By: Claude Opus 4.8 --- tools/tracr.cmake | 7 +++---- tools/tracr_postprocessing_script.cmake | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/tracr.cmake b/tools/tracr.cmake index 12da91dda..55013897c 100644 --- a/tools/tracr.cmake +++ b/tools/tracr.cmake @@ -1,5 +1,4 @@ -# ----------------------------------------------------------------------------------------------------------- -# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# Copyright (c) PyPTO Contributors. # This program is free software, you can redistribute it and/or modify it under the terms and conditions of # CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. @@ -74,7 +73,7 @@ function(tracr_enable target) ) endif() - # As the traces are collected on the Ascend device, + # As the traces are collected on the Ascend device, # there is no need to store them on the device filesystem. target_compile_definitions(${target} PRIVATE TRACR_DISABLE_FLUSH USE_HW_COUNTER) @@ -112,4 +111,4 @@ function(tracr_enable target) # target_compile_definitions(${target} PRIVATE ENABLE_TRACR_DEBUG) # endif() endif() -endfunction() \ No newline at end of file +endfunction() diff --git a/tools/tracr_postprocessing_script.cmake b/tools/tracr_postprocessing_script.cmake index 0bcdd6a1e..b5cb4e49e 100644 --- a/tools/tracr_postprocessing_script.cmake +++ b/tools/tracr_postprocessing_script.cmake @@ -1,5 +1,4 @@ -# ----------------------------------------------------------------------------------------------------------- -# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# Copyright (c) PyPTO Contributors. 
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of # CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. From e5d7da405d1b14f8f662f7d64ce2f93523c08d0a Mon Sep 17 00:00:00 2001 From: Noah Baumann Date: Wed, 1 Jul 2026 16:06:42 +0200 Subject: [PATCH 14/14] style: add final newline to TraCR header files end-of-file-fixer requires a trailing newline; tracr_simpler_api.hpp and tracr_simpler_markers.hpp were missing it. Co-Authored-By: Claude Opus 4.8 --- tools/tracr_simpler_api.hpp | 2 +- tools/tracr_simpler_markers.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp index 7d6593e96..5b8110f9b 100644 --- a/tools/tracr_simpler_api.hpp +++ b/tools/tracr_simpler_api.hpp @@ -245,4 +245,4 @@ int DevAllocTraCR(DeviceRunnerT *device_runner, RuntimeT &runtime) { return -1; } return 0; -} \ No newline at end of file +} diff --git a/tools/tracr_simpler_markers.hpp b/tools/tracr_simpler_markers.hpp index b965eed25..99fd8cf59 100644 --- a/tools/tracr_simpler_markers.hpp +++ b/tools/tracr_simpler_markers.hpp @@ -66,4 +66,4 @@ constexpr std::string_view MarkerTypeNames[] = { #define X(name) #name, MARKER_TYPES #undef X -}; \ No newline at end of file +};