From 93105fe032ec82bede47652d8b293cbb380184f8 Mon Sep 17 00:00:00 2001 From: Brian Coutinho Date: Tue, 23 Dec 2025 12:35:25 -0800 Subject: [PATCH] enable generic event types and add gpu pm counter event --- libkineto/include/ActivityType.h | 118 +++++++++++++++++++++---------- libkineto/src/ActivityType.cpp | 81 ++++++++++++--------- libkineto/src/output_json.cpp | 51 +++++++------ libkineto/test/ConfigTest.cpp | 27 +++++++ 4 files changed, 185 insertions(+), 92 deletions(-) diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index 702d6ebb2..ed74c5b4e 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -14,43 +14,61 @@ namespace libkineto { -// Note : All activity types are not enabled by default. Please add them -// at correct position in the enum enum class ActivityType { - // Activity types enabled by default - CPU_OP = 0, // cpu side ops - USER_ANNOTATION, - GPU_USER_ANNOTATION, - GPU_MEMCPY, - GPU_MEMSET, - CONCURRENT_KERNEL, // on-device kernels - EXTERNAL_CORRELATION, - CUDA_RUNTIME, // host side cuda runtime events - CUDA_DRIVER, // host side cuda driver events - CPU_INSTANT_EVENT, // host side point-like events - PYTHON_FUNCTION, - OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. - MTIA_RUNTIME, // host side MTIA runtime events - MTIA_CCP_EVENTS, // MTIA ondevice CCP events - MTIA_INSIGHT, // MTIA Insight Events - CUDA_SYNC, // synchronization events between runtime and kernels - CUDA_EVENT, // CUDA event activities (cudaEventRecord, etc.) + // ------------------------------------------------------------------------- + // Accelerator-Agnostic Event Types + // ------------------------------------------------------------------------- + // These are the canonical event types that work across all accelerators. + // Prefer using these over device-specific types for better extensibility + // and maintainability. + + CPU_OP = 0, // CPU-side ops (e.g., from PyTorch) + USER_ANNOTATION, // User-defined annotations + GPU_USER_ANNOTATION, // GPU-side user annotations + GPU_MEMCPY, // Memory copy operations + GPU_MEMSET, // Memory set operations + CONCURRENT_KERNEL, // On-device kernel execution + EXTERNAL_CORRELATION, // Correlation with external events + RUNTIME, // Host-side runtime events + DRIVER, // Host-side driver events + CPU_INSTANT_EVENT, // Host-side point-like events + PYTHON_FUNCTION, // Python function calls + OVERHEAD, // Profiler-induced overhead events + COLLECTIVE_COMM, // Collective communication operations + GPU_PM_COUNTER, // Performance monitoring counters - // Optional Activity types - GLOW_RUNTIME, // host side glow runtime events + // ------------------------------------------------------------------------- + // Device-Specific Event Types + // ------------------------------------------------------------------------- + // These events don't fit into the accelerator-agnostic categories above. + // Use sparingly; prefer agnostic types when possible. + + MTIA_INSIGHT, // MTIA Insight events + CUDA_SYNC, // CUDA synchronization events + CUDA_EVENT, // CUDA event activities (cudaEventRecord, etc.) CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics - HPU_OP, // HPU host side runtime event - XPU_RUNTIME, // host side xpu runtime events - COLLECTIVE_COMM, // collective communication - - // PRIVATEUSE1 Activity types are used for custom backends. - // The corresponding device type is `DeviceType::PrivateUse1` in PyTorch. 
-  PRIVATEUSE1_RUNTIME, // host side privateUse1 runtime events
-  PRIVATEUSE1_DRIVER, // host side privateUse1 driver events
-
-  ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add
-              // your new type before it.
-  OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME,
+
+  // -------------------------------------------------------------------------
+  ENUM_COUNT, // Sentinel value; add new types above this line
+
+  // -------------------------------------------------------------------------
+  // Aliased Event Types (Deprecated)
+  // -------------------------------------------------------------------------
+  // These are aliases to accelerator-agnostic types, kept for backward
+  // compatibility. Do NOT add new aliases; we aim to remove these in the future.
+
+  CUDA_RUNTIME = RUNTIME,
+  CUDA_DRIVER = DRIVER,
+  MTIA_RUNTIME = RUNTIME,
+  MTIA_CCP_EVENTS = CONCURRENT_KERNEL,
+  GLOW_RUNTIME = RUNTIME,
+  HPU_OP = RUNTIME,
+  XPU_RUNTIME = RUNTIME,
+
+  // PrivateUse1: custom backend activity types.
+  // Corresponds to DeviceType::PrivateUse1 in PyTorch.
+  PRIVATEUSE1_RUNTIME = RUNTIME,
+  PRIVATEUSE1_DRIVER = DRIVER,
 };
 
 const char* toString(ActivityType t);
@@ -58,9 +76,35 @@ ActivityType toActivityType(const std::string& str);
 
 // Return an array of all activity types except COUNT
 constexpr int activityTypeCount = (int)ActivityType::ENUM_COUNT;
-constexpr int defaultActivityTypeCount =
-    (int)ActivityType::OPTIONAL_ACTIVITY_TYPE_START;
+
+// Returns an array of all activity types; aliases are not included.
 const std::array<ActivityType, activityTypeCount> activityTypes();
-const std::array<ActivityType, defaultActivityTypeCount> defaultActivityTypes();
+
+// Activity types that are enabled by default during profiling
+inline constexpr std::array defaultActivityTypesArray = {
+    ActivityType::CPU_OP,
+    ActivityType::USER_ANNOTATION,
+    ActivityType::GPU_USER_ANNOTATION,
+    ActivityType::GPU_MEMCPY,
+    ActivityType::GPU_MEMSET,
+    ActivityType::CONCURRENT_KERNEL,
+    ActivityType::EXTERNAL_CORRELATION,
+    ActivityType::RUNTIME,
+    ActivityType::DRIVER,
+    ActivityType::CPU_INSTANT_EVENT,
+    ActivityType::PYTHON_FUNCTION,
+    ActivityType::OVERHEAD,
+    ActivityType::MTIA_RUNTIME,
+    ActivityType::MTIA_CCP_EVENTS,
+    ActivityType::MTIA_INSIGHT,
+    ActivityType::CUDA_SYNC,
+    ActivityType::CUDA_EVENT,
+};
+
+constexpr int defaultActivityTypeCount = defaultActivityTypesArray.size();
+
+constexpr auto defaultActivityTypes() {
+  return defaultActivityTypesArray;
+}
 
 } // namespace libkineto
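Note: because the deprecated names are plain enumerator aliases, each one shares the underlying value of its canonical type, so existing code that compares against the old names keeps working unchanged. A minimal sketch against the new header (assumes ActivityType.h is on the include path; the assertions follow directly from the initializers above):

    #include "ActivityType.h"
    using libkineto::ActivityType;

    // Aliases are the same enumerator value as their canonical type...
    static_assert(ActivityType::CUDA_RUNTIME == ActivityType::RUNTIME);
    static_assert(ActivityType::MTIA_CCP_EVENTS == ActivityType::CONCURRENT_KERNEL);
    // ...while distinct canonical types remain distinct.
    static_assert(ActivityType::RUNTIME != ActivityType::DRIVER);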
diff --git a/libkineto/src/ActivityType.cpp b/libkineto/src/ActivityType.cpp
index 4acd30420..0df4360b4 100644
--- a/libkineto/src/ActivityType.cpp
+++ b/libkineto/src/ActivityType.cpp
@@ -17,32 +17,31 @@ struct ActivityTypeName {
   ActivityType type;
 };
 
-static constexpr std::array<ActivityTypeName, activityTypeCount + 1> map{
-    {{"cpu_op", ActivityType::CPU_OP},
-     {"user_annotation", ActivityType::USER_ANNOTATION},
-     {"gpu_user_annotation", ActivityType::GPU_USER_ANNOTATION},
-     {"gpu_memcpy", ActivityType::GPU_MEMCPY},
-     {"gpu_memset", ActivityType::GPU_MEMSET},
-     {"kernel", ActivityType::CONCURRENT_KERNEL},
-     {"external_correlation", ActivityType::EXTERNAL_CORRELATION},
-     {"cuda_runtime", ActivityType::CUDA_RUNTIME},
-     {"cuda_driver", ActivityType::CUDA_DRIVER},
-     {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT},
-     {"python_function", ActivityType::PYTHON_FUNCTION},
-     {"overhead", ActivityType::OVERHEAD},
-     {"mtia_runtime", ActivityType::MTIA_RUNTIME},
-     {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS},
-     {"mtia_insight", ActivityType::MTIA_INSIGHT},
-     {"cuda_sync", ActivityType::CUDA_SYNC},
-     {"cuda_event", ActivityType::CUDA_EVENT},
-     {"glow_runtime", ActivityType::GLOW_RUNTIME},
-     {"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE},
-     {"hpu_op", ActivityType::HPU_OP},
-     {"xpu_runtime", ActivityType::XPU_RUNTIME},
-     {"collective_comm", ActivityType::COLLECTIVE_COMM},
-     {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME},
-     {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER},
-     {"ENUM_COUNT", ActivityType::ENUM_COUNT}}};
+// Canonical names for each unique ActivityType value, ordered by enum value.
+// This array is used for toString() via direct indexing.
+static constexpr std::array<ActivityTypeName, activityTypeCount + 1> map{{
+    // Accelerator-Agnostic Event Types
+    {"cpu_op", ActivityType::CPU_OP},
+    {"user_annotation", ActivityType::USER_ANNOTATION},
+    {"gpu_user_annotation", ActivityType::GPU_USER_ANNOTATION},
+    {"gpu_memcpy", ActivityType::GPU_MEMCPY},
+    {"gpu_memset", ActivityType::GPU_MEMSET},
+    {"kernel", ActivityType::CONCURRENT_KERNEL},
+    {"external_correlation", ActivityType::EXTERNAL_CORRELATION},
+    {"runtime", ActivityType::RUNTIME},
+    {"driver", ActivityType::DRIVER},
+    {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT},
+    {"python_function", ActivityType::PYTHON_FUNCTION},
+    {"overhead", ActivityType::OVERHEAD},
+    {"collective_comm", ActivityType::COLLECTIVE_COMM},
+    {"gpu_pm_counter", ActivityType::GPU_PM_COUNTER},
+    // Accelerator-Specific Event Types
+    {"mtia_insight", ActivityType::MTIA_INSIGHT},
+    {"cuda_sync", ActivityType::CUDA_SYNC},
+    {"cuda_event", ActivityType::CUDA_EVENT},
+    {"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE},
+    {"ENUM_COUNT", ActivityType::ENUM_COUNT},
+}};
 
 static constexpr bool matchingOrder(int idx = 0) {
   return map[idx].type == ActivityType::ENUM_COUNT ||
@@ -50,16 +49,37 @@ static constexpr bool matchingOrder(int idx = 0) {
 }
 static_assert(matchingOrder(), "ActivityTypeName map is out of order");
 
+// Alias names for backward compatibility in toActivityType().
+// These map old/alternate string names to their canonical ActivityType.
+static constexpr std::array<ActivityTypeName, 9> aliasMap{{
+    {"cuda_runtime", ActivityType::CUDA_RUNTIME},
+    {"cuda_driver", ActivityType::CUDA_DRIVER},
+    {"mtia_runtime", ActivityType::MTIA_RUNTIME},
+    {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS},
+    {"glow_runtime", ActivityType::GLOW_RUNTIME},
+    {"hpu_op", ActivityType::HPU_OP},
+    {"xpu_runtime", ActivityType::XPU_RUNTIME},
+    {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME},
+    {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER},
+}};
+
 const char* toString(ActivityType t) {
   return map[(int)t].name;
 }
 
 ActivityType toActivityType(const std::string& str) {
+  // Search canonical names first
   for (int i = 0; i < activityTypeCount; i++) {
    if (str == map[i].name) {
      return map[i].type;
    }
  }
+  // Search alias names for backward compatibility
+  for (const auto& alias : aliasMap) {
+    if (str == alias.name) {
+      return alias.type;
+    }
+  }
  throw std::invalid_argument(fmt::format("Invalid activity type: {}", str));
 }
 
@@ -71,13 +91,4 @@ const std::array<ActivityType, activityTypeCount> activityTypes() {
   return res;
 }
 
-const std::array<ActivityType, defaultActivityTypeCount>
-defaultActivityTypes() {
-  std::array<ActivityType, defaultActivityTypeCount> res;
-  for (int i = 0; i < defaultActivityTypeCount; i++) {
-    res[i] = map[i].type;
-  }
-  return res;
-}
-
 } // namespace libkineto
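Note: name resolution is intentionally asymmetric here. toActivityType() accepts both canonical and alias strings, while toString() always returns the canonical name, so alias spellings are normalized on a round trip. A minimal sketch of this behavior (assumes ActivityType.h is on the include path):

    #include <cassert>
    #include <cstring>
    #include "ActivityType.h"

    int main() {
      // The alias string resolves through aliasMap to the canonical enumerator...
      auto t = libkineto::toActivityType("xpu_runtime");
      assert(t == libkineto::ActivityType::RUNTIME);
      // ...and serializing it back yields the canonical name, not the alias.
      assert(std::strcmp(libkineto::toString(t), "runtime") == 0);
      return 0;
    }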
diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp
index 87bf8baf8..666108226 100644
--- a/libkineto/src/output_json.cpp
+++ b/libkineto/src/output_json.cpp
@@ -412,6 +412,7 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
   static const std::set<libkineto::ActivityType> excludedTypes = {
       libkineto::ActivityType::GPU_MEMCPY,
       libkineto::ActivityType::GPU_MEMSET,
+      libkineto::ActivityType::GPU_PM_COUNTER,
       libkineto::ActivityType::CONCURRENT_KERNEL,
       libkineto::ActivityType::CUDA_RUNTIME,
       libkineto::ActivityType::CUDA_DRIVER,
@@ -450,19 +451,18 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
     if (!arg_values.empty()) {
       arg_values.append(",");
     }
-    arg_values.append(
-        fmt::format(
-            R"( "{}": {}, "{}": {}, "{}": {}, "{}": {}, "{}": {})",
-            kCollectiveName,
-            collectiveName,
-            kInMsgNelems,
-            inMsgSize,
-            kOutMsgNelems,
-            outMsgSize,
-            kGroupSize,
-            groupSize,
-            kDtype,
-            dtype));
+    arg_values.append(fmt::format(
+        R"( "{}": {}, "{}": {}, "{}": {}, "{}": {}, "{}": {})",
+        kCollectiveName,
+        collectiveName,
+        kInMsgNelems,
+        inMsgSize,
+        kOutMsgNelems,
+        outMsgSize,
+        kGroupSize,
+        groupSize,
+        kDtype,
+        dtype));
   }
   const auto& input_tensor_starts = collectiveRecord->getMetadataValue(kInTensorsStart);
@@ -489,13 +489,12 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
     if (!arg_values.empty()) {
       arg_values.append(",");
     }
-    arg_values.append(
-        fmt::format(
-            R"( "{}": {}, "{}": {})",
-            kInSplit,
-            inSplitSize,
-            kOutSplit,
-            outSplitSize));
+    arg_values.append(fmt::format(
+        R"( "{}": {}, "{}": {})",
+        kInSplit,
+        inSplitSize,
+        kOutSplit,
+        outSplitSize));
   }
   const auto& processGroupName = collectiveRecord->getMetadataValue(kProcessGroupName);
@@ -569,6 +568,18 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
 
   // clang-format off
   ts = transToRelativeTime(ts);
+
+  if (op.type() == libkineto::ActivityType::GPU_PM_COUNTER) {
+    fmt::print(traceOf_, R"JSON(
+  {{
+    "ph": "C", "cat": "{}", "name": "{}", "pid": {}, "tid": {},
+    "ts": {}.{:03}{}
+  }},)JSON",
+        toString(op.type()), op_name, device, sanitizeTid(resource),
+        ts / 1000, ts % 1000, args);
+    return;
+  }
+
   fmt::print(traceOf_, R"JSON(
   {{
     "ph": "X", "cat": "{}", "name": "{}", "pid": {}, "tid": {},
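Note: GPU_PM_COUNTER activities are emitted as Chrome trace counter events ("ph": "C") rather than complete events ("ph": "X"): a counter sample has a timestamp but no "dur", and the trace viewer renders each counter name as a time-series track. An illustrative record produced by the format string above (the counter name and args payload are hypothetical; real values come from op_name and the activity's metadata):

    {
      "ph": "C", "cat": "gpu_pm_counter", "name": "sm_active", "pid": 0, "tid": 7,
      "ts": 1234.567,
      "args": { "value": 42 }
    },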
diff --git a/libkineto/test/ConfigTest.cpp b/libkineto/test/ConfigTest.cpp
index 7420d93c0..704f7d31d 100644
--- a/libkineto/test/ConfigTest.cpp
+++ b/libkineto/test/ConfigTest.cpp
@@ -86,6 +86,7 @@ TEST(ParseTest, ActivityTypes) {
   EXPECT_TRUE(cfg.parse("ACTIVITY_TYPES="));
   EXPECT_FALSE(cfg.parse("=ACTIVITY_TYPES="));
 
+  // Default activity types
   EXPECT_EQ(
       cfg.selectedActivityTypes(),
       std::set<ActivityType>(
@@ -141,6 +142,32 @@ TEST(ParseTest, ActivityTypes) {
       std::set<ActivityType>(
          {ActivityType::PRIVATEUSE1_RUNTIME,
           ActivityType::PRIVATEUSE1_DRIVER}));
+
+  // Generic events
+  EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = runtime, driver, kernel"));
+  EXPECT_EQ(
+      cfg2.selectedActivityTypes(),
+      std::set<ActivityType>(
+          {ActivityType::RUNTIME,
+           ActivityType::DRIVER,
+           ActivityType::CONCURRENT_KERNEL}));
+
+  // Generic events match aliases
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::CUDA_RUNTIME) > 0);
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::CUDA_DRIVER) > 0);
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::MTIA_RUNTIME) > 0);
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::MTIA_CCP_EVENTS) > 0);
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::XPU_RUNTIME) > 0);
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::PRIVATEUSE1_RUNTIME) >
+      0);
+  EXPECT_TRUE(
+      cfg2.selectedActivityTypes().count(ActivityType::PRIVATEUSE1_DRIVER) > 0);
 }
 
 TEST(ParseTest, SamplePeriod) {
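Note: the alias assertions above need no extra lookup logic in Config. Since ActivityType::CUDA_RUNTIME and ActivityType::RUNTIME are the same enumerator value, a std::set that contains RUNTIME necessarily reports every runtime alias as present. A minimal sketch of the property under test (assumes ActivityType.h is on the include path):

    #include <cassert>
    #include <set>
    #include "ActivityType.h"
    using libkineto::ActivityType;

    int main() {
      std::set<ActivityType> selected{ActivityType::RUNTIME};
      // Same underlying value, so the alias is found in the set.
      assert(selected.count(ActivityType::CUDA_RUNTIME) == 1);
      return 0;
    }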