Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 81 additions & 37 deletions libkineto/include/ActivityType.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,97 @@

namespace libkineto {

// Note: Not all activity types are enabled by default. Please add new types
// at the correct position in the enum.
// Enumerates the activity (event) types known to libkineto.
// Canonical enumerators are numbered sequentially starting at CPU_OP = 0.
// Enumerators in the "Aliased Event Types" section are assigned the value of
// an existing canonical enumerator instead of receiving a new value.
// NOTE(review): this listing appears to interleave the removed and the added
// lines of a diff, so several enumerators (e.g. CPU_OP, COLLECTIVE_COMM,
// ENUM_COUNT) occur twice. Confirm against the real header before relying on
// the order shown here.
enum class ActivityType {
// Activity types enabled by default
CPU_OP = 0, // cpu side ops
USER_ANNOTATION,
GPU_USER_ANNOTATION,
GPU_MEMCPY,
GPU_MEMSET,
CONCURRENT_KERNEL, // on-device kernels
EXTERNAL_CORRELATION,
CUDA_RUNTIME, // host side cuda runtime events
CUDA_DRIVER, // host side cuda driver events
CPU_INSTANT_EVENT, // host side point-like events
PYTHON_FUNCTION,
OVERHEAD, // CUPTI induced overhead events sampled from its overhead API.
MTIA_RUNTIME, // host side MTIA runtime events
MTIA_CCP_EVENTS, // MTIA ondevice CCP events
MTIA_INSIGHT, // MTIA Insight Events
CUDA_SYNC, // synchronization events between runtime and kernels
CUDA_EVENT, // CUDA event activities (cudaEventRecord, etc.)
// -------------------------------------------------------------------------
// Accelerator-Agnostic Event Types
// -------------------------------------------------------------------------
// These are the canonical event types that work across all accelerators.
// Prefer using these over device-specific types for better extensibility
// and maintainability.

CPU_OP = 0, // CPU-side ops (e.g., from PyTorch)
USER_ANNOTATION, // User-defined annotations
GPU_USER_ANNOTATION, // GPU-side user annotations
GPU_MEMCPY, // Memory copy operations
GPU_MEMSET, // Memory set operations
CONCURRENT_KERNEL, // On-device kernel execution
EXTERNAL_CORRELATION, // Correlation with external events
RUNTIME, // Host-side runtime events
DRIVER, // Host-side driver events
CPU_INSTANT_EVENT, // Host-side point-like events
PYTHON_FUNCTION, // Python function calls
OVERHEAD, // Profiler-induced overhead events
COLLECTIVE_COMM, // Collective communication operations
GPU_PM_COUNTER, // Performance monitoring counters

// Optional Activity types
GLOW_RUNTIME, // host side glow runtime events
// -------------------------------------------------------------------------
// Device-Specific Event Types
// -------------------------------------------------------------------------
// These events don't fit into the accelerator-agnostic categories above.
// Use sparingly; prefer agnostic types when possible.

MTIA_INSIGHT, // MTIA Insight events
CUDA_SYNC, // CUDA synchronization events
CUDA_EVENT, // CUDA event activities (cudaEventRecord, etc.)
CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics
HPU_OP, // HPU host side runtime event
XPU_RUNTIME, // host side xpu runtime events
COLLECTIVE_COMM, // collective communication

// PRIVATEUSE1 Activity types are used for custom backends.
// The corresponding device type is `DeviceType::PrivateUse1` in PyTorch.
PRIVATEUSE1_RUNTIME, // host side privateUse1 runtime events
PRIVATEUSE1_DRIVER, // host side privateUse1 driver events

ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add
// your new type before it.
OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME,

// -------------------------------------------------------------------------
ENUM_COUNT, // Sentinel value; add new types above this line

// -------------------------------------------------------------------------
// Aliased Event Types (Deprecated)
// -------------------------------------------------------------------------
// These are aliases to accelerator-agnostic types for backward compatibility.
// Do NOT add new aliases. We aim to remove these in the future.

CUDA_RUNTIME = RUNTIME,
CUDA_DRIVER = DRIVER,
MTIA_RUNTIME = RUNTIME,
MTIA_CCP_EVENTS = CONCURRENT_KERNEL,
GLOW_RUNTIME = RUNTIME,
HPU_OP = RUNTIME,
XPU_RUNTIME = RUNTIME,

// PrivateUse1: Custom backend activity types
// Corresponds to DeviceType::PrivateUse1 in PyTorch.
PRIVATEUSE1_RUNTIME = RUNTIME,
PRIVATEUSE1_DRIVER = DRIVER,
};

// Returns the canonical string name of the given activity type.
const char* toString(ActivityType t);
// Converts a string name to its ActivityType. Canonical names are searched
// first, then the legacy alias names kept for backward compatibility.
// Throws std::invalid_argument when the name matches neither.
ActivityType toActivityType(const std::string& str);

// Number of activity types: the numeric value of the ENUM_COUNT sentinel.
constexpr int activityTypeCount = static_cast<int>(ActivityType::ENUM_COUNT);
// Number of activity types enabled by default: the numeric value of
// OPTIONAL_ACTIVITY_TYPE_START, i.e. the index of the first optional type.
constexpr int defaultActivityTypeCount =
    static_cast<int>(ActivityType::OPTIONAL_ACTIVITY_TYPE_START);

// Returns an array of all activity types. Aliases are not returned.
const std::array<ActivityType, activityTypeCount> activityTypes();
// Returns an array of the first defaultActivityTypeCount activity types.
const std::array<ActivityType, defaultActivityTypeCount> defaultActivityTypes();

// Activity types that are enabled by default during profiling.
//
// Only canonical enumerators are listed, each exactly once. The deprecated
// aliases MTIA_RUNTIME (== RUNTIME) and MTIA_CCP_EVENTS (== CONCURRENT_KERNEL)
// are deliberately omitted: they carry the same value as an entry that is
// already present, so listing them would store duplicates and inflate the
// element count derived from this array.
inline constexpr std::array defaultActivityTypesArray = {
    ActivityType::CPU_OP,
    ActivityType::USER_ANNOTATION,
    ActivityType::GPU_USER_ANNOTATION,
    ActivityType::GPU_MEMCPY,
    ActivityType::GPU_MEMSET,
    ActivityType::CONCURRENT_KERNEL, // also covers alias MTIA_CCP_EVENTS
    ActivityType::EXTERNAL_CORRELATION,
    ActivityType::RUNTIME, // also covers aliases CUDA_RUNTIME / MTIA_RUNTIME
    ActivityType::DRIVER, // also covers alias CUDA_DRIVER
    ActivityType::CPU_INSTANT_EVENT,
    ActivityType::PYTHON_FUNCTION,
    ActivityType::OVERHEAD,
    ActivityType::MTIA_INSIGHT,
    ActivityType::CUDA_SYNC,
    ActivityType::CUDA_EVENT,
};

// Number of entries in defaultActivityTypesArray, as an int.
constexpr int defaultActivityTypeCount =
    static_cast<int>(defaultActivityTypesArray.size());

// Returns a copy of defaultActivityTypesArray; usable in constant expressions.
constexpr auto defaultActivityTypes() {
  auto enabledByDefault = defaultActivityTypesArray;
  return enabledByDefault;
}

} // namespace libkineto
81 changes: 46 additions & 35 deletions libkineto/src/ActivityType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,69 @@ struct ActivityTypeName {
ActivityType type;
};

// Name table pairing each ActivityType with its string name, followed by a
// terminating ENUM_COUNT entry (hence activityTypeCount + 1 elements).
// Entry order matters: toString() indexes this table by enum value.
// NOTE(review): this looks like the pre-change version of the table; another
// definition of `map` appears further down in this listing. Confirm which one
// is current.
static constexpr std::array<ActivityTypeName, activityTypeCount + 1> map{
{{"cpu_op", ActivityType::CPU_OP},
{"user_annotation", ActivityType::USER_ANNOTATION},
{"gpu_user_annotation", ActivityType::GPU_USER_ANNOTATION},
{"gpu_memcpy", ActivityType::GPU_MEMCPY},
{"gpu_memset", ActivityType::GPU_MEMSET},
{"kernel", ActivityType::CONCURRENT_KERNEL},
{"external_correlation", ActivityType::EXTERNAL_CORRELATION},
{"cuda_runtime", ActivityType::CUDA_RUNTIME},
{"cuda_driver", ActivityType::CUDA_DRIVER},
{"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT},
{"python_function", ActivityType::PYTHON_FUNCTION},
{"overhead", ActivityType::OVERHEAD},
{"mtia_runtime", ActivityType::MTIA_RUNTIME},
{"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS},
{"mtia_insight", ActivityType::MTIA_INSIGHT},
{"cuda_sync", ActivityType::CUDA_SYNC},
{"cuda_event", ActivityType::CUDA_EVENT},
{"glow_runtime", ActivityType::GLOW_RUNTIME},
{"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE},
{"hpu_op", ActivityType::HPU_OP},
{"xpu_runtime", ActivityType::XPU_RUNTIME},
{"collective_comm", ActivityType::COLLECTIVE_COMM},
{"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME},
{"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER},
{"ENUM_COUNT", ActivityType::ENUM_COUNT}}};
// Canonical names for each unique ActivityType value, ordered by enum value.
// This array is used for toString() via direct indexing, so the entry at
// index i must be the enumerator whose value is i. The final ENUM_COUNT entry
// is a terminator (hence activityTypeCount + 1 elements); alias enumerators
// are not listed here.
static constexpr std::array<ActivityTypeName, activityTypeCount + 1> map{{
// Accelerator-Agnostic Event Types
{"cpu_op", ActivityType::CPU_OP},
{"user_annotation", ActivityType::USER_ANNOTATION},
{"gpu_user_annotation", ActivityType::GPU_USER_ANNOTATION},
{"gpu_memcpy", ActivityType::GPU_MEMCPY},
{"gpu_memset", ActivityType::GPU_MEMSET},
{"kernel", ActivityType::CONCURRENT_KERNEL},
{"external_correlation", ActivityType::EXTERNAL_CORRELATION},
{"runtime", ActivityType::RUNTIME},
{"driver", ActivityType::DRIVER},
{"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT},
{"python_function", ActivityType::PYTHON_FUNCTION},
{"overhead", ActivityType::OVERHEAD},
{"collective_comm", ActivityType::COLLECTIVE_COMM},
{"gpu_pm_counter", ActivityType::GPU_PM_COUNTER},
// Device-Specific Event Types
{"mtia_insight", ActivityType::MTIA_INSIGHT},
{"cuda_sync", ActivityType::CUDA_SYNC},
{"cuda_event", ActivityType::CUDA_EVENT},
{"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE},
// Terminator entry; stops the matchingOrder() scan.
{"ENUM_COUNT", ActivityType::ENUM_COUNT},
}};

static constexpr bool matchingOrder(int idx = 0) {
return map[idx].type == ActivityType::ENUM_COUNT ||
((idx == (int)map[idx].type) && matchingOrder(idx + 1));
}
static_assert(matchingOrder(), "ActivityTypeName map is out of order");

// Legacy string names accepted by toActivityType() for backward
// compatibility. Each one resolves to an alias enumerator, which shares its
// value with a canonical ActivityType; entries are grouped by that canonical
// type. Lookup is by name, so the order of entries carries no meaning.
static constexpr std::array<ActivityTypeName, 9> aliasMap{{
    // Aliases of ActivityType::RUNTIME
    {"cuda_runtime", ActivityType::CUDA_RUNTIME},
    {"mtia_runtime", ActivityType::MTIA_RUNTIME},
    {"glow_runtime", ActivityType::GLOW_RUNTIME},
    {"hpu_op", ActivityType::HPU_OP},
    {"xpu_runtime", ActivityType::XPU_RUNTIME},
    {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME},
    // Aliases of ActivityType::DRIVER
    {"cuda_driver", ActivityType::CUDA_DRIVER},
    {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER},
    // Alias of ActivityType::CONCURRENT_KERNEL
    {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS},
}};

// Returns the canonical name of `t` by indexing `map` with the enum's value.
// No range check is performed on the index.
const char* toString(ActivityType t) {
  const auto index = static_cast<int>(t);
  return map[index].name;
}

// Converts a string name to its ActivityType.
// Canonical names in `map` take precedence; the legacy names in `aliasMap`
// are consulted only when no canonical name matches.
// Throws std::invalid_argument when `str` matches neither table.
ActivityType toActivityType(const std::string& str) {
  // Only the first activityTypeCount entries are searched, which leaves out
  // the trailing ENUM_COUNT terminator entry.
  const auto canonicalEnd = map.begin() + activityTypeCount;
  for (auto entry = map.begin(); entry != canonicalEnd; ++entry) {
    if (str == entry->name) {
      return entry->type;
    }
  }
  // Fall back to the alias names kept for backward compatibility.
  for (const ActivityTypeName& legacy : aliasMap) {
    if (str == legacy.name) {
      return legacy.type;
    }
  }
  throw std::invalid_argument(fmt::format("Invalid activity type: {}", str));
}

Expand All @@ -71,13 +91,4 @@ const std::array<ActivityType, activityTypeCount> activityTypes() {
return res;
}

const std::array<ActivityType, defaultActivityTypeCount>
defaultActivityTypes() {
std::array<ActivityType, defaultActivityTypeCount> res;
for (int i = 0; i < defaultActivityTypeCount; i++) {
res[i] = map[i].type;
}
return res;
}

} // namespace libkineto
51 changes: 31 additions & 20 deletions libkineto/src/output_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
static const std::set<libkineto::ActivityType> excludedTypes = {
libkineto::ActivityType::GPU_MEMCPY,
libkineto::ActivityType::GPU_MEMSET,
libkineto::ActivityType::GPU_PM_COUNTER,
libkineto::ActivityType::CONCURRENT_KERNEL,
libkineto::ActivityType::CUDA_RUNTIME,
libkineto::ActivityType::CUDA_DRIVER,
Expand Down Expand Up @@ -450,19 +451,18 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
if (!arg_values.empty()) {
arg_values.append(",");
}
arg_values.append(
fmt::format(
R"( "{}": {}, "{}": {}, "{}": {}, "{}": {}, "{}": {})",
kCollectiveName,
collectiveName,
kInMsgNelems,
inMsgSize,
kOutMsgNelems,
outMsgSize,
kGroupSize,
groupSize,
kDtype,
dtype));
arg_values.append(fmt::format(
R"( "{}": {}, "{}": {}, "{}": {}, "{}": {}, "{}": {})",
kCollectiveName,
collectiveName,
kInMsgNelems,
inMsgSize,
kOutMsgNelems,
outMsgSize,
kGroupSize,
groupSize,
kDtype,
dtype));
}
const auto& input_tensor_starts =
collectiveRecord->getMetadataValue(kInTensorsStart);
Expand All @@ -489,13 +489,12 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
if (!arg_values.empty()) {
arg_values.append(",");
}
arg_values.append(
fmt::format(
R"( "{}": {}, "{}": {})",
kInSplit,
inSplitSize,
kOutSplit,
outSplitSize));
arg_values.append(fmt::format(
R"( "{}": {}, "{}": {})",
kInSplit,
inSplitSize,
kOutSplit,
outSplitSize));
}
const auto& processGroupName =
collectiveRecord->getMetadataValue(kProcessGroupName);
Expand Down Expand Up @@ -569,6 +568,18 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {

// clang-format off
ts = transToRelativeTime(ts);

if (op.type() == libkineto::ActivityType::GPU_PM_COUNTER) {
fmt::print(traceOf_, R"JSON(
{{
"ph": "C", "cat": "{}", "name": "{}", "pid": {}, "tid": {},
"ts": {}.{:03} {}
}},)JSON",
toString(op.type()), op_name, device, sanitizeTid(resource),
ts/1000, ts %1000, args);
return;
}

fmt::print(traceOf_, R"JSON(
{{
"ph": "X", "cat": "{}", "name": "{}", "pid": {}, "tid": {},
Expand Down
27 changes: 27 additions & 0 deletions libkineto/test/ConfigTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ TEST(ParseTest, ActivityTypes) {
EXPECT_TRUE(cfg.parse("ACTIVITY_TYPES="));
EXPECT_FALSE(cfg.parse("=ACTIVITY_TYPES="));

// Default activity types
EXPECT_EQ(
cfg.selectedActivityTypes(),
std::set<ActivityType>(
Expand Down Expand Up @@ -141,6 +142,32 @@ TEST(ParseTest, ActivityTypes) {
std::set<ActivityType>(
{ActivityType::PRIVATEUSE1_RUNTIME,
ActivityType::PRIVATEUSE1_DRIVER}));

// Generic events
EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = runtime, driver, kernel"));
EXPECT_EQ(
cfg2.selectedActivityTypes(),
std::set<ActivityType>(
{ActivityType::RUNTIME,
ActivityType::DRIVER,
ActivityType::CONCURRENT_KERNEL}));

// Generic events match aliases
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::CUDA_RUNTIME) > 0);
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::CUDA_DRIVER) > 0);
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::MTIA_RUNTIME) > 0);
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::MTIA_CCP_EVENTS) > 0);
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::XPU_RUNTIME) > 0);
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::PRIVATEUSE1_RUNTIME) >
0);
EXPECT_TRUE(
cfg2.selectedActivityTypes().count(ActivityType::PRIVATEUSE1_DRIVER) > 0);
}

TEST(ParseTest, SamplePeriod) {
Expand Down