Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
52c714a
scope profiler squashed
moksiuc Oct 31, 2025
0cbe479
Type fix
moksiuc Nov 13, 2025
7c98f84
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Nov 17, 2025
8438227
Fixes
moksiuc Nov 17, 2025
fd2a6e1
Code review fixes
moksiuc Nov 18, 2025
88082d5
Fix clang build on PT level
moksiuc Nov 18, 2025
04f305c
Fix clang build on PT level 2nd approach
moksiuc Nov 18, 2025
dbc709f
lintrunner
moksiuc Nov 19, 2025
ad4b1dd
Improve json for presentation in perfetto
moksiuc Nov 20, 2025
e050a30
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Nov 21, 2025
37eafea
Fix gtest after json update
moksiuc Nov 21, 2025
a266b04
Put correct oneapi version in error msg
moksiuc Nov 21, 2025
4a70122
Fix correct PTI and OneApi version
moksiuc Nov 24, 2025
535b19e
lint
sraikund16 Dec 15, 2025
b639a8c
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Dec 17, 2025
e85fd26
Fix merge error
moksiuc Dec 17, 2025
115ae6c
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Dec 18, 2025
02b8684
Move device gathering to enableScopeProfiler
moksiuc Dec 18, 2025
371c860
lint
sraikund16 Dec 16, 2025
9e3f9fd
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Dec 19, 2025
0d46ae9
Resolve exception from destructor
moksiuc Dec 19, 2025
906085b
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Dec 22, 2025
898ce26
Fix Win build: error C2039: 'back_inserter': is not a member of 'std'
moksiuc Dec 22, 2025
a014058
Update min PTI version error msg
moksiuc Dec 23, 2025
3fe5339
Remove unnecessary includes
moksiuc Dec 23, 2025
75cd8f4
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Jan 15, 2026
b2c9521
Avoid adding getMetadata method to the interface
moksiuc Jan 15, 2026
d5c79dd
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Jan 20, 2026
0ebfe7b
Split classes to avoid excessive #ifdef macro usage
moksiuc Jan 16, 2026
aeec1ca
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Feb 2, 2026
2bfb1c0
resolve merge commit issues
moksiuc Feb 2, 2026
703ce04
Merge branch 'refs/heads/main' into moksiuci_6674_scope_profiler
moksiuc Feb 3, 2026
830cc33
Fix win build
moksiuc Feb 3, 2026
1b1a031
Fix win build
moksiuc Feb 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions libkineto/include/ActivityType.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,16 @@ enum class ActivityType {
HPU_OP = 19, // HPU host side runtime event
XPU_RUNTIME = 20, // host side xpu runtime events
XPU_DRIVER = 21, // host side xpu driver events
COLLECTIVE_COMM = 22, // collective communication
XPU_SCOPE_PROFILER = 22, // XPUPTI Profiler scope for performance metrics
COLLECTIVE_COMM = 23, // collective communication

// PRIVATEUSE1 Activity types are used for custom backends.
// The corresponding device type is `DeviceType::PrivateUse1` in PyTorch.
PRIVATEUSE1_RUNTIME = 23, // host side privateUse1 runtime events
PRIVATEUSE1_DRIVER = 24, // host side privateUse1 driver events
PRIVATEUSE1_RUNTIME = 24, // host side privateUse1 runtime events
PRIVATEUSE1_DRIVER = 25, // host side privateUse1 driver events

ENUM_COUNT =
25, // This is to add buffer and not used for any profiling logic. Add
26, // This is to add buffer and not used for any profiling logic. Add
// your new type before it.
OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME,
};
Expand Down
7 changes: 6 additions & 1 deletion libkineto/libkineto_defs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,13 @@ def get_libkineto_roctracer_srcs(with_api = True):
# Returns the list of XPUPTI plugin source files (all under
# src/plugin/xpupti/) concatenated with the result of
# get_libkineto_cpu_only_srcs(with_api).
#
# with_api is not used here directly; it is only forwarded to
# get_libkineto_cpu_only_srcs.
#
# NOTE(review): "XpuptiActivityProfiler.cpp" is listed twice below. This looks
# like the removed and re-added line of the diff shown together - confirm the
# final list contains it only once.
def get_libkineto_xpupti_srcs(with_api = True):
return [
"src/plugin/xpupti/XpuptiActivityApi.cpp",
"src/plugin/xpupti/XpuptiActivityProfiler.cpp",
"src/plugin/xpupti/XpuptiActivityApiV2.cpp",
"src/plugin/xpupti/XpuptiActivityHandlers.cpp",
"src/plugin/xpupti/XpuptiActivityHandlersV2.cpp",
"src/plugin/xpupti/XpuptiActivityProfiler.cpp",
"src/plugin/xpupti/XpuptiActivityProfilerSession.cpp",
"src/plugin/xpupti/XpuptiActivityProfilerSessionV1.cpp",
"src/plugin/xpupti/XpuptiProfilerMacros.cpp",
"src/plugin/xpupti/XpuptiScopeProfilerConfig.cpp",
] + (get_libkineto_cpu_only_srcs(with_api))

Expand Down
1 change: 1 addition & 0 deletions libkineto/src/ActivityType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ static constexpr std::array<ActivityTypeName, activityTypeCount + 1> map{
{"hpu_op", ActivityType::HPU_OP},
{"xpu_runtime", ActivityType::XPU_RUNTIME},
{"xpu_driver", ActivityType::XPU_DRIVER},
{"xpu_scope_profiler", ActivityType::XPU_SCOPE_PROFILER},
{"collective_comm", ActivityType::COLLECTIVE_COMM},
{"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME},
{"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER},
Expand Down
3 changes: 3 additions & 0 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
#include "DeviceUtil.h"
#include "KernelRegistry.h"
#include "Logger.h"
#ifdef HAS_XPUPTI
#include "plugin/xpupti/XpuptiActivityProfilerSession.h"
#endif

using namespace std::chrono;
using std::string;
Expand Down
16 changes: 3 additions & 13 deletions libkineto/src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "EventProfilerController.h"
#endif
#ifdef HAS_XPUPTI
#include "plugin/xpupti/XpuptiActivityApi.h"
#include "plugin/xpupti/XpuptiActivityApiV2.h"
#include "plugin/xpupti/XpuptiActivityProfiler.h"
#include "plugin/xpupti/XpuptiScopeProfilerConfig.h"
#endif
Expand Down Expand Up @@ -182,19 +182,9 @@ void libkineto_init(bool cpuOnly, bool logOnError) {
[]() -> std::unique_ptr<IActivityProfiler> {
auto returnCode = ptiViewGPULocalAvailable();
if (returnCode != PTI_SUCCESS) {
std::string errPrefixMsg(
"Fail to enable Kineto Profiler on XPU due to error code: ");
errPrefixMsg = errPrefixMsg + std::to_string(returnCode);
#if PTI_VERSION_AT_LEAST(0, 10)
std::string errMsg(ptiResultTypeToString(returnCode));
throw std::runtime_error(
errPrefixMsg + std::string(". The detailed error message is: ") +
errMsg);
#else
throw std::runtime_error(errPrefixMsg);
#endif
throwXpuRuntimeError(
"Fail to enable Kineto Profiler on XPU.", returnCode);
}

XpuptiScopeProfilerConfig::registerFactory();
return std::make_unique<XPUActivityProfiler>();
});
Expand Down
83 changes: 58 additions & 25 deletions libkineto/src/output_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,19 +468,18 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
if (!arg_values.empty()) {
arg_values.append(",");
}
arg_values.append(
fmt::format(
R"( "{}": {}, "{}": {}, "{}": {}, "{}": {}, "{}": {})",
kCollectiveName,
collectiveName,
kInMsgNelems,
inMsgSize,
kOutMsgNelems,
outMsgSize,
kGroupSize,
groupSize,
kDtype,
dtype));
arg_values.append(fmt::format(
R"( "{}": {}, "{}": {}, "{}": {}, "{}": {}, "{}": {})",
kCollectiveName,
collectiveName,
kInMsgNelems,
inMsgSize,
kOutMsgNelems,
outMsgSize,
kGroupSize,
groupSize,
kDtype,
dtype));
}
const auto& input_tensor_starts =
collectiveRecord->getMetadataValue(std::string(kInTensorsStart));
Expand Down Expand Up @@ -509,13 +508,12 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
if (!arg_values.empty()) {
arg_values.append(",");
}
arg_values.append(
fmt::format(
R"( "{}": {}, "{}": {})",
kInSplit,
inSplitSize,
kOutSplit,
outSplitSize));
arg_values.append(fmt::format(
R"( "{}": {}, "{}": {})",
kInSplit,
inSplitSize,
kOutSplit,
outSplitSize));
}
const auto& processGroupName =
collectiveRecord->getMetadataValue(std::string(kProcessGroupName));
Expand Down Expand Up @@ -591,16 +589,51 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) {
sanitizeStrForJSON(op_name);
sanitizeForNonReadableChars(op_name);

// clang-format off
ts = transToRelativeTime(ts);
fmt::print(traceOf_, R"JSON(

if (op.type() == ActivityType::XPU_SCOPE_PROFILER) {
std::string metricsStr = op.metadataJson();
std::string activityName = toString(op.type());
fmt::print(
traceOf_,
// clang-format off
R"JSON(
{{
"name": "{}",
"ph": "C",
"ts": {}.{:03},
"pid": {},
"tid": {},
"args": {{ {} }}
}},)JSON",
// clang-format on
activityName.substr(0, activityName.find('_')),
ts / 1000,
ts % 1000,
device,
sanitizeTid(resource),
metricsStr);
} else {
fmt::print(
traceOf_,
// clang-format off
R"JSON(
{{
"ph": "X", "cat": "{}", "name": "{}", "pid": {}, "tid": {},
"ts": {}.{:03}, "dur": {}.{:03}{}
}},)JSON",
toString(op.type()), op_name, device, sanitizeTid(resource),
ts/1000, ts %1000, duration/1000, duration %1000, args);
// clang-format on
// clang-format on
toString(op.type()),
op_name,
device,
sanitizeTid(resource),
ts / 1000,
ts % 1000,
duration / 1000,
duration % 1000,
args);
}

if (op.flowId() > 0) {
handleGenericLink(op);
}
Expand Down
47 changes: 25 additions & 22 deletions libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,13 @@

#include "XpuptiActivityApi.h"

#include <algorithm>
#include <chrono>
#include <vector>
#include <stdexcept>

namespace KINETO_NAMESPACE {

constexpr size_t kBufSize(4 * 1024 * 1024);

// Returns a reference to the single XpuptiActivityApi instance. The instance
// is a function-local static, so it is constructed on the first call and the
// same object is returned on every later call.
XpuptiActivityApi& XpuptiActivityApi::singleton() {
static XpuptiActivityApi instance;
return instance;
}

void XpuptiActivityApi::pushCorrelationID(int id, CorrelationFlowType type) {
void XpuptiActivityApiV1::pushCorrelationID(int id, CorrelationFlowType type) {
#ifdef HAS_XPUPTI
if (!singleton().externalCorrelationEnabled_) {
return;
Expand All @@ -38,7 +31,7 @@ void XpuptiActivityApi::pushCorrelationID(int id, CorrelationFlowType type) {
#endif
}

void XpuptiActivityApi::popCorrelationID(CorrelationFlowType type) {
void XpuptiActivityApiV1::popCorrelationID(CorrelationFlowType type) {
#ifdef HAS_XPUPTI
if (!singleton().externalCorrelationEnabled_) {
return;
Expand Down Expand Up @@ -68,13 +61,13 @@ static bool nextActivityRecord(
return record != nullptr;
}

void XpuptiActivityApi::bufferRequestedTrampoline(
void XpuptiActivityApiV1::bufferRequestedTrampoline(
uint8_t** buffer,
size_t* size) {
singleton().bufferRequested(buffer, size);
}

void XpuptiActivityApi::bufferRequested(uint8_t** buffer, size_t* size) {
void XpuptiActivityApiV1::bufferRequested(uint8_t** buffer, size_t* size) {
std::lock_guard<std::mutex> guard(mutex_);

auto buf = std::make_unique<XpuptiActivityBuffer>(kBufSize);
Expand All @@ -84,7 +77,8 @@ void XpuptiActivityApi::bufferRequested(uint8_t** buffer, size_t* size) {
allocatedGpuTraceBuffers_[*buffer] = std::move(buf);
}

std::unique_ptr<XpuptiActivityBufferMap> XpuptiActivityApi::activityBuffers() {
std::unique_ptr<XpuptiActivityBufferMap>
XpuptiActivityApiV1::activityBuffers() {
{
std::lock_guard<std::mutex> guard(mutex_);
if (allocatedGpuTraceBuffers_.empty()) {
Expand All @@ -102,7 +96,7 @@ std::unique_ptr<XpuptiActivityBufferMap> XpuptiActivityApi::activityBuffers() {
}

#ifdef HAS_XPUPTI
int XpuptiActivityApi::processActivitiesForBuffer(
int XpuptiActivityApiV1::processActivitiesForBuffer(
uint8_t* buf,
size_t validSize,
std::function<void(const pti_view_record_base*)> handler) {
Expand All @@ -118,7 +112,7 @@ int XpuptiActivityApi::processActivitiesForBuffer(
}
#endif

const std::pair<int, int> XpuptiActivityApi::processActivities(
const std::pair<int, int> XpuptiActivityApiV1::processActivities(
XpuptiActivityBufferMap& buffers,
std::function<void(const pti_view_record_base*)> handler) {
std::pair<int, int> res{0, 0};
Expand All @@ -132,13 +126,13 @@ const std::pair<int, int> XpuptiActivityApi::processActivities(
return res;
}

void XpuptiActivityApi::flushActivities() {
void XpuptiActivityApiV1::flushActivities() {
#ifdef HAS_XPUPTI
XPUPTI_CALL(ptiFlushAllViews());
#endif
}

void XpuptiActivityApi::clearActivities() {
void XpuptiActivityApiV1::clearActivities() {
{
std::lock_guard<std::mutex> guard(mutex_);
if (allocatedGpuTraceBuffers_.empty()) {
Expand All @@ -153,14 +147,14 @@ void XpuptiActivityApi::clearActivities() {
}

#ifdef HAS_XPUPTI
void XpuptiActivityApi::bufferCompletedTrampoline(
void XpuptiActivityApiV1::bufferCompletedTrampoline(
uint8_t* buffer,
size_t size,
size_t validSize) {
singleton().bufferCompleted(buffer, size, validSize);
}

void XpuptiActivityApi::bufferCompleted(
void XpuptiActivityApiV1::bufferCompleted(
uint8_t* buffer,
size_t size,
size_t validSize) {
Expand Down Expand Up @@ -202,8 +196,9 @@ static void enableSpecifcRuntimeAPIsTracing() {
}
#endif

void XpuptiActivityApi::enableXpuptiActivities(
const std::set<ActivityType>& selected_activities) {
void XpuptiActivityApiV1::enableXpuptiActivities(
const std::set<ActivityType>& selected_activities,
bool scopeProfilerActivityAccepted) {
#ifdef HAS_XPUPTI
XPUPTI_CALL(ptiViewSetCallbacks(
bufferRequestedTrampoline, bufferCompletedTrampoline));
Expand Down Expand Up @@ -245,6 +240,14 @@ void XpuptiActivityApi::enableXpuptiActivities(
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DRIVER_API));
break;

case ActivityType::XPU_SCOPE_PROFILER:
if (!scopeProfilerActivityAccepted) {
throw std::runtime_error(
"Intel® PTI version required to use scope profiler is at least 0.15 "
"(available with Intel® oneAPI in version at least 2025.3.1).");
}
break;

case ActivityType::OVERHEAD:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_COLLECTION_OVERHEAD));
break;
Expand All @@ -253,7 +256,7 @@ void XpuptiActivityApi::enableXpuptiActivities(
#endif
}

void XpuptiActivityApi::disablePtiActivities(
void XpuptiActivityApiV1::disablePtiActivities(
const std::set<ActivityType>& selected_activities) {
#ifdef HAS_XPUPTI
for (const auto& activity : selected_activities) {
Expand Down
19 changes: 8 additions & 11 deletions libkineto/src/plugin/xpupti/XpuptiActivityApi.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,32 @@
#include "XpuptiProfilerMacros.h"

#include "ActivityType.h"
#include "Config.h"

#include <pti/pti_view.h>

#include <functional>
#include <memory>
#include <mutex>
#include <optional>
#include <set>

namespace KINETO_NAMESPACE {

class XpuptiActivityApi {
class XpuptiActivityApiV1 {
public:
enum CorrelationFlowType { Default, User };

XpuptiActivityApi() = default;
XpuptiActivityApi(const XpuptiActivityApi&) = delete;
XpuptiActivityApi& operator=(const XpuptiActivityApi&) = delete;
XpuptiActivityApiV1() = default;
XpuptiActivityApiV1(const XpuptiActivityApiV1&) = delete;
XpuptiActivityApiV1& operator=(const XpuptiActivityApiV1&) = delete;

virtual ~XpuptiActivityApi() {}
virtual ~XpuptiActivityApiV1() {}

static XpuptiActivityApi& singleton();
static XpuptiActivityApiV1& singleton();

static void pushCorrelationID(int id, CorrelationFlowType type);
static void popCorrelationID(CorrelationFlowType type);

void enableXpuptiActivities(
const std::set<ActivityType>& selected_activities);
const std::set<ActivityType>& selected_activities,
bool scopeProfilerActivityAccepted = false);
void disablePtiActivities(const std::set<ActivityType>& selected_activities);
void clearActivities();
void flushActivities();
Expand Down
Loading