Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/plugins/intel_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ ie_option(ENABLE_MLAS_FOR_CPU "Enable MLAS for OpenVINO CPU Plugin" ${ENABLE_MLA

add_subdirectory(thirdparty)

# Optional Chrome-tracing profiler support for the CPU plugin (default OFF).
# When the option is ON, -DOV_CPU_WITH_PROFILER is added to the compile
# definitions so profiler code can be compiled in.
# NOTE(review): the "X86 OR X86_64" argument presumably limits the option to
# x86 targets, with the trailing OFF used elsewhere - confirm against the
# ie_dependent_option macro.
ie_dependent_option(ENABLE_CPU_PROFILER "enable CPU profiler" OFF "X86 OR X86_64" OFF)
if (ENABLE_CPU_PROFILER)
add_definitions(-DOV_CPU_WITH_PROFILER)
endif()

if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX")
endif()
Expand Down Expand Up @@ -136,7 +141,39 @@ cross_compiled_file(${TARGET_NAME}
NAME proposal_exec
NAMESPACE InferenceEngine::Extensions::Cpu::XARCH
)
# Register the attn_softmax kernel source with the cross-compilation helper,
# listing AVX512F, AVX2 and ANY as target architectures.
# NOTE(review): cross_compiled_file is a project macro; presumably it builds
# one variant of the source per listed ARCH and exposes NAME inside NAMESPACE
# through the API header - confirm against the macro definition.
cross_compiled_file(${TARGET_NAME}
ARCH AVX512F AVX2 ANY
src/nodes/vnode_attn_softmax.cpp
API src/nodes/vnode_attn_softmax.hpp
NAME attn_softmax
NAMESPACE InferenceEngine::Extensions::Cpu::XARCH
)
# Same registration for the rms_norm kernel; only AVX2 and ANY are listed here
# (no AVX512F variant).
cross_compiled_file(${TARGET_NAME}
ARCH AVX2 ANY
src/nodes/vnode_rms_norm.cpp
API src/nodes/vnode_rms_norm.hpp
NAME rms_norm
NAMESPACE InferenceEngine::Extensions::Cpu::XARCH
)

# Extra ISA flags applied only to src/nodes/vnode.cpp.
#
# COMPILE_OPTIONS is a semicolon-separated list, so the flags are added with
# APPEND. APPEND_STRING would concatenate them into a single bogus option
# (e.g. "-mavx2-mfma") because it joins values without any separator.
#
# NOTE(review): these are GCC/Clang-style flags - confirm how MSVC builds are
# expected to handle this source file.

# Baseline: AVX2 + FMA are always enabled for this file.
set_property(SOURCE src/nodes/vnode.cpp APPEND PROPERTY COMPILE_OPTIONS
"-mavx2" "-mfma")

# Enable AVX-512F only when the compiler can build a minimal AVX-512 snippet.
# -mfma is not repeated here: it is already added unconditionally above.
# NOTE(review): the probe runs without -mavx512f in CMAKE_REQUIRED_FLAGS, so on
# some compilers it may fail even though the flag is supported - confirm.
check_cxx_source_compiles("#include <immintrin.h>\nint main() {auto v_scale = _mm512_set1_ps(0.0f);return 0;}" ENABLE_AVX512)
if(ENABLE_AVX512)
set_property(SOURCE src/nodes/vnode.cpp APPEND PROPERTY COMPILE_OPTIONS
"-mavx512f")
endif()

# Enable AVX512-BF16 only when the BF16 intrinsics compile.
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" ENABLE_AVX512_BF16)
if(ENABLE_AVX512_BF16)
set_property(SOURCE src/nodes/vnode.cpp APPEND PROPERTY COMPILE_OPTIONS
"-mavx512bf16")
endif()
# system dependencies must go last
target_link_libraries(${TARGET_NAME} PRIVATE openvino::pugixml)
set_ie_threading_interface_for(${TARGET_NAME})
Expand Down
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/docs/chrome_tracing_profiler.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Profiler based on Chrome Tracing
Chrome Tracing is a profiling tool available in the Chrome browser at the URL `chrome://tracing/`. A CPU plugin compiled with `-DENABLE_CPU_PROFILER=ON` can generate JSON-format tracing logs that can be loaded into and viewed with this tool.

It does not replace ITT- and VTune-based profiling, but it provides a convenient and customizable alternative.

Setting the `OV_CPU_PROFILE` environment variable to `1` enables profiling and the generation of tracing logs.
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/docs/debug_capabilities/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ They can be activated at runtime and might be used for analyzing issues, getting

Use the following cmake option to enable debug capabilities:

`-DENABLE_DEBUG_CAPS=ON`
`-DENABLE_DEBUG_CAPS=ON -DENABLE_CPU_DEBUG_CAPS=ON`

* [Verbose mode](verbose.md)
* [Blob dumping](blob_dumping.md)
* [Graph serialization](graph_serialization.md)
* [Graph transformation disabling](feature_disabling.md#graph-transformations)
* [Logging](logging.md)
* [Inference Precision](infer_prc.md)
* Dump runtime models
* Setting the `OV_CPU_DUMP_MODELS` environment variable to `1` dumps runtime models as a cpp-style text file and an IR xml file.
* Performance summary
* Set the `OV_CPU_SUMMARY_PERF` environment variable to display a performance summary when the model is destructed.
* Internal performance counters are enabled automatically.
6 changes: 5 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,9 @@ const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_t
{ "Interaction", Type::Interaction},
{ "MHA", Type::MHA},
{ "Unique", Type::Unique},
{ "Ngram", Type::Ngram}
{ "Ngram", Type::Ngram},
{ "VNode", Type::VNode},
{ "RoPE", Type::RoPE}
};

Type TypeFromName(const std::string& type) {
Expand Down Expand Up @@ -313,6 +315,8 @@ std::string NameFromType(const Type type) {
CASE(MHA);
CASE(Unique);
CASE(Ngram);
CASE(VNode);
CASE(RoPE);
CASE(Unknown);
}
#undef CASE
Expand Down
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ enum class Type {
Interaction,
MHA,
Unique,
Ngram
Ngram,
VNode,
RoPE,
};

enum class Algorithm {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/exec_network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ ExecNetwork::GraphGuard::Lock ExecNetwork::GetGraph() const {
ctx = std::make_shared<GraphContext>(_cfg, extensionManager, weightsCache, isQuantizedFlag);
}
graphLock._graph.CreateGraph(_network, ctx);
DEBUG_DUMP_GRAPH(graphLock._graph, "cpu_Graph");
} catch (...) {
exception = std::current_exception();
}
Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
#include "transformations/cpu_opset/common/op/power_static.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/cpu_opset/common/op/ngram.hpp"
#include "transformations/cpu_opset/common/op/vnode.hpp"
#include "transformations/cpu_opset/x64/op/mha.hpp"
#include "transformations/cpu_opset/x64/op/interaction.hpp"
#include "transformations/cpu_opset/x64/op/rope.hpp"
#include "transformations/snippets/x64/op/load_convert.hpp"
#include "transformations/snippets/x64/op/store_convert.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
Expand Down Expand Up @@ -60,6 +62,8 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
NGRAPH_OP(NgramNode, ov::intel_cpu)
NGRAPH_OP_X64(MHANode, ov::intel_cpu)
NGRAPH_OP_X64(InteractionNode, ov::intel_cpu)
NGRAPH_OP_X64(VNode, ov::intel_cpu)
NGRAPH_OP_X64(RoPENode, ov::intel_cpu)
#undef NGRAPH_OP

return opset;
Expand Down
38 changes: 29 additions & 9 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include "utils/ngraph_utils.hpp"
#include "utils/cpu_utils.hpp"
#include "utils/verbose.h"
#include "utils/profiler.hpp"
#include "memory_desc/cpu_memory_desc_utils.h"

#include <openvino/core/model.hpp>
Expand All @@ -53,6 +54,7 @@
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
#include "transformations/cpu_opset/common/op/vnode.hpp"
#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
# include <tbb/task.h>
#endif
Expand Down Expand Up @@ -247,8 +249,13 @@ void Graph::Replicate(const CNNNetwork &network) {

if (op->get_type_info() == op::v0::Result::get_type_info_static()) {
const auto &input = op->input_value(0);
const auto name = op::util::get_ie_output_name(input);

std::string name;
// VNode may produce many results
if (auto vnode = ov::as_type_ptr<ov::intel_cpu::VNode>(input.get_node_shared_ptr())) {
name = vnode->get_output_name(input.get_index());
} else {
name = op::util::get_ie_output_name(input);
}
if (outputsInfo.count(name) != 0) {
outputNodesMap[name] = node;
}
Expand Down Expand Up @@ -647,6 +654,15 @@ void Graph::InitEdges() {

for (auto node : vecConsumers) {
if (node->getExecIndex() >= execIndex) {
DEBUG_LOG(edge->name(),
" needReorder due to execIndex: ",
node->getName(),
" ",
node->getExecIndex(),
">",
modifyingNode->getName(),
" ",
execIndex);
return true;
}
}
Expand Down Expand Up @@ -1000,7 +1016,7 @@ bool Graph::ProcessDynNodes() {

void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob::Ptr &in) {
if (!IsReady()) IE_THROW()<< "Wrong state. Topology not ready.";

PROFILE(_prof, "Graph::PushInputData");
auto input = inputNodesMap.find(name);
if (input != inputNodesMap.end()) {
auto& inTensorDesc = in->getTensorDesc();
Expand Down Expand Up @@ -1035,6 +1051,7 @@ void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob::

// suppose always being shared infer_request intel_cpu::Tensor to Graph if isDynamic.
void Graph::PullOutputData(BlobMap &out) {
PROFILE(_prof, "Graph::PullOutputData");
if (!IsReady())
IE_THROW() << "Wrong state. Topology not ready.";

Expand Down Expand Up @@ -1121,11 +1138,11 @@ void Graph::PullOutputData(BlobMap &out) {

void Graph::InferStatic(InferRequestBase* request) {
dnnl::stream stream(getEngine());

PROFILE(_prof0, std::string("Graph::InferStatic_#") + std::to_string(infer_count), {});
for (const auto& node : executableGraphNodes) {
VERBOSE(node, getConfig().debugCaps.verbose);
PERF(node, getConfig().collectPerfCounters);

PROFILE(_prof1, node->getTypeStr(), node->getName());
if (request)
request->ThrowIfCanceled();
ExecuteNode(node, stream);
Expand Down Expand Up @@ -1330,7 +1347,7 @@ class UpdateNodes : public UpdateNodesBase {

void Graph::InferDynamic(InferRequestBase* request) {
dnnl::stream stream(getEngine());

PROFILE(_prof0, std::string("Graph::InferDynamic_#") + std::to_string(infer_count));
std::set<size_t> syncIndsWorkSet;
for (const auto& nodeIndx : syncNodesInds) {
syncIndsWorkSet.insert(nodeIndx.second);
Expand All @@ -1349,12 +1366,15 @@ void Graph::InferDynamic(InferRequestBase* request) {
size_t inferCounter = 0;

for (auto stopIndx : syncIndsWorkSet) {
updateNodes->run(stopIndx);
{
PROFILE(_prof, "updateNodes");
updateNodes->run(stopIndx);
}
for (; inferCounter < stopIndx; ++inferCounter) {
auto& node = executableGraphNodes[inferCounter];
VERBOSE(node, getConfig().debugCaps.verbose);
PERF(node, getConfig().collectPerfCounters);

PROFILE(_prof, node->getTypeStr(), node->getName());
if (request)
request->ThrowIfCanceled();
ExecuteNode(node, stream);
Expand Down Expand Up @@ -1387,7 +1407,7 @@ void Graph::Infer(InferRequestBase* request) {
IE_THROW() << "Unknown ov::intel_cpu::Graph state: " << static_cast<size_t>(status);
}

if (infer_count != -1) infer_count++;
infer_count++;
}

void Graph::VisitNode(NodePtr node, std::vector<NodePtr>& sortedNodes) {
Expand Down
Loading