Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/templates/OpenVINOConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ macro(_ov_find_itt)
endmacro()

macro(_ov_find_intel_cpu_dependencies)
set(_OV_ENABLE_CPU_ACL "@DNNL_USE_ACL@")
set(_OV_ENABLE_CPU_ACL "@DNNL_AARCH64_USE_ACL@")
if(_OV_ENABLE_CPU_ACL)
set(_ov_in_install_tree "@PACKAGE_OPENVINO_LIB_DIR@")
if(_ov_in_install_tree)
Expand Down
2 changes: 1 addition & 1 deletion src/cmake/openvino.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ endif()

# build tree

if(DNNL_USE_ACL)
if(DNNL_AARCH64_USE_ACL)
list(APPEND BUILD_PATH_VARS "FIND_ACL_PATH;CMAKE_ARCHIVE_OUTPUT_DIRECTORY")
set(FIND_ACL_PATH "${intel_cpu_thirdparty_SOURCE_DIR}")
endif()
Expand Down
13 changes: 4 additions & 9 deletions src/plugins/intel_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,7 @@ set(OV_CPU_ARM_TARGET_GENERIC_ARCHS armv8a
armv8.6-a armv8.6-a-sve armv8.6-a-sve2 armv8.6-a-sve2-sme2
armv8r64 # the same as armv8.4-a
)
if(ARM)
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT armv7a)
set(OV_CPU_ARM_TARGET_ARCHS armv7a armv7a-hf
# requires estate=32
${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
elseif(AARCH64)
if(AARCH64)
if(APPLE)
set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
else()
Expand Down Expand Up @@ -153,7 +148,7 @@ if(OV_CPU_WITH_DNNL)
add_definitions(-DOV_CPU_WITH_DNNL)
endif()

if(DNNL_USE_ACL)
if(DNNL_AARCH64_USE_ACL)
add_definitions(-DOV_CPU_WITH_ACL)
set(OV_CPU_WITH_ACL ON)
endif()
Expand Down Expand Up @@ -183,7 +178,7 @@ if(NOT X86_64)
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/x64/*)
endif()

if(NOT (AARCH64 OR ARM))
if(NOT AARCH64)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
Expand Down Expand Up @@ -239,7 +234,7 @@ ov_add_plugin(NAME ${TARGET_NAME}
ADD_CLANG_TIDY)

# give a different file name depending on target platform architecture
if(ARM OR AARCH64)
if(AARCH64)
set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME "openvino_arm_cpu_plugin")
elseif(RISCV64)
set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME "openvino_riscv_cpu_plugin")
Expand Down
4 changes: 1 addition & 3 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,18 +496,16 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
if (!inferencePrecisionSetExplicitly) {
if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) {
inferencePrecision = ov::element::f32;
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
if (hasHardwareSupport(ov::element::f16)) {
inferencePrecision = ov::element::f16;
}
# if defined(OPENVINO_ARCH_ARM64)
// enforce fp32 inference precision for dynamic quantization
// to preserve fp32 matmul output precision
if (fcDynamicQuantizationGroupSizeSetExplicitly &&
fcDynamicQuantizationGroupSize == std::numeric_limits<uint64_t>::max()) {
inferencePrecision = ov::element::f32;
}
# endif
#endif
if (mayiuse(avx512_core_bf16)) {
inferencePrecision = ov::element::bf16;
Expand Down
39 changes: 2 additions & 37 deletions src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1039,39 +1039,7 @@ static void configure_arm64_linux_threads(Config& config,
}
#endif

#if defined(OPENVINO_ARCH_ARM) && defined(__linux__)
// Fills in config.modelPreferThreadsThroughput and config.modelPreferThreadsLatency.
//
// Throughput: ARM_THREADS_DEFAULT unless the memory-bandwidth tolerance figures
// select ARM_THREADS_HIGH (see the two cases below).
// Latency: the main-core count from row 0 of proc_type_table, extended with the
// efficient-core count when the helper predicates allow it.
void configure_arm_linux_threads(Config& config,
                                 const std::vector<std::vector<int>>& proc_type_table,
                                 const ov::MemBandwidthPressure& tolerance,
                                 bool int8_intensive,
                                 bool is_LLM) {
    using namespace ThreadPreferenceConstants;

    // Case 1: tolerance is UNKNOWN -> go high only when the conv ratio equals ALL.
    // Case 2: tolerance is known and below LIMITED -> go high when either the
    //         deconv or the gemm memory-limited ratio is above LIMITED.
    const bool toleranceUnknown = (tolerance.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN);
    const bool preferHighThroughput =
        toleranceUnknown
            ? (tolerance.ratio_compute_convs == ov::MemBandwidthPressure::ALL)
            : ((tolerance.max_mem_tolerance < ov::MemBandwidthPressure::LIMITED) &&
               ((tolerance.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED) ||
                (tolerance.ratio_mem_limited_gemms > ov::MemBandwidthPressure::LIMITED)));

    config.modelPreferThreadsThroughput = ARM_THREADS_DEFAULT;
    if (preferHighThroughput) {
        config.modelPreferThreadsThroughput = ARM_THREADS_HIGH;
    }

    const int mainCoreCount = proc_type_table[0][MAIN_CORE_PROC];
    const int efficientCoreCount = proc_type_table[0][EFFICIENT_CORE_PROC];

    // The LLM-specific predicate is consulted only when the generic latency
    // predicate already approved using all cores and the model is an LLM.
    const bool allCoresForLatency =
        should_use_all_cores_for_latency(mainCoreCount, efficientCoreCount, int8_intensive);
    const bool addEfficientCores =
        allCoresForLatency && (!is_LLM || should_use_ecores_for_llm(efficientCoreCount, mainCoreCount));

    config.modelPreferThreadsLatency = addEfficientCores ? mainCoreCount + efficientCoreCount : mainCoreCount;
}
#endif

#if (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
#if (defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
void configure_apple_threads(Config& config,
const std::vector<std::vector<int>>& proc_type_table,
const ov::MemBandwidthPressure& tolerance,
Expand Down Expand Up @@ -1244,10 +1212,7 @@ int get_model_prefer_threads(const int num_streams,
memThresholdAssumeLimitedForISA,
config.inferencePrecision);

# if defined(OPENVINO_ARCH_ARM) && defined(__linux__)
configure_arm_linux_threads(config, proc_type_table, networkToleranceForLowCache, int8_intensive, is_LLM);

# elif (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
# if (defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
configure_apple_threads(config,
proc_type_table,
networkToleranceForLowCache,
Expand Down
10 changes: 1 addition & 9 deletions src/plugins/intel_cpu/src/cpu_streams_calculation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,7 @@ void get_num_streams(int streams, const std::shared_ptr<ov::Model>& model, Confi
void sort_table_by_numa_node_id(int current_numa_node, std::vector<std::vector<int>>& proc_type_table);

// Internal configure_* helpers are declared below and are publicly callable.
#if defined(OPENVINO_ARCH_ARM) && defined(__linux__)
void configure_arm_linux_threads(Config& config,
const std::vector<std::vector<int>>& proc_type_table,
const ov::MemBandwidthPressure& tolerance,
bool int8_intensive,
bool is_LLM);
#endif

#if (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
#if (defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
void configure_apple_threads(Config& config,
const std::vector<std::vector<int>>& proc_type_table,
const ov::MemBandwidthPressure& tolerance,
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ void DnnlPostOpsComposer::appendBinary(const dnnl::algorithm alg, const std::vec
DEBUG_LOG("Append binary post op with algorithm: ", convert_to_c(alg), " Shape: ", Shape(*pdims));

ov::element::Type binaryType = ov::element::f32;
#if defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM)
#if defined(OPENVINO_ARCH_ARM64)
if (outDataType == dnnl::memory::data_type::f16) {
// ACL executor is not able to handle different precisions between convolution output and post op input
// in this case the original post op tensor is f32 even though the model runs in f16 precision
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
# include "openvino/runtime/properties.hpp"
#endif

#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
# include <common/primitive_desc_iface.hpp>

# include "onednn/iml_type_mapper.h"
Expand Down Expand Up @@ -606,7 +606,7 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc,
dstMemDesc.get(),
eng.get(),
attr.get());
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
// temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM
// pretend the reorder is not available to use Convert node instead
if (hasHardwareSupport(ov::element::f16) && (result != nullptr) &&
Expand Down
8 changes: 4 additions & 4 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) {
// The order of applying scales and shifts is different for ARM to get specific postops order:
// postops order on ARM: bias, scale, fq
// postops order on x86: scale, bias, fq
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias");
FuseConvolutionMatMulDeconvAndBias(graph);
graph.RemoveDroppedNodes();
Expand Down Expand Up @@ -280,7 +280,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph& graph) {
return false;
}
// The order of applying scales and shifts is different for ARM, so bias could be already fused here for ARM
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
return any_of(parentNode->getParentEdges().size(), 2U, 3U);
#else
return (parentNode->getParentEdges().size() == 2);
Expand Down Expand Up @@ -932,7 +932,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph& graph) {
const auto& graphNodes = graph.GetNodes();
// zero points fusing is skipped on ARM platforms because oneDNN is not involved in int8 convolution inference
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
return;
#endif

Expand Down Expand Up @@ -3414,7 +3414,7 @@ void GraphOptimizer::TailNodesPrecisionOptimize(Graph& graph) {
if (inferPrec != ov::element::f16) {
return;
}
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
return; // precision of configured by ov::pass::ConvertPrecision
#endif
const std::vector<NodePtr> outputNodes = [&] {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,23 @@

#include <arm_compute/core/CoreTypes.h>
#include <arm_compute/core/Error.h>
#include <arm_compute/core/Strides.h>
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/TensorShape.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/function_info/FullyConnectedLayerInfo.h>
#include <arm_compute/runtime/NEON/functions/NECast.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include <oneapi/dnnl/dnnl_common_types.h>
#include <oneapi/dnnl/dnnl_types.h>

#include <any>
#include <common/c_types_map.hpp>
#include <common/memory_desc_wrapper.hpp>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <oneapi/dnnl/dnnl.hpp>
Expand All @@ -29,7 +33,6 @@

#include "acl_utils.hpp"
#include "common/primitive_desc_iface.hpp"
#include "cpu/acl/acl_utils.hpp"
#include "cpu_memory.h"
#include "cpu_shape.h"
#include "cpu_types.h"
Expand All @@ -53,6 +56,71 @@
#include "utils/cpu_utils.hpp"
#include "utils/debug_capabilities.h"

namespace {

// Returns the smallest multiple of `block` that is >= `value`
// (for non-negative `value`). Asserts that `block` is strictly positive.
dnnl::impl::dim_t roundUpToBlock(dnnl::impl::dim_t value, int block) {
    OPENVINO_ASSERT(block > 0, "Unsupported ACL weight format block size: ", block);
    const auto blockCount = (value + block - 1) / block;
    return blockCount * block;
}

// Writes `value` as the stride of dimension `dim` in `strides`.
// Throws (OPENVINO_THROW) if `value` does not fit into uint32_t.
void setAclStride(arm_compute::Strides& strides, size_t dim, size_t value) {
    constexpr auto maxStride = std::numeric_limits<uint32_t>::max();
    const bool fitsInStride = value <= maxStride;
    if (!fitsInStride) {
        OPENVINO_THROW("ACL weight format stride is too large: ", value);
    }
    strides.set(dim, value);
}

// Rewrites `md` as a blocked memory descriptor matching `weightFormat` and
// re-initialises `info` with byte strides derived from it.
//
//   info         - ACL tensor info of the weights; data type, data layout and
//                  strides are updated in place.
//   md           - oneDNN memory descriptor of the weights; format kind,
//                  blocking description, padded dims (and possibly data type)
//                  are updated in place.
//   weightFormat - ACL weight format supplying the interleave and block factors.
//   inputDim     - index of the input dimension in `md`.
//   outputDim    - index of the output dimension in `md`.
//
// Throws through roundUpToBlock / setAclStride when a factor is non-positive or
// a byte stride exceeds the uint32_t range.
void reorderToAclFcWeightFormat(arm_compute::TensorInfo& info,
                                dnnl::impl::memory_desc_t& md,
                                arm_compute::WeightFormat weightFormat,
                                dnnl::impl::dim_t inputDim,
                                dnnl::impl::dim_t outputDim) {
    // Start from a clean blocked layout description.
    md.format_kind = dnnl::impl::format_kind::blocked;
    md.format_desc.blocking = dnnl::impl::blocking_desc_t{};
    auto& layout = md.format_desc.blocking;

    const int interleaveFactor = arm_compute::interleave_by(weightFormat);
    const int blockFactor = arm_compute::block_by(weightFormat);
    const bool isInterleaved = interleaveFactor > 1;
    const bool isBlocked = blockFactor > 1;

    // Input dimension: stride of one interleaved block, padded to the block factor.
    layout.strides[inputDim] = interleaveFactor * blockFactor;
    md.padded_dims[inputDim] = roundUpToBlock(md.dims[inputDim], blockFactor);

    // Output dimension: strided by a full interleaved row of padded inputs.
    const dnnl::impl::dim_t rowStride = interleaveFactor * md.padded_dims[inputDim];
    layout.strides[outputDim] = rowStride;
    md.padded_dims[outputDim] = roundUpToBlock(md.dims[outputDim], interleaveFactor);

    const dnnl::impl::dim_t batchStride = md.padded_dims[inputDim] * md.padded_dims[outputDim];

    // Inner blocks are described only for interleaved formats; the input-dim
    // block is added on top when the format is also blocked.
    if (isInterleaved) {
        layout.inner_nblks = isBlocked ? 2 : 1;
        layout.inner_idxs[0] = outputDim;
        layout.inner_blks[0] = interleaveFactor;
        if (isBlocked) {
            layout.inner_idxs[1] = inputDim;
            layout.inner_blks[1] = blockFactor;
        }
    }

    // Fast-math fixed formats store the weights as bf16 on both sides.
    if (arm_compute::is_fixed_format_fast_math(weightFormat)) {
        md.data_type = dnnl_bf16;
        info.set_data_type(arm_compute::DataType::BFLOAT16);
    }

    info.set_data_layout(arm_compute::DataLayout::UNKNOWN);

    // Element size is read only after the data type has been finalised above.
    const size_t elementBytes = info.element_size();
    arm_compute::Strides byteStrides = info.strides_in_bytes();
    setAclStride(byteStrides, 1, static_cast<size_t>(rowStride) * elementBytes);
    setAclStride(byteStrides, 2, static_cast<size_t>(batchStride) * elementBytes);

    const auto totalBytes = dnnl::impl::memory_desc_wrapper(md).size();
    info.init(info.tensor_shape(),
              info.num_channels(),
              info.data_type(),
              byteStrides,
              info.offset_first_element_in_bytes(),
              totalBytes);
}

} // namespace

namespace ov::intel_cpu {

VectorDims acl_fc_executor::makeDummyInputDims(const Shape& inShape, const Shape& wShape) {
Expand Down Expand Up @@ -284,15 +352,8 @@ MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs& memory,
if (isNeededReorder) {
dnnl::impl::dim_t o_dim = 0;
dnnl::impl::dim_t inner_dim = 1;
std::vector<dnnl::impl::dim_t> remaining_dims = {};
auto* weights_md_ = dnnlDstDesc->getDnnlDesc().get();
dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo,
*weights_md_,
expectedWeightFormat,
inner_dim,
o_dim,
remaining_dims,
{});
reorderToAclFcWeightFormat(weiTensorInfo, *weights_md_, expectedWeightFormat, inner_dim, o_dim);
if (aclfcAttrs.weightsNonTransposed) {
dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ static const TypeMapping dnnlConvTypeMapping {
{{_f16, _bf16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
// quantization configuration is not applicable for ARM
// because there is the dedicated low-precision implementation for ARM
#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_ARM)
#if !defined(OPENVINO_ARCH_ARM64)
// int8 conv does not support f16 output and bias
{{_u8 | _i8, _i8, _quant |_bf16 | _f32 | _i32 | _dynamic, _quant | _bf16 | _f32 | _i32 | _dynamic}, {bypass(), bypass(), bypass(), bypass()}},
{{_u8 | _i8, _i8, _f16, _u8 | _i8 | _i32 | _bf16 | _f32}, {bypass(), bypass(), just<f32>(), bypass()}},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs,
}
// by default fp16 matmul ACL kernels accumulate into fp32
// the default behaviour is changed by using f16 accumulator to improve performance
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
#if defined(OPENVINO_ARCH_ARM64)
if (srcDesc->getPrecision() == ov::element::f16 && weiDesc->getPrecision() == ov::element::f16 &&
dstDesc->getPrecision() == ov::element::f16) {
primAttrs.attr.set_accumulation_mode(dnnl::accumulation_mode::f16);
Expand Down
Loading
Loading