openvinotoolkit · allnes · Jun 1, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
@@ -369,7 +369,7 @@ macro(_ov_find_itt)
 endmacro()
 
 macro(_ov_find_intel_cpu_dependencies)
-    set(_OV_ENABLE_CPU_ACL "@DNNL_USE_ACL@")
+    set(_OV_ENABLE_CPU_ACL "@DNNL_AARCH64_USE_ACL@")
     if(_OV_ENABLE_CPU_ACL)
         set(_ov_in_install_tree "@PACKAGE_OPENVINO_LIB_DIR@")
         if(_ov_in_install_tree)

@@ -227,7 +227,7 @@ endif()
 
 # build tree
 
-if(DNNL_USE_ACL)
+if(DNNL_AARCH64_USE_ACL)
     list(APPEND BUILD_PATH_VARS "FIND_ACL_PATH;CMAKE_ARCHIVE_OUTPUT_DIRECTORY")
     set(FIND_ACL_PATH "${intel_cpu_thirdparty_SOURCE_DIR}")
 endif()

@@ -87,12 +87,7 @@ set(OV_CPU_ARM_TARGET_GENERIC_ARCHS armv8a
                                     armv8.6-a armv8.6-a-sve armv8.6-a-sve2 armv8.6-a-sve2-sme2
                                     armv8r64 # the same as armv8.4-a
 )
-if(ARM)
-    set(OV_CPU_ARM_TARGET_ARCH_DEFAULT armv7a)
-    set(OV_CPU_ARM_TARGET_ARCHS armv7a armv7a-hf
-                                # requires estate=32
-                                ${OV_CPU_ARM_TARGET_GENERIC_ARCHS})
-elseif(AARCH64)
+if(AARCH64)
     if(APPLE)
         set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a)
     else()
@@ -153,7 +148,7 @@ if(OV_CPU_WITH_DNNL)
     add_definitions(-DOV_CPU_WITH_DNNL)
 endif()
 
-if(DNNL_USE_ACL)
+if(DNNL_AARCH64_USE_ACL)
     add_definitions(-DOV_CPU_WITH_ACL)
     set(OV_CPU_WITH_ACL ON)
 endif()
@@ -183,7 +178,7 @@ if(NOT X86_64)
                               ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/x64/*)
 endif()
 
-if(NOT (AARCH64 OR ARM))
+if(NOT AARCH64)
     list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
                               ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/aarch64/*
                               ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
@@ -239,7 +234,7 @@ ov_add_plugin(NAME ${TARGET_NAME}
               ADD_CLANG_TIDY)
 
 # give a different file name depending on target platform architecture
-if(ARM OR AARCH64)
+if(AARCH64)
     set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME "openvino_arm_cpu_plugin")
 elseif(RISCV64)
     set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME "openvino_riscv_cpu_plugin")

@@ -496,18 +496,16 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
     if (!inferencePrecisionSetExplicitly) {
         if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) {
             inferencePrecision = ov::element::f32;
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
             if (hasHardwareSupport(ov::element::f16)) {
                 inferencePrecision = ov::element::f16;
             }
-#    if defined(OPENVINO_ARCH_ARM64)
             // enforce fp32 inference precision for dynamic quantization
             // to preserve fp32 matmul output precision
             if (fcDynamicQuantizationGroupSizeSetExplicitly &&
                 fcDynamicQuantizationGroupSize == std::numeric_limits<uint64_t>::max()) {
                 inferencePrecision = ov::element::f32;
             }
-#    endif
 #endif
             if (mayiuse(avx512_core_bf16)) {
                 inferencePrecision = ov::element::bf16;

@@ -1039,39 +1039,7 @@ static void configure_arm64_linux_threads(Config& config,
 }
 #endif
 
-#if defined(OPENVINO_ARCH_ARM) && defined(__linux__)
-void configure_arm_linux_threads(Config& config,
-                                 const std::vector<std::vector<int>>& proc_type_table,
-                                 const ov::MemBandwidthPressure& tolerance,
-                                 bool int8_intensive,
-                                 bool is_LLM) {
-    using namespace ThreadPreferenceConstants;
-    config.modelPreferThreadsThroughput = ARM_THREADS_DEFAULT;
-
-    if (tolerance.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
-        if (tolerance.ratio_compute_convs == ov::MemBandwidthPressure::ALL) {
-            config.modelPreferThreadsThroughput = ARM_THREADS_HIGH;
-        }
-    } else if ((tolerance.max_mem_tolerance < ov::MemBandwidthPressure::LIMITED) &&
-               ((tolerance.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED) ||
-                (tolerance.ratio_mem_limited_gemms > ov::MemBandwidthPressure::LIMITED))) {
-        config.modelPreferThreadsThroughput = ARM_THREADS_HIGH;
-    }
-
-    const int main_cores = proc_type_table[0][MAIN_CORE_PROC];
-    const int efficient_cores = proc_type_table[0][EFFICIENT_CORE_PROC];
-
-    bool use_all_cores = should_use_all_cores_for_latency(main_cores, efficient_cores, int8_intensive);
-
-    if (use_all_cores && (!is_LLM || should_use_ecores_for_llm(efficient_cores, main_cores))) {
-        config.modelPreferThreadsLatency = main_cores + efficient_cores;
-    } else {
-        config.modelPreferThreadsLatency = main_cores;
-    }
-}
-#endif
-
-#if (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
+#if (defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
 void configure_apple_threads(Config& config,
                              const std::vector<std::vector<int>>& proc_type_table,
                              const ov::MemBandwidthPressure& tolerance,
@@ -1244,10 +1212,7 @@ int get_model_prefer_threads(const int num_streams,
                                                  memThresholdAssumeLimitedForISA,
                                                  config.inferencePrecision);
 
-#    if defined(OPENVINO_ARCH_ARM) && defined(__linux__)
-        configure_arm_linux_threads(config, proc_type_table, networkToleranceForLowCache, int8_intensive, is_LLM);
-
-#    elif (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
+#    if (defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
         configure_apple_threads(config,
                                 proc_type_table,
                                 networkToleranceForLowCache,

@@ -124,15 +124,7 @@ void get_num_streams(int streams, const std::shared_ptr<ov::Model>& model, Confi
 void sort_table_by_numa_node_id(int current_numa_node, std::vector<std::vector<int>>& proc_type_table);
 
 // Internal configure_* helpers are declared below and are publicly callable.
-#if defined(OPENVINO_ARCH_ARM) && defined(__linux__)
-void configure_arm_linux_threads(Config& config,
-                                 const std::vector<std::vector<int>>& proc_type_table,
-                                 const ov::MemBandwidthPressure& tolerance,
-                                 bool int8_intensive,
-                                 bool is_LLM);
-#endif
-
-#if (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
+#if (defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
 void configure_apple_threads(Config& config,
                              const std::vector<std::vector<int>>& proc_type_table,
                              const ov::MemBandwidthPressure& tolerance,

@@ -487,7 +487,7 @@ void DnnlPostOpsComposer::appendBinary(const dnnl::algorithm alg, const std::vec
     DEBUG_LOG("Append binary post op with algorithm: ", convert_to_c(alg), " Shape: ", Shape(*pdims));
 
     ov::element::Type binaryType = ov::element::f32;
-#if defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM)
+#if defined(OPENVINO_ARCH_ARM64)
     if (outDataType == dnnl::memory::data_type::f16) {
         // ACL executor is not able to handle different precisions between convolution output and post op input
         // in this case original post op tensor is f32 even the model runs in f16 precision

@@ -92,7 +92,7 @@
 #    include "openvino/runtime/properties.hpp"
 #endif
 
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
 #    include <common/primitive_desc_iface.hpp>
 
 #    include "onednn/iml_type_mapper.h"
@@ -606,7 +606,7 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc,
                                                      dstMemDesc.get(),
                                                      eng.get(),
                                                      attr.get());
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
     // temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM
     // pretend the reorder is not available to use Convert node instead
     if (hasHardwareSupport(ov::element::f16) && (result != nullptr) &&

@@ -101,7 +101,7 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) {
 // The order of applying scales and shifts is different for ARM to get specific postops order:
 // postops order on ARM: bias, scale, fq
 // postops order on x86: scale, bias, fq
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias");
     FuseConvolutionMatMulDeconvAndBias(graph);
     graph.RemoveDroppedNodes();
@@ -280,7 +280,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph& graph) {
             return false;
         }
 // The order of applying scales and shifts is different for ARM, so bias could be already fused here for ARM
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
         return any_of(parentNode->getParentEdges().size(), 2U, 3U);
 #else
         return (parentNode->getParentEdges().size() == 2);
@@ -932,7 +932,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
 void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph& graph) {
     const auto& graphNodes = graph.GetNodes();
 // zero points fusing is skipped on ARM platforms because oneDNN is not involved into int8 convolution inference
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
     return;
 #endif
 
@@ -3414,7 +3414,7 @@ void GraphOptimizer::TailNodesPrecisionOptimize(Graph& graph) {
     if (inferPrec != ov::element::f16) {
         return;
     }
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
     return;  // precision of configured by ov::pass::ConvertPrecision
 #endif
     const std::vector<NodePtr> outputNodes = [&] {

@@ -5,19 +5,23 @@
 
 #include <arm_compute/core/CoreTypes.h>
 #include <arm_compute/core/Error.h>
+#include <arm_compute/core/Strides.h>
 #include <arm_compute/core/TensorInfo.h>
 #include <arm_compute/core/TensorShape.h>
 #include <arm_compute/core/Types.h>
 #include <arm_compute/function_info/FullyConnectedLayerInfo.h>
 #include <arm_compute/runtime/NEON/functions/NECast.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <oneapi/dnnl/dnnl_common_types.h>
 #include <oneapi/dnnl/dnnl_types.h>
 
 #include <any>
 #include <common/c_types_map.hpp>
+#include <common/memory_desc_wrapper.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
+#include <limits>
 #include <memory>
 #include <numeric>
 #include <oneapi/dnnl/dnnl.hpp>
@@ -29,7 +33,6 @@
 
 #include "acl_utils.hpp"
 #include "common/primitive_desc_iface.hpp"
-#include "cpu/acl/acl_utils.hpp"
 #include "cpu_memory.h"
 #include "cpu_shape.h"
 #include "cpu_types.h"
@@ -53,6 +56,71 @@
 #include "utils/cpu_utils.hpp"
 #include "utils/debug_capabilities.h"
 
+namespace {
+// Rounds a non-negative `value` up to the nearest multiple of `block`; asserts that `block` is positive.
+dnnl::impl::dim_t roundUpToBlock(dnnl::impl::dim_t value, int block) {
+    OPENVINO_ASSERT(block > 0, "Unsupported ACL weight format block size: ", block);
+    return ((value + block - 1) / block) * block;  // integer ceil-division, then scale back
+}
+// Stores `value` as the stride of dimension `dim`, raising OPENVINO_THROW if it exceeds the uint32_t range.
+void setAclStride(arm_compute::Strides& strides, size_t dim, size_t value) {
+    if (value > std::numeric_limits<uint32_t>::max()) {  // presumably Strides holds uint32_t values - confirm
+        OPENVINO_THROW("ACL weight format stride is too large: ", value);
+    }
+    strides.set(dim, value);
+}
+// Rewrites `md` as a blocked descriptor for `weightFormat` and re-initializes `info` with matching byte strides.
+void reorderToAclFcWeightFormat(arm_compute::TensorInfo& info,
+                                dnnl::impl::memory_desc_t& md,
+                                arm_compute::WeightFormat weightFormat,
+                                dnnl::impl::dim_t inputDim,
+                                dnnl::impl::dim_t outputDim) {
+    md.format_kind = dnnl::impl::format_kind::blocked;
+    md.format_desc.blocking = dnnl::impl::blocking_desc_t{};  // reset any previous blocking description
+    // Both factors below are derived from the requested ACL weight format.
+    const int interleavedBy = arm_compute::interleave_by(weightFormat);
+    const int blockBy = arm_compute::block_by(weightFormat);
+    // inputDim: stride is one interleaved block; its padded extent is a multiple of blockBy.
+    md.format_desc.blocking.strides[inputDim] = interleavedBy * blockBy;
+    md.padded_dims[inputDim] = roundUpToBlock(md.dims[inputDim], blockBy);
+    // outputDim: stride (ldb) is interleavedBy times the padded input extent; padded to interleavedBy.
+    const dnnl::impl::dim_t ldb = interleavedBy * md.padded_dims[inputDim];
+    md.format_desc.blocking.strides[outputDim] = ldb;
+    md.padded_dims[outputDim] = roundUpToBlock(md.dims[outputDim], interleavedBy);
+    // Element count of one padded weight matrix; used below as the stride of dimension 2.
+    const dnnl::impl::dim_t innermostBatchStride = md.padded_dims[inputDim] * md.padded_dims[outputDim];
+    // Inner blocks are described only when the format interleaves; blockBy > 1 adds a second inner block.
+    if (interleavedBy > 1) {
+        md.format_desc.blocking.inner_nblks = 1 + static_cast<int>(blockBy > 1);
+        md.format_desc.blocking.inner_idxs[0] = outputDim;
+        md.format_desc.blocking.inner_blks[0] = interleavedBy;
+        if (blockBy > 1) {
+            md.format_desc.blocking.inner_idxs[1] = inputDim;
+            md.format_desc.blocking.inner_blks[1] = blockBy;
+        }
+    }
+    // Fast-math fixed formats switch both the oneDNN descriptor and the ACL tensor info to bf16.
+    if (arm_compute::is_fixed_format_fast_math(weightFormat)) {
+        md.data_type = dnnl_bf16;
+        info.set_data_type(arm_compute::DataType::BFLOAT16);
+    }
+    // NOTE(review): layout is presumably defined by the explicit strides set below - confirm.
+    info.set_data_layout(arm_compute::DataLayout::UNKNOWN);
+    // Start from the current strides and override dimensions 1 and 2 with range-checked byte strides.
+    arm_compute::Strides newStridesInBytes = info.strides_in_bytes();
+    setAclStride(newStridesInBytes, 1, static_cast<size_t>(ldb) * info.element_size());
+    setAclStride(newStridesInBytes, 2, static_cast<size_t>(innermostBatchStride) * info.element_size());
+    // Re-init with the existing shape/channels/type/offset, the new strides and the size computed from md.
+    info.init(info.tensor_shape(),
+              info.num_channels(),
+              info.data_type(),
+              newStridesInBytes,
+              info.offset_first_element_in_bytes(),
+              dnnl::impl::memory_desc_wrapper(md).size());
+}
+
+}  // namespace
+
 namespace ov::intel_cpu {
 
 VectorDims acl_fc_executor::makeDummyInputDims(const Shape& inShape, const Shape& wShape) {
@@ -284,15 +352,8 @@ MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs& memory,
     if (isNeededReorder) {
         dnnl::impl::dim_t o_dim = 0;
         dnnl::impl::dim_t inner_dim = 1;
-        std::vector<dnnl::impl::dim_t> remaining_dims = {};
         auto* weights_md_ = dnnlDstDesc->getDnnlDesc().get();
-        dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo,
-                                                                  *weights_md_,
-                                                                  expectedWeightFormat,
-                                                                  inner_dim,
-                                                                  o_dim,
-                                                                  remaining_dims,
-                                                                  {});
+        reorderToAclFcWeightFormat(weiTensorInfo, *weights_md_, expectedWeightFormat, inner_dim, o_dim);
         if (aclfcAttrs.weightsNonTransposed) {
             dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc);
         }

@@ -61,7 +61,7 @@ static const TypeMapping dnnlConvTypeMapping {
     {{_f16, _bf16, _any, _any},                               {bypass(), bypass(), use<0>(), use<0>()}},
     // quantization configuration is not applicable for ARM
     // because there is the dedicated low-precision implementation for ARM
-#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_ARM)
+#if !defined(OPENVINO_ARCH_ARM64)
     // int8 conv does not support f16 output and bias
     {{_u8 | _i8, _i8,  _quant |_bf16 | _f32 | _i32 | _dynamic,  _quant | _bf16 | _f32 | _i32 | _dynamic}, {bypass(), bypass(), bypass(),  bypass()}},
     {{_u8 | _i8, _i8, _f16, _u8 | _i8 | _i32 | _bf16 | _f32}, {bypass(), bypass(), just<f32>(), bypass()}},

@@ -245,7 +245,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs,
     }
     // by default fp16 matmul ACL kernels accumulate into fp32
     // the default behaviour is changed by using f16 accumulator to improve performance
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+#if defined(OPENVINO_ARCH_ARM64)
     if (srcDesc->getPrecision() == ov::element::f16 && weiDesc->getPrecision() == ov::element::f16 &&
         dstDesc->getPrecision() == ov::element::f16) {
         primAttrs.attr.set_accumulation_mode(dnnl::accumulation_mode::f16);