From 0959856a8cfaad774c2160b0b8377e0fd6343445 Mon Sep 17 00:00:00 2001 From: Nikolai Shchegolev Date: Fri, 3 Feb 2023 14:54:49 +0400 Subject: [PATCH] [CPU] I64 native support. --- .../src/transformations/convert_precision.cpp | 12 +- .../include/ngraph/runtime/reference/mvn.hpp | 1 - .../ngraph/runtime/reference/reduce_l1.hpp | 7 +- .../ngraph/runtime/reference/reduce_l2.hpp | 2 +- src/core/src/op/reduce_l1.cpp | 2 + src/core/src/op/reduce_l2.cpp | 4 + .../interface/ie_internal_plugin_config.hpp | 5 + src/plugins/intel_cpu/src/config.cpp | 11 +- src/plugins/intel_cpu/src/config.h | 1 + .../intel_cpu/src/dnnl_extension_utils.cpp | 56 +- .../src/emitters/x64/jit_eltwise_emitters.cpp | 1638 +++++++----- .../src/emitters/x64/jit_eltwise_emitters.hpp | 202 +- .../src/emitters/x64/jit_emitter.cpp | 19 +- .../src/emitters/x64/jit_emitter.hpp | 54 +- .../emitters/x64/jit_snippets_emitters.cpp | 4 + src/plugins/intel_cpu/src/graph.cpp | 25 +- src/plugins/intel_cpu/src/graph_optimizer.cpp | 1 - src/plugins/intel_cpu/src/node.cpp | 16 +- src/plugins/intel_cpu/src/node.h | 16 +- src/plugins/intel_cpu/src/nodes/broadcast.cpp | 111 +- src/plugins/intel_cpu/src/nodes/broadcast.h | 8 +- .../src/nodes/common/cpu_convert.cpp | 7 +- .../src/nodes/common/tile_broadcast_utils.cpp | 14 +- src/plugins/intel_cpu/src/nodes/concat.cpp | 47 +- src/plugins/intel_cpu/src/nodes/concat.h | 8 +- src/plugins/intel_cpu/src/nodes/convert.cpp | 45 +- src/plugins/intel_cpu/src/nodes/convert.h | 6 +- src/plugins/intel_cpu/src/nodes/cum_sum.cpp | 20 +- src/plugins/intel_cpu/src/nodes/cum_sum.h | 5 +- src/plugins/intel_cpu/src/nodes/def_conv.cpp | 11 +- src/plugins/intel_cpu/src/nodes/eltwise.cpp | 414 ++- src/plugins/intel_cpu/src/nodes/eltwise.h | 7 +- .../executors/common/ref_opt_transpose.cpp | 3 +- src/plugins/intel_cpu/src/nodes/eye.cpp | 3 +- src/plugins/intel_cpu/src/nodes/gather.cpp | 139 +- src/plugins/intel_cpu/src/nodes/gather.h | 16 +- src/plugins/intel_cpu/src/nodes/gather_nd.cpp | 168 +- 
src/plugins/intel_cpu/src/nodes/gather_nd.h | 8 +- .../intel_cpu/src/nodes/grid_sample.cpp | 4 +- .../intel_cpu/src/nodes/grid_sample.hpp | 6 +- src/plugins/intel_cpu/src/nodes/input.cpp | 12 +- src/plugins/intel_cpu/src/nodes/input.h | 6 +- .../nodes/kernels/x64/gather_uni_kernel.cpp | 3 + .../nodes/kernels/x64/gather_uni_kernel.hpp | 2 + .../src/nodes/kernels/x64/grid_sample.cpp | 21 +- .../src/nodes/kernels/x64/grid_sample.hpp | 21 +- .../src/nodes/kernels/x64/jit_kernel_base.cpp | 1072 ++++++-- .../src/nodes/kernels/x64/jit_kernel_base.hpp | 284 ++- .../src/nodes/kernels/x64/reduce.cpp | 1915 ++++++++++++++ .../src/nodes/kernels/x64/reduce.hpp | 246 ++ .../src/nodes/kernels/x64/registers_pool.hpp | 5 +- .../intel_cpu/src/nodes/mathematics.cpp | 60 +- src/plugins/intel_cpu/src/nodes/mathematics.h | 6 +- src/plugins/intel_cpu/src/nodes/non_zero.cpp | 34 +- src/plugins/intel_cpu/src/nodes/non_zero.h | 5 +- src/plugins/intel_cpu/src/nodes/one_hot.cpp | 101 +- src/plugins/intel_cpu/src/nodes/one_hot.h | 16 +- src/plugins/intel_cpu/src/nodes/pooling.cpp | 4 +- src/plugins/intel_cpu/src/nodes/range.cpp | 14 +- src/plugins/intel_cpu/src/nodes/range.h | 2 +- src/plugins/intel_cpu/src/nodes/reduce.cpp | 2239 +++-------------- src/plugins/intel_cpu/src/nodes/reduce.h | 117 +- src/plugins/intel_cpu/src/nodes/reference.cpp | 9 +- src/plugins/intel_cpu/src/nodes/reference.h | 4 +- src/plugins/intel_cpu/src/nodes/reorder.cpp | 4 +- src/plugins/intel_cpu/src/nodes/reorder.h | 4 +- src/plugins/intel_cpu/src/nodes/reshape.cpp | 82 +- src/plugins/intel_cpu/src/nodes/reshape.h | 16 +- src/plugins/intel_cpu/src/nodes/rnn.cpp | 10 +- .../intel_cpu/src/nodes/scatter_update.cpp | 28 +- .../intel_cpu/src/nodes/scatter_update.h | 4 +- src/plugins/intel_cpu/src/nodes/shapeof.cpp | 55 +- src/plugins/intel_cpu/src/nodes/shapeof.h | 5 +- .../intel_cpu/src/nodes/shuffle_channels.cpp | 5 +- src/plugins/intel_cpu/src/nodes/split.cpp | 60 +- src/plugins/intel_cpu/src/nodes/split.h | 4 +- 
.../intel_cpu/src/nodes/strided_slice.cpp | 18 +- .../intel_cpu/src/nodes/strided_slice.h | 2 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 46 +- src/plugins/intel_cpu/src/nodes/subgraph.h | 11 +- .../intel_cpu/src/nodes/tensoriterator.cpp | 2 +- src/plugins/intel_cpu/src/nodes/tile.cpp | 33 +- src/plugins/intel_cpu/src/nodes/tile.h | 4 +- src/plugins/intel_cpu/src/nodes/topk.cpp | 126 +- src/plugins/intel_cpu/src/nodes/topk.h | 15 +- src/plugins/intel_cpu/src/nodes/transpose.cpp | 56 +- src/plugins/intel_cpu/src/nodes/transpose.h | 5 +- src/plugins/intel_cpu/src/nodes/unique.cpp | 193 +- src/plugins/intel_cpu/src/nodes/unique.hpp | 9 +- src/plugins/intel_cpu/src/plugin.cpp | 26 +- .../convert_to_cpu_specific_opset.hpp | 12 +- .../x64/pass/convert_precision_i64_i32.cpp | 155 ++ .../x64/pass/convert_precision_i64_i32.hpp | 21 + .../transformation_pipeline.cpp | 40 +- .../transformations/transformation_pipeline.h | 2 +- src/plugins/intel_cpu/src/utils/blob_dump.cpp | 12 + src/plugins/intel_cpu/src/utils/cpu_utils.hpp | 11 +- .../single_layer_tests/comparison.cpp | 48 +- .../single_layer_tests/concat.cpp | 7 +- .../single_layer_tests/eltwise.cpp | 2 +- .../single_layer_tests/minimum_maximum.cpp | 51 - .../non_max_suppression.cpp | 15 + .../single_layer_tests/range.cpp | 3 +- .../single_layer_tests/reduce_ops.cpp | 4 +- .../single_layer_tests/reshape.cpp | 3 +- .../single_layer_tests/scatter_ND_update.cpp | 1 + .../scatter_elements_update.cpp | 1 + .../single_layer_tests/scatter_update.cpp | 2 +- .../single_layer_tests/select.cpp | 4 +- .../single_layer_tests/squeeze_unsqueeze.cpp | 3 +- .../single_layer_tests/tile.cpp | 3 +- .../skip_tests_config.cpp | 7 +- .../single_layer_tests/broadcast.cpp | 123 +- .../single_layer_tests/classes/activation.cpp | 68 +- .../single_layer_tests/classes/activation.hpp | 16 +- .../single_layer_tests/classes/conversion.cpp | 93 +- .../single_layer_tests/classes/conversion.hpp | 34 +- .../single_layer_tests/classes/eltwise.cpp | 48 +- 
.../single_layer_tests/classes/reduce.cpp | 106 +- .../single_layer_tests/classes/reduce.hpp | 37 +- .../single_layer_tests/classes/transpose.cpp | 43 +- .../single_layer_tests/classes/transpose.hpp | 30 +- .../functional/single_layer_tests/concat.cpp | 133 +- .../functional/single_layer_tests/cum_sum.cpp | 121 +- .../functional/single_layer_tests/gather.cpp | 190 +- .../single_layer_tests/gather_nd.cpp | 106 +- .../instances/common/activation.cpp | 22 +- .../instances/common/conversion.cpp | 8 +- .../instances/common/reduce.cpp | 59 +- .../instances/common/transpose.cpp | 20 +- .../instances/x64/activation.cpp | 107 +- .../instances/x64/conversion.cpp | 61 +- .../instances/x64/eltwise.cpp | 51 + .../instances/x64/reduce.cpp | 453 +++- .../instances/x64/transpose.cpp | 93 +- .../single_layer_tests/minimum_maximum.cpp | 170 ++ .../non_max_suppression.cpp | 53 +- .../functional/single_layer_tests/one_hot.cpp | 169 +- .../single_layer_tests/scatter_ND_update.cpp | 1 + .../scatter_elements_update.cpp | 1 + .../single_layer_tests/scatter_update.cpp | 59 +- .../functional/single_layer_tests/split.cpp | 169 +- .../single_layer_tests/strided_slice.cpp | 101 +- .../functional/single_layer_tests/tile.cpp | 115 +- .../functional/single_layer_tests/topk.cpp | 49 +- .../functional/single_layer_tests/unique.cpp | 52 +- .../functional/test_utils/cpu_test_utils.cpp | 76 +- .../functional/test_utils/cpu_test_utils.hpp | 17 +- src/plugins/intel_cpu/thirdparty/onednn | 2 +- src/plugins/template/backend/ops/reduce.cpp | 74 + .../template/backend/opset_int_tbl.hpp | 2 + .../src/non_max_suppression.cpp | 10 +- 152 files changed, 8732 insertions(+), 4964 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp create mode 100644 
src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp delete mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/minimum_maximum.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/single_layer_tests/minimum_maximum.cpp create mode 100644 src/plugins/template/backend/ops/reduce.cpp diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index b3b474c3b989ed..88fcc0e899e0ea 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -871,6 +871,14 @@ inline int32_t convert_value(uint32_t val) { return static_cast(val); } +template <> +inline int64_t convert_value(uint64_t val) { + if (val >= static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } + return static_cast(val); +} + namespace { template std::shared_ptr change_constant_precision(std::shared_ptr& constant) { @@ -1110,7 +1118,9 @@ bool fuse_type_to_constant(const std::shared_ptr& node, const auto& to = it->second; if (auto constant = ov::as_type_ptr(node)) { std::shared_ptr new_const; - if (from == ov::element::u64 && to == ov::element::i32) { + if (from == ov::element::u64 && to == ov::element::i64) { + new_const = change_constant_precision(constant); + } else if (from == ov::element::u64 && to == ov::element::i32) { new_const = change_constant_precision(constant); } else if (from == ov::element::i64 && to == ov::element::i32) { new_const = change_constant_precision(constant); diff --git a/src/core/reference/include/ngraph/runtime/reference/mvn.hpp b/src/core/reference/include/ngraph/runtime/reference/mvn.hpp index 7ffc557b185cc7..89ddd2d27c5484 100644 --- a/src/core/reference/include/ngraph/runtime/reference/mvn.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/mvn.hpp @@ -12,7 +12,6 @@ 
#include #include #include -#include #include namespace ngraph { diff --git a/src/core/reference/include/ngraph/runtime/reference/reduce_l1.hpp b/src/core/reference/include/ngraph/runtime/reference/reduce_l1.hpp index 37477aa7e727f7..9ae28cf3e712f9 100644 --- a/src/core/reference/include/ngraph/runtime/reference/reduce_l1.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/reduce_l1.hpp @@ -32,7 +32,12 @@ void reduce_l1(const T* arg, T* out, const Shape& in_shape, const AxisSet& reduc const size_t out_idx = std::inner_product(output_coord.begin(), output_coord.end(), out_strides.begin(), uint64_t(0)); - out[out_idx] = out[out_idx] + std::abs(arg[in_idx]); + // WA for abs function, due to it's not defined for some data types. + auto val = arg[in_idx]; + if (val < T(0)) { + val *= T(-1); + } + out[out_idx] = out[out_idx] + val; } OPENVINO_SUPPRESS_DEPRECATED_END } diff --git a/src/core/reference/include/ngraph/runtime/reference/reduce_l2.hpp b/src/core/reference/include/ngraph/runtime/reference/reduce_l2.hpp index c338f340be8958..21918c9f5f010e 100644 --- a/src/core/reference/include/ngraph/runtime/reference/reduce_l2.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/reduce_l2.hpp @@ -35,7 +35,7 @@ void reduce_l2(const T* arg, T* out, const Shape& in_shape, const AxisSet& reduc out[out_idx] = out[out_idx] + arg[in_idx] * arg[in_idx]; } std::transform(out, out + shape_size(out_shape), out, [](T elem) { - return sqrt(elem); + return static_cast(std::sqrt(static_cast(elem))); }); OPENVINO_SUPPRESS_DEPRECATED_END } diff --git a/src/core/src/op/reduce_l1.cpp b/src/core/src/op/reduce_l1.cpp index 74d522a47f869d..8f6a88f0347363 100644 --- a/src/core/src/op/reduce_l1.cpp +++ b/src/core/src/op/reduce_l1.cpp @@ -43,6 +43,7 @@ bool evaluate_sum(const HostTensorPtr& arg, const HostTensorPtr& out, const Axis switch (arg->get_element_type()) { NGRAPH_TYPE_CASE(evaluate_reducel1_sum, i32, arg, out, axes, keep_dims); NGRAPH_TYPE_CASE(evaluate_reducel1_sum, i64, 
arg, out, axes, keep_dims); + NGRAPH_TYPE_CASE(evaluate_reducel1_sum, u64, arg, out, axes, keep_dims); NGRAPH_TYPE_CASE(evaluate_reducel1_sum, bf16, arg, out, axes, keep_dims); NGRAPH_TYPE_CASE(evaluate_reducel1_sum, f16, arg, out, axes, keep_dims); NGRAPH_TYPE_CASE(evaluate_reducel1_sum, f32, arg, out, axes, keep_dims); @@ -73,6 +74,7 @@ bool op::v4::ReduceL1::has_evaluate() const { switch (get_input_element_type(0)) { case ngraph::element::i32: case ngraph::element::i64: + case ngraph::element::u64: case ngraph::element::bf16: case ngraph::element::f16: case ngraph::element::f32: diff --git a/src/core/src/op/reduce_l2.cpp b/src/core/src/op/reduce_l2.cpp index a56160415de1df..1f5e33c1c81154 100644 --- a/src/core/src/op/reduce_l2.cpp +++ b/src/core/src/op/reduce_l2.cpp @@ -44,6 +44,8 @@ bool evaluate_reduce_l2(const HostTensorPtr& arg, const HostTensorPtr& out, cons NGRAPH_TYPE_CASE(evaluate_reduce_l2, bf16, arg, out, axes, keep_dims); NGRAPH_TYPE_CASE(evaluate_reduce_l2, f16, arg, out, axes, keep_dims); NGRAPH_TYPE_CASE(evaluate_reduce_l2, f32, arg, out, axes, keep_dims); + NGRAPH_TYPE_CASE(evaluate_reduce_l2, i64, arg, out, axes, keep_dims); + NGRAPH_TYPE_CASE(evaluate_reduce_l2, u64, arg, out, axes, keep_dims); default: rc = false; break; @@ -72,6 +74,8 @@ bool op::v4::ReduceL2::has_evaluate() const { case ngraph::element::bf16: case ngraph::element::f16: case ngraph::element::f32: + case ngraph::element::i64: + case ngraph::element::u64: return true; default: break; diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp index eeac793acc7dcc..4af3d785d0fd41 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp @@ -110,6 +110,11 @@ INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(ENABLE); INFERENCE_ENGINE_1_0_DEPRECATED 
DECLARE_CONFIG_VALUE(IGNORE_CALLBACK); INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(DISABLE); +/** + * @brief Enables inference with INT64 data type in CPU plugin if it's presented in the original model. + */ +DECLARE_CONFIG_KEY(CPU_NATIVE_I64); + } // namespace PluginConfigInternalParams } // namespace InferenceEngine diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index a1a4eac265b3f7..245ac784619297 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -230,6 +230,15 @@ void Config::readProperties(const std::map &prop) { IE_THROW() << "Wrong value for property key " << ov::hint::execution_mode.name() << ". Supported values: PERFORMANCE, ACCURACY"; } + } else if (key == PluginConfigInternalParams::KEY_CPU_NATIVE_I64) { + if (val == PluginConfigParams::YES) { + enableNativeI64 = true; + } else if (val == PluginConfigParams::NO) { + enableNativeI64 = false; + } else { + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_CPU_NATIVE_I64 << ": " << val + << ". 
Expected only YES or NO values."; + } } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } @@ -314,4 +323,4 @@ void Config::updateProperties() { } } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 4be16563c8991c..4e74e086252780 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -57,6 +57,7 @@ struct Config { // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives size_t rtCacheCapacity = 0ul; #endif + bool enableNativeI64 = false; InferenceEngine::IStreamsExecutor::Config streamExecutorConfig; InferenceEngine::PerfHintsConfig perfHintsConfig; bool enableCpuPinning = true; diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 1cef0551d1eb08..0146c0cfa7b9af 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -4,45 +4,43 @@ #include "dnnl_extension_utils.h" -#include "utils/general_utils.h" #include #include "memory_desc/dnnl_blocked_memory_desc.h" -#include "onednn/iml_type_mapper.h" -#include #include -#include - using namespace dnnl; namespace ov { namespace intel_cpu { -uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { +uint8_t DnnlExtensionUtils::sizeOfDataType(memory::data_type dataType) { switch (dataType) { - case dnnl::memory::data_type::f32: - return 4; - case dnnl::memory::data_type::s32: + case memory::data_type::f64: + case memory::data_type::s64: + return 8; + case memory::data_type::f32: + case memory::data_type::s32: return 4; - case dnnl::memory::data_type::bf16: + case memory::data_type::bf16: + case memory::data_type::f16: return 2; - case dnnl::memory::data_type::s8: - return 1; - case dnnl::memory::data_type::u8: + case memory::data_type::s8: + case memory::data_type::u8: + case 
memory::data_type::bin: return 1; - case dnnl::memory::data_type::bin: - return 1; - case dnnl::memory::data_type::f16: - return 2; - case dnnl::memory::data_type::undef: + case memory::data_type::undef: return 0; default: - IE_THROW() << "Unsupported data type."; + IE_THROW() << "Unsupported data type: " << DataTypeToIEPrecision(dataType); } } memory::data_type DnnlExtensionUtils::IEPrecisionToDataType(const InferenceEngine::Precision& prec) { switch (prec) { + case InferenceEngine::Precision::FP64: + return memory::data_type::f64; + case InferenceEngine::Precision::I64: + return memory::data_type::s64; case InferenceEngine::Precision::FP32: return memory::data_type::f32; case InferenceEngine::Precision::I32: @@ -68,6 +66,10 @@ memory::data_type DnnlExtensionUtils::IEPrecisionToDataType(const InferenceEngin InferenceEngine::Precision DnnlExtensionUtils::DataTypeToIEPrecision(memory::data_type dataType) { switch (dataType) { + case memory::data_type::f64: + return InferenceEngine::Precision::FP64; + case memory::data_type::s64: + return InferenceEngine::Precision::I64; case memory::data_type::f32: return InferenceEngine::Precision::FP32; case memory::data_type::s32: @@ -90,11 +92,11 @@ InferenceEngine::Precision DnnlExtensionUtils::DataTypeToIEPrecision(memory::dat } } -Dim DnnlExtensionUtils::convertToDim(const dnnl::memory::dim &dim) { +Dim DnnlExtensionUtils::convertToDim(const memory::dim &dim) { return dim == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : static_cast(dim); } -dnnl::memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim &dim) { - return dim == Shape::UNDEFINED_DIM ? DNNL_RUNTIME_DIM_VAL : static_cast(dim); +memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim &dim) { + return dim == Shape::UNDEFINED_DIM ? 
DNNL_RUNTIME_DIM_VAL : static_cast(dim); } VectorDims DnnlExtensionUtils::convertToVectorDims(const memory::dims& dims) { @@ -133,19 +135,19 @@ memory::format_tag DnnlExtensionUtils::GetPlainFormatByRank(size_t rank) { } } -DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc &desc) { +DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const memory::desc &desc) { return makeDescriptor(desc.get()); } DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const_dnnl_memory_desc_t desc) { - if (desc->format_kind == dnnl::impl::format_kind_t::dnnl_blocked) { + if (desc->format_kind == impl::format_kind_t::dnnl_blocked) { return std::shared_ptr(new DnnlBlockedMemoryDesc(desc)); } else { return std::shared_ptr(new DnnlMemoryDesc(desc)); } } -size_t DnnlExtensionUtils::getMemSizeForDnnlDesc(const dnnl::memory::desc& desc) { +size_t DnnlExtensionUtils::getMemSizeForDnnlDesc(const memory::desc& desc) { auto tmpDesc = desc; const auto offset0 = tmpDesc.get()->offset0; @@ -167,8 +169,8 @@ std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(con } } -DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx) { - auto query = dnnl::convert_to_c(what); +DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, const query& what, int idx) { + auto query = convert_to_c(what); const auto* cdesc = dnnl_primitive_desc_query_md(pd, query, idx); if (!cdesc) diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.cpp index 0ba374b68b93be..02bf09290e37c1 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.cpp @@ -36,9 +36,9 @@ InferenceEngine::Precision get_arithmetic_binary_exec_precision(const std::share /// ADD /// jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, 
const std::shared_ptr& node) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_add_emitter::get_inputs_num() const { return 2; } @@ -50,42 +50,34 @@ void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const st } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_add_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - auto uni_vadd = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break; - case Precision::I32: h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); break; - default: assert(!"unsupported precision"); - } - }; - - if (isa == x64::sse41) { - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vadd(vmm_dst, vmm_dst, vmm_src1); - } else { - uni_vadd(vmm_dst, vmm_src0, vmm_src1); + switch (exec_prc_) { + case Precision::FP32: h->uni_vaddps(vmm_dst, vmm_src_0, vmm_src_1); break; + case Precision::I32: h->uni_vpaddd(vmm_dst, vmm_src_0, vmm_src_1); break; + case Precision::I64: h->uni_vpaddq(vmm_dst, vmm_src_0, vmm_src_1); break; + default: IE_THROW() << "jit_add_emitter doesn't support precision '" << exec_prc_ << "'"; 
} } -std::set> jit_add_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_add_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } /// MUL_ADD /// jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } @@ -97,86 +89,139 @@ void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, cons } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_mul_add_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); - Vmm vmm_src2 = Vmm(in_vec_idxs[2]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_2 = Vmm(in_vec_idxs[2]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - auto uni_vfmadd231_xmm = [this](Xmm vmm_dst, Xmm vmm_src0, Xmm vmm_src1, Xmm vmm_src2) { - h->uni_vmovups(vmm_dst, vmm_src0); + auto uni_madd_xmm = [this](const Xmm &vmm_dst, const Xmm 
&vmm_src_0, const Xmm &vmm_src_1, const Xmm &vmm_src_2) { switch (exec_prc_) { case Precision::FP32: { - h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1); - h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); + if (vmm_dst.getIdx() == vmm_src_1.getIdx()) { + h->uni_vmulps(vmm_dst, vmm_src_1, vmm_src_0); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src_2); + } else if (vmm_dst.getIdx() == vmm_src_2.getIdx()) { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + h->uni_vmulps(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_aux_0); + } else { + h->uni_vmulps(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src_2); + } } break; case Precision::I32: { - h->uni_vpmulld(vmm_dst, vmm_dst, vmm_src1); - h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); + if (vmm_dst.getIdx() == vmm_src_1.getIdx()) { + h->uni_vpmulld(vmm_dst, vmm_src_1, vmm_src_0); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src_2); + } else if (vmm_dst.getIdx() == vmm_src_2.getIdx()) { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + h->uni_vpmulld(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_aux_0); + } else { + h->uni_vpmulld(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src_2); + } + } break; + case Precision::I64: { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); + // There is no multiply int64 instruction on AVX2 and SSE41, thus the WA is used. 
+ // vmm_src_0 = ab; vmm_src_1 = cd; + h->uni_vpsrlq(vmm_aux_0, vmm_src_0, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_aux_0, vmm_src_1); // a * d + h->uni_vpsrlq(vmm_aux_1, vmm_src_1, 32); + h->uni_vpmuludq(vmm_aux_1, vmm_aux_1, vmm_src_0); // b * c + h->uni_vpaddq(vmm_aux_1, vmm_aux_1, vmm_aux_0); // a * d + b * c + h->uni_vpsllq(vmm_aux_1, vmm_aux_1, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_src_0, vmm_src_1); // b * d + h->uni_vpaddq(vmm_aux_0, vmm_aux_0, vmm_aux_1); // (a * d + b * c) << 32 + b * d + + h->uni_vpaddq(vmm_dst, vmm_src_2, vmm_aux_0); } break; - default: assert(!"unsupported precision"); + default: IE_THROW() << "jit_mul_add_emitter doesn't support precision '" << exec_prc_ << "'"; } }; - auto uni_vfmadd231_vmm = [this, vmm_aux0](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1, Vmm vmm_src2) { + auto uni_madd_vmm = [this](const Vmm &vmm_dst, const Vmm &vmm_src_0, const Vmm &vmm_src_1, const Vmm &vmm_src_2) { switch (exec_prc_) { case Precision::FP32: { - Vmm vmm_mul0; - if (vmm_dst.getIdx() == vmm_src0.getIdx()) { - h->uni_vmovups(vmm_aux0, vmm_src0); - vmm_mul0 = vmm_aux0; + if (vmm_dst.getIdx() == vmm_src_0.getIdx()) { + h->uni_vfmadd132ps(vmm_src_0, vmm_src_2, vmm_src_1); + } else if (vmm_dst.getIdx() == vmm_src_1.getIdx()) { + h->uni_vfmadd132ps(vmm_src_1, vmm_src_2, vmm_src_0); + } else if (vmm_dst.getIdx() == vmm_src_2.getIdx()) { + h->uni_vfmadd231ps(vmm_src_2, vmm_src_0, vmm_src_1); } else { - vmm_mul0 = vmm_src0; + h->uni_vmovups(vmm_dst, vmm_src_2); + h->uni_vfmadd231ps(vmm_dst, vmm_src_0, vmm_src_1); } - - Vmm vmm_mul1; - if (vmm_dst.getIdx() == vmm_src1.getIdx()) { - h->uni_vmovups(vmm_aux0, vmm_src1); - vmm_mul1 = vmm_aux0; + } break; + case Precision::I32: { + if (vmm_dst.getIdx() == vmm_src_2.getIdx()) { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + h->uni_vpmulld(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_aux_0); } else { - vmm_mul1 = vmm_src1; + h->uni_vpmulld(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vpaddd(vmm_dst, 
vmm_dst, vmm_src_2); } - - if (vmm_dst.getIdx() != vmm_src2.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src2); - - h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1); } break; - case Precision::I32: { - h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); - h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); + case Precision::I64: { + if (isa == x64::avx512_core) { + h->vpmullq(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vpaddq(vmm_dst, vmm_dst, vmm_src_2); + } else { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); + // There is no multiply int64 instruction on AVX2 and SSE41, thus the WA is used. + // vmm_src_0 = ab; vmm_src_1 = cd; + h->uni_vpsrlq(vmm_aux_0, vmm_src_0, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_aux_0, vmm_src_1); // a * d + h->uni_vpsrlq(vmm_aux_1, vmm_src_1, 32); + h->uni_vpmuludq(vmm_aux_1, vmm_aux_1, vmm_src_0); // b * c + h->uni_vpaddq(vmm_aux_1, vmm_aux_1, vmm_aux_0); // a * d + b * c + h->uni_vpsllq(vmm_aux_1, vmm_aux_1, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_src_0, vmm_src_1); // b * d + h->uni_vpaddq(vmm_aux_0, vmm_aux_0, vmm_aux_1); // (a * d + b * c) << 32 + b * d + + h->uni_vpaddq(vmm_dst, vmm_aux_0, vmm_src_2); + } } break; - default: assert(!"unsupported precision"); + default: IE_THROW() << "jit_mul_add_emitter doesn't support precision '" << exec_prc_ << "'"; } }; if (isa == x64::sse41) { - uni_vfmadd231_xmm(vmm_dst, vmm_src0, vmm_src1, vmm_src2); + uni_madd_xmm(vmm_dst, vmm_src_0, vmm_src_1, vmm_src_2); } else { - uni_vfmadd231_vmm(vmm_dst, vmm_src0, vmm_src1, vmm_src2); + uni_madd_vmm(vmm_dst, vmm_src_0, vmm_src_1, vmm_src_2); } } -size_t jit_mul_add_emitter::aux_vecs_count() const { - return 1; +std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32, element::f32}, {element::i32, element::i32, element::i32}, {element::i64, element::i64, element::i64}}; } -std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { - return 
{{element::f32, element::f32, element::f32}, {element::i32, element::i32, element::i32}}; +size_t jit_mul_add_emitter::aux_vecs_count() const { + if (!x64::mayiuse(x64::avx512_core) && exec_prc_ == Precision::I64) { + return 2; + } else { + return 0; + } } /// SUB /// jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_subtract_emitter::get_inputs_num() const { return 2; } @@ -188,42 +233,34 @@ void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, con } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_subtract_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - auto uni_vsub = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); break; - case Precision::I32: h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); break; - default: assert(!"unsupported precision"); - } - }; - - if (isa == x64::sse41) { - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vsub(vmm_dst, vmm_dst, 
vmm_src1); - } else { - uni_vsub(vmm_dst, vmm_src0, vmm_src1); + switch (exec_prc_) { + case Precision::FP32: h->uni_vsubps(vmm_dst, vmm_src_0, vmm_src_1); break; + case Precision::I32: h->uni_vpsubd(vmm_dst, vmm_src_0, vmm_src_1); break; + case Precision::I64: h->uni_vpsubq(vmm_dst, vmm_src_0, vmm_src_1); break; + default: IE_THROW() << "jit_subtract_emitter doesn't support precision '" << exec_prc_ << "'"; } } -std::set> jit_subtract_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_subtract_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } /// MULTIPLY /// jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_multiply_emitter::get_inputs_num() const { return 2; } @@ -235,44 +272,82 @@ void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, con } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_multiply_emitter doesn't support ISA '" << host_isa_ << "'"; } } template -void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { 
using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); - auto uni_vmul = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break; - case Precision::I32: h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); break; - default: assert(!"unsupported precision"); - } - }; - - if (isa == x64::sse41) { - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vmul(vmm_dst, vmm_dst, vmm_src1); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Operand op_src_1; + if (in_vec_idxs.size() > 1) { + op_src_1 = Vmm(in_vec_idxs[1]); + } else if (aux_gpr_idxs.size() > 0) { + op_src_1 = h->ptr[Reg64(aux_gpr_idxs[0])]; } else { - uni_vmul(vmm_dst, vmm_src0, vmm_src1); + IE_THROW() << "jit_multiply_emitter has invalid inputs number."; + } + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + switch (exec_prc_) { + case Precision::FP32: h->uni_vmulps(vmm_dst, vmm_src_0, op_src_1); break; + case Precision::I32: h->uni_vpmulld(vmm_dst, vmm_src_0, op_src_1); break; + case Precision::I64: { + if (isa == x64::avx512_core) { + h->vpmullq(vmm_dst, vmm_src_0, op_src_1); + } else { + if (aux_vec_idxs.size() < 2) { + IE_THROW() << "jit_multiply_emitter has invalid number of aux vectors."; + } + auto vmm_aux_0 = Vmm(aux_vec_idxs[0]); + auto vmm_aux_1 = Vmm(aux_vec_idxs[1]); + // There is no multiply int64 instruction on AVX2 and SSE41, thus the WA is used. 
+ // Represent inputs as vmm_src_0 -> AB and op_src_1 -> CD + h->uni_vpsrlq(vmm_aux_0, vmm_src_0, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_aux_0, op_src_1); // A * D + if (!op_src_1.isMEM() && vmm_src_0.getIdx() == op_src_1.getIdx()) { // Optimization for the case of src ^ 2 + h->uni_vpaddq(vmm_aux_1, vmm_aux_0, vmm_aux_0); // A * B + A * B + } else { + h->uni_vpsrlq(vmm_aux_1, op_src_1, 32); + h->uni_vpmuludq(vmm_aux_1, vmm_aux_1, vmm_src_0); // B * C + h->uni_vpaddq(vmm_aux_1, vmm_aux_1, vmm_aux_0); // A * D + B * C + } + h->uni_vpsllq(vmm_aux_1, vmm_aux_1, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_src_0, op_src_1); // B * D + h->uni_vpaddq(vmm_dst, vmm_aux_0, vmm_aux_1); // (A * D + B * C) << 32 + B * D + } + } break; + default: IE_THROW() << "jit_multiply_emitter doesn't support precision '" << exec_prc_ << "'"; } } -std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; +} + +size_t jit_multiply_emitter::aux_vecs_count() const { + if (exec_prc_ == Precision::I64 && !x64::mayiuse(x64::avx512_core)) { + return 2; + } else { + return 0; + } } /// DIVIDE /// -jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + prepare_table(); +} 
+jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} -size_t jit_divide_emitter::get_inputs_num() const { return 2; } +size_t jit_divide_emitter::get_inputs_num() const { + return 2; +} void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { if (host_isa_ == x64::sse41) { @@ -282,64 +357,103 @@ void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_divide_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - auto uni_vdiv = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: { - h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1); - break; - } - case Precision::I32: { - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - - // The opset doesn't contain vector instruction for integer divide operation - // As WA we emulate its behavior via fp divide followed by rounding to zero - h->uni_vcvtdq2ps(vmm_dst, vmm_src0); - h->uni_vcvtdq2ps(vmm_aux0, vmm_src1); - h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0); - h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero - h->uni_vcvtps2dq(vmm_dst, vmm_dst); - break; - } - default: assert(!"unsupported precision"); + // The opset doesn't contain vector instruction for integer divide operation + // As WA we emulate its behavior via fp divide followed by rounding to zero + switch (exec_prc_) { + case Precision::FP32: { + 
h->uni_vdivps(vmm_dst, vmm_src_0, vmm_src_1); + break; } - }; + case Precision::I32: { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); - if (isa == x64::sse41) { - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vdiv(vmm_dst, vmm_dst, vmm_src1); - } else { - uni_vdiv(vmm_dst, vmm_src0, vmm_src1); + h->uni_vcvtdq2ps(vmm_dst, vmm_src_0); + if (second_is_float) { + h->uni_vdivps(vmm_dst, vmm_dst, vmm_src_1); + } else { + h->uni_vcvtdq2ps(vmm_aux_0, vmm_src_1); + h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux_0); + } + h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero + h->uni_vcvtps2dq(vmm_dst, vmm_dst); + } break; + case Precision::I64: { + if (isa == x64::avx512_core) { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + + h->vcvtqq2pd(vmm_dst, vmm_src_0); + if (second_is_float) { + h->uni_vdivpd(vmm_dst, vmm_dst, vmm_src_1); + } else { + h->vcvtqq2pd(vmm_aux_0, vmm_src_1); + h->uni_vdivpd(vmm_dst, vmm_dst, vmm_aux_0); + } + h->uni_vroundpd(vmm_dst, vmm_dst, 3); // rounding to zero + h->vcvtpd2qq(vmm_dst, vmm_dst); + } else { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); + h->uni_vmovups(vmm_aux_1, table_val_64("dMask")); + + h->uni_vpaddq(vmm_dst, vmm_src_0, vmm_aux_1); + h->uni_vsubpd(vmm_dst, vmm_dst, vmm_aux_1); + + if (second_is_float) { + h->uni_vdivpd(vmm_dst, vmm_dst, vmm_src_1); + } else { + h->uni_vpaddq(vmm_aux_0, vmm_src_1, vmm_aux_1); + h->uni_vsubpd(vmm_aux_0, vmm_aux_0, vmm_aux_1); + + h->uni_vdivpd(vmm_dst, vmm_dst, vmm_aux_0); + } + h->uni_vroundpd(vmm_dst, vmm_dst, 3); // rounding to zero + + h->uni_vaddpd(vmm_dst, vmm_dst, vmm_aux_1); + h->uni_vpsubq(vmm_dst, vmm_dst, vmm_aux_1); + } break; + } + default: IE_THROW() << "jit_divide_emitter doesn't support precision '" << exec_prc_ << "'"; } } -std::set> jit_divide_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_divide_emitter::get_supported_precisions(const std::shared_ptr& node) { + 
return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } size_t jit_divide_emitter::aux_vecs_count() const { - return exec_prc_ == Precision::I32 ? 1 : 0; + if (x64::mayiuse(x64::avx512_core)) { + return (exec_prc_ == Precision::I32 || exec_prc_ == Precision::I64) ? 1 : 0; + } else { + return exec_prc_ == Precision::I32 ? 1 : exec_prc_ == Precision::I64 ? 2 : 0; + } +} + +void jit_divide_emitter::register_table_entries() { + if (host_isa_ != x64::avx512_core) { + push_arg_entry_of_64("dMask", 0x433800002150d000, true); + } } /// FLOOR /// -jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} -jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) {} +jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_floor_emitter::get_inputs_num() const { return 1; } -std::set> jit_floor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -351,7 +465,7 @@ void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, const } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_floor_emitter doesn't support ISA '" << host_isa_ << "'"; } } @@ -364,14 +478,14 @@ void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const s } /// CEILING /// 
-jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_ceiling_emitter::get_inputs_num() const { return 1; } -std::set> jit_ceiling_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_ceiling_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -384,7 +498,7 @@ void jit_ceiling_emitter::emit_impl(const std::vector& in_vec_idxs, } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_ceiling_emitter doesn't support ISA '" << host_isa_ << "'"; } } @@ -397,15 +511,23 @@ void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const } /// FLOOR_MOD /// -jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} -jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} +jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + 
: jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} size_t jit_floor_mod_emitter::get_inputs_num() const { return 2; } -std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}}; +std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { + if (x64::mayiuse(x64::avx512_core)) { + return {{element::f32, element::f32}, {element::f64, element::f64}}; + } else { + return {{element::f32, element::f32}, {element::i64, element::i64}}; + } } void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { @@ -416,49 +538,120 @@ void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, co } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_floor_mod_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); - if (isa == x64::sse41) { - if (vmm_dst.getIdx() != vmm_src0.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - h->uni_vmovups(vmm_aux0, vmm_src0); - h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down - h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); - } else { - if (vmm_dst.getIdx() != vmm_src0.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down - h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); - 
h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); + switch (exec_prc_) { + case Precision::FP32: { + if (isa == x64::sse41) { + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src_0); + } + h->uni_vdivps(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vroundps(vmm_aux_0, vmm_aux_0, 1); // rounding down + h->uni_vmulps(vmm_aux_0, vmm_aux_0, vmm_src_1); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux_0); + } else { + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vdivps(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vroundps(vmm_dst, vmm_dst, 1); // rounding down + h->vfnmadd132ps(vmm_dst, vmm_src_0, vmm_src_1); + } else { + h->uni_vdivps(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vroundps(vmm_aux_0, vmm_aux_0, 1); // rounding down + h->vfnmadd231ps(vmm_dst, vmm_aux_0, vmm_src_1); + } + } + } break; + case Precision::I64: { + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); + + if (isa == x64::avx512_core) { + h->vcvtqq2pd(vmm_aux_0, vmm_src_0); + h->vcvtqq2pd(vmm_aux_1, vmm_src_1); + + h->uni_vdivpd(vmm_dst, vmm_aux_0, vmm_aux_1); + h->uni_vroundpd(vmm_dst, vmm_dst, 1); // rounding down + h->vfnmadd132pd(vmm_dst, vmm_aux_0, vmm_aux_1); + + h->vcvtpd2qq(vmm_dst, vmm_dst); + } else { + Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); + h->uni_vmovups(vmm_aux2, table_val_64("dMask")); + + h->uni_vpaddq(vmm_aux_0, vmm_src_0, vmm_aux2); + h->uni_vsubpd(vmm_aux_0, vmm_aux_0, vmm_aux2); + h->uni_vpaddq(vmm_aux_1, vmm_src_1, vmm_aux2); + h->uni_vsubpd(vmm_aux_1, vmm_aux_1, vmm_aux2); + + if (isa == x64::sse41) { + h->uni_vdivpd(vmm_dst, vmm_aux_0, vmm_aux_1); + h->uni_vroundpd(vmm_dst, vmm_dst, 1); // rounding down + h->uni_vmulpd(vmm_aux_1, vmm_aux_1, vmm_dst); + h->uni_vsubpd(vmm_dst, vmm_aux_0, vmm_aux_1); + } else { + h->uni_vdivpd(vmm_dst, vmm_aux_0, vmm_aux_1); + h->uni_vroundpd(vmm_dst, vmm_dst, 1); // rounding down + h->vfnmadd132pd(vmm_dst, vmm_aux_0, vmm_aux_1); + } + + h->uni_vaddpd(vmm_dst, vmm_dst, vmm_aux2); + h->uni_vpsubq(vmm_dst, vmm_dst, vmm_aux2); + } + } break; + 
case Precision::FP64: { + if (isa == x64::sse41) { + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src_0); + } + h->uni_vdivpd(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vroundpd(vmm_aux_0, vmm_aux_0, 1); // rounding down + h->uni_vmulpd(vmm_aux_0, vmm_aux_0, vmm_src_1); + h->uni_vsubpd(vmm_dst, vmm_dst, vmm_aux_0); + } else { + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vdivpd(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vroundpd(vmm_dst, vmm_dst, 1); // rounding down + h->vfnmadd132pd(vmm_dst, vmm_src_0, vmm_src_1); + } else { + h->uni_vdivpd(vmm_aux_0, vmm_src_0, vmm_src_1); + h->uni_vroundpd(vmm_aux_0, vmm_aux_0, 1); // rounding down + h->vfnmadd231pd(vmm_dst, vmm_aux_0, vmm_src_1); + } + } + } break; + default: IE_THROW() << "jit_floor_mod_emitter doesn't support precision '" << exec_prc_ << "'"; } } size_t jit_floor_mod_emitter::aux_vecs_count() const { - return 1; + return (host_isa_ != x64::avx512_core && exec_prc_ == Precision::I64) ? 3 : 1; +} + +void jit_floor_mod_emitter::register_table_entries() { + if (host_isa_ != x64::avx512_core && exec_prc_ == Precision::I64) { + push_arg_entry_of_64("dMask", 0x433800002150d000, true); + } } /// MOD /// -jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} -jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_mod_emitter::get_inputs_num() const { return 2; } -std::set> 
jit_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -470,33 +663,33 @@ void jit_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const st } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_mod_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); if (isa == x64::sse41) { - if (vmm_dst.getIdx() != vmm_src0.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - h->uni_vmovups(vmm_aux0, vmm_src0); - h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate - h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src_0); + h->uni_vmovups(vmm_aux_0, vmm_src_0); + h->uni_vdivps(vmm_aux_0, vmm_aux_0, vmm_src_1); + h->uni_vroundps(vmm_aux_0, vmm_aux_0, 3); // truncate + h->uni_vmulps(vmm_aux_0, vmm_aux_0, vmm_src_1); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux_0); } else { - if (vmm_dst.getIdx() != vmm_src0.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate - h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src_0); + h->uni_vdivps(vmm_aux_0, vmm_src_0, vmm_src_1); 
+ h->uni_vroundps(vmm_aux_0, vmm_aux_0, 3); // truncate + h->uni_vmulps(vmm_aux_0, vmm_aux_0, vmm_src_1); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux_0); } } @@ -506,9 +699,9 @@ size_t jit_mod_emitter::aux_vecs_count() const { /// MAXIMUM /// jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_maximum_emitter::get_inputs_num() const { return 2; } @@ -520,43 +713,71 @@ void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_maximum_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); - - auto uni_vmax = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); break; - case Precision::I32: h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); break; - default: assert(!"unsupported precision"); - } - }; - if (isa == x64::sse41) { - if (vmm_src0.getIdx() != vmm_dst.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vmax(vmm_dst, vmm_dst, vmm_src1); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Operand op_src_1; + if 
(in_vec_idxs.size() > 1) { + op_src_1 = Vmm(in_vec_idxs[1]); + } else if (aux_gpr_idxs.size() > 0) { + op_src_1 = h->ptr[Reg64(aux_gpr_idxs[0])]; } else { - uni_vmax(vmm_dst, vmm_src0, vmm_src1); + IE_THROW() << "jit_maximum_emitter has invalid inputs number."; + } + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + switch (exec_prc_) { + case Precision::FP32: h->uni_vmaxps(vmm_dst, vmm_src_0, op_src_1); break; + case Precision::I32: h->uni_vpmaxsd(vmm_dst, vmm_src_0, op_src_1); break; + case Precision::I64: { + if (isa == x64::avx512_core) { + h->vpmaxsq(vmm_dst, vmm_src_0, op_src_1); + } else { + if (aux_vec_idxs.size() < 1) { + IE_THROW() << "jit_maximum_emitter has invalid number of aux vectors."; + } + auto vmm_aux = Vmm(aux_vec_idxs[0]); + if (isa == x64::avx2) { + h->vpcmpgtq(vmm_aux, vmm_src_0, op_src_1); + h->vandpd(vmm_dst, vmm_src_0, vmm_aux); + h->vandnpd(vmm_aux, vmm_aux, op_src_1); + h->vorpd(vmm_dst, vmm_dst, vmm_aux); + } else { + h->movups(vmm_aux, vmm_src_0); + h->pcmpgtq(vmm_aux, op_src_1); + h->andpd(vmm_aux, vmm_src_0); + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->movups(vmm_dst, vmm_src_0); + } + h->pcmpgtq(vmm_dst, op_src_1); + h->andnpd(vmm_dst, op_src_1); + h->orpd(vmm_dst, vmm_aux); + } + } + } break; + default: IE_THROW() << "jit_maximum_emitter doesn't support precision '" << exec_prc_ << "'"; } } -std::set> jit_maximum_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_maximum_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; +} + +size_t jit_maximum_emitter::aux_vecs_count() const { + return (host_isa_ != x64::avx512_core && exec_prc_ == Precision::I64) ? 
1 : 0; } /// MINIMUM /// jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_minimum_emitter::get_inputs_num() const { return 2; } @@ -568,44 +789,75 @@ void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_minimum_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); - - auto uni_vmin = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); break; - case Precision::I32: h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); break; - default: assert(!"unsupported precision"); - } - }; - if (isa == x64::sse41) { - if (vmm_src0.getIdx() != vmm_dst.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vmin(vmm_dst, vmm_dst, vmm_src1); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Operand op_src_1; + if (in_vec_idxs.size() > 1) { + op_src_1 = Vmm(in_vec_idxs[1]); + } else if (aux_gpr_idxs.size() > 0) { + op_src_1 = h->ptr[Reg64(aux_gpr_idxs[0])]; } else { - uni_vmin(vmm_dst, vmm_src0, vmm_src1); + IE_THROW() << 
"jit_minimum_emitter has invalid inputs number."; } + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + switch (exec_prc_) { + case Precision::FP32: h->uni_vminps(vmm_dst, vmm_src_0, op_src_1); break; + case Precision::I32: h->uni_vpminsd(vmm_dst, vmm_src_0, op_src_1); break; + case Precision::I64: { + if (isa == x64::avx512_core) { + h->vpminsq(vmm_dst, vmm_src_0, op_src_1); + } else { + if (aux_vec_idxs.size() < 1) { + IE_THROW() << "jit_minimum_emitter has invalid number of aux vectors."; + } + auto vmm_aux = Vmm(aux_vec_idxs[0]); + if (isa == x64::avx2) { + h->vpcmpgtq(vmm_aux, vmm_src_0, op_src_1); + h->vandnpd(vmm_dst, vmm_aux, vmm_src_0); + h->vandpd(vmm_aux, vmm_aux, op_src_1); + h->vorpd(vmm_dst, vmm_dst, vmm_aux); + } else { + h->movups(vmm_aux, vmm_src_0); + h->pcmpgtq(vmm_aux, op_src_1); + h->andpd(vmm_aux, op_src_1); + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->movups(vmm_dst, vmm_src_0); + } + h->pcmpgtq(vmm_dst, op_src_1); + h->andnpd(vmm_dst, vmm_src_0); + h->orpd(vmm_dst, vmm_aux); + } + } + } break; + default: IE_THROW() << "jit_minimum_emitter doesn't support precision '" << exec_prc_ << "'"; + } +} + +std::set> jit_minimum_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } -std::set> jit_minimum_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +size_t jit_minimum_emitter::aux_vecs_count() const { + return (host_isa_ != x64::avx512_core && exec_prc_ == Precision::I64) ? 
1 : 0; } /// SQUARED_DIFFERENCE /// -jit_squared_difference_emitter::jit_squared_difference_emitter( - x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} -jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } @@ -617,54 +869,66 @@ void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_squared_difference_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - auto uni_vsqdiff = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { - switch (exec_prc_) { - case Precision::FP32: { - h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); - h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); - } break; - case Precision::I32: { - h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); - h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst); - } break; - default: 
assert(!"unsupported precision"); - } - }; - - if (isa == x64::sse41) { - if (vmm_src0.getIdx() != vmm_dst.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); - uni_vsqdiff(vmm_dst, vmm_dst, vmm_src1); - } else { - uni_vsqdiff(vmm_dst, vmm_src0, vmm_src1); + switch (exec_prc_) { + case Precision::FP32: { + h->uni_vsubps(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); + } break; + case Precision::I32: { + h->uni_vpsubd(vmm_dst, vmm_src_0, vmm_src_1); + h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst); + } break; + case Precision::I64: { + if (isa == x64::avx512_core) { + h->uni_vpsubq(vmm_dst, vmm_src_0, vmm_src_1); + h->vpmullq(vmm_dst, vmm_dst, vmm_dst); + } else { + h->uni_vpsubq(vmm_dst, vmm_src_0, vmm_src_1); + + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + // There is no multiply int64 instruction on AVX2 and SSE41, thus the WA is used. + // vmm_src_0 = ab; vmm_src_1 = cd; + h->uni_vpsrlq(vmm_aux_0, vmm_dst, 32); + h->uni_vpmuludq(vmm_aux_0, vmm_aux_0, vmm_dst); // a * d + h->uni_vpaddq(vmm_aux_0, vmm_aux_0, vmm_aux_0); // a * d + b * c + h->uni_vpsllq(vmm_aux_0, vmm_aux_0, 32); + h->uni_vpmuludq(vmm_dst, vmm_dst, vmm_dst); // b * d + h->uni_vpaddq(vmm_dst, vmm_dst, vmm_aux_0); // (a * d + b * c) << 32 + b * d + } + } break; + default: IE_THROW() << "jit_squared_difference_emitter doesn't support precision '" << exec_prc_ << "'"; } } -std::set> jit_squared_difference_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_squared_difference_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; +} + +size_t jit_squared_difference_emitter::aux_vecs_count() const { + return (!x64::mayiuse(x64::avx512_core) && exec_prc_ == Precision::I64) ? 
1 : 0; } /// POWER_DYNAMIC /// jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, - Precision exec_prc) + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_power_dynamic_emitter::get_inputs_num() const { return 2; } -std::set> jit_power_dynamic_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_dynamic_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -676,23 +940,23 @@ void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_power_dynamic_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Xmm xmm0 = Xmm(0), xmm1 = Xmm(1); // caller obligation to save gprs as callee may use them size_t gpr_size = 8; - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, - h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, + h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ 
-721,8 +985,8 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); - h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src - h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta + h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src_0); // src + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src_1); // beta // save function address in gpr to pass in in call instruction h->mov(h->rbp, reinterpret_cast(powf)); @@ -768,19 +1032,19 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, /// EQUAL /// -jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_equal_emitter::get_inputs_num() const { return 2; } -std::set> jit_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}}; +std::set> jit_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) 
const { @@ -791,58 +1055,80 @@ void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_equal_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == x64::sse41) { - h->movups(vmm_aux0, vmm_src0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); - h->movups(vmm_aux1, table_val("one")); - h->pxor(vmm_dst, vmm_dst); - h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == x64::avx2) { - h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vmovups(vmm_dst, table_val("zero")); - h->uni_vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + // TODO: Actually the Result is bool in U8 representation. 0x01 or 0xFF - is there a difference for real models? + // Remove all vpsrld instructions if there is no difference. 
+ if (isa == x64::sse41 || isa == x64::avx2) { + Vmm vmm_src0_t = vmm_src_0; + if (isa == x64::sse41 && vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src_0); + vmm_src0_t = vmm_dst; + } + switch (exec_prc_) { + case Precision::FP32: + h->uni_vcmpps(vmm_dst, vmm_src0_t, vmm_src_1, _cmp_eq_oq); + h->uni_vandps(vmm_dst, vmm_dst, table_val("oneF")); + break; + case Precision::I32: + h->uni_vpcmpeqd(vmm_dst, vmm_src0_t, vmm_src_1); + h->uni_vpsrld(vmm_dst, vmm_dst, 31); + break; + case Precision::I64: + h->uni_vpcmpeqq(vmm_dst, vmm_src0_t, vmm_src_1); + h->uni_vpsrlq(vmm_dst, vmm_dst, 63); + break; + default: IE_THROW() << "jit_equal_emitter doesn't support precision '" << exec_prc_ << "'"; + } } else { - h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq); - h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + switch (exec_prc_) { + case Precision::FP32: + h->vcmpps(k_mask, vmm_src_0, vmm_src_1, _cmp_eq_oq); + h->uni_vmovups(vmm_dst | k_mask | h->T_z, table_val("oneF")); + break; + case Precision::I32: + h->vpcmpeqd(k_mask, vmm_src_0, vmm_src_1); + h->vpmovm2d(vmm_dst, k_mask); + h->uni_vpsrld(vmm_dst, vmm_dst, 31); + break; + case Precision::I64: + h->vpcmpeqq(k_mask, vmm_src_0, vmm_src_1); + h->vpmovm2q(vmm_dst, k_mask); + h->vpsrlq(vmm_dst, vmm_dst, 63); + break; + default: IE_THROW() << "jit_equal_emitter doesn't support precision '" << exec_prc_ << "'"; + } } } void jit_equal_emitter::register_table_entries() { - push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", CONST_1_F, true); -} - -size_t jit_equal_emitter::aux_vecs_count() const { - return 2; + if (exec_prc_ == Precision::FP32) { + push_arg_entry_of("oneF", CONST_1_F, true); + } } /// NOT_EQUAL /// -jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { 
+jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_not_equal_emitter::get_inputs_num() const { return 2; } -std::set> jit_not_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_not_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -854,31 +1140,31 @@ void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, co } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_not_equal_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_not_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); if (isa == x64::sse41) { - h->movups(vmm_aux0, vmm_src0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->movups(vmm_aux_0, vmm_src_0); + h->cmpps(vmm_aux_0, vmm_src_1, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_dst, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_dst, vmm_aux_1); } else if (isa == 
x64::avx2) { - h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); + h->vcmpeqps(vmm_aux_0, vmm_src_0, vmm_src_1); h->uni_vmovups(vmm_dst, table_val("one")); - h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq); + h->vcmpps(k_mask, vmm_src_0, vmm_src_1, _cmp_eq_oq); h->uni_vmovups(vmm_dst, table_val("one")); h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("zero")); } @@ -894,19 +1180,19 @@ size_t jit_not_equal_emitter::aux_vecs_count() const { } /// GREATER /// -jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_greater_emitter::get_inputs_num() const { return 2; } -std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}}; +std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { @@ -917,59 +1203,79 @@ void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, cons } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, 
out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_greater_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == x64::sse41) { - h->movups(vmm_aux0, vmm_src0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_gt_os); - h->movups(vmm_aux1, table_val("one")); - h->pxor(vmm_dst, vmm_dst); - h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == x64::avx2) { - h->vcmpgtps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + if (isa == x64::sse41 || isa == x64::avx2) { + Vmm vmm_src0_t = vmm_src_0; + if (isa == x64::sse41 && vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src_0); + vmm_src0_t = vmm_dst; + } + switch (exec_prc_) { + case Precision::FP32: + h->uni_vcmpps(vmm_dst, vmm_src0_t, vmm_src_1, _cmp_gt_os); + h->uni_vandps(vmm_dst, vmm_dst, table_val("oneF")); + break; + case Precision::I32: + h->uni_vpcmpgtd(vmm_dst, vmm_src0_t, vmm_src_1); + h->uni_vpsrld(vmm_dst, vmm_dst, 31); + break; + case Precision::I64: + h->uni_vpcmpgtq(vmm_dst, vmm_src0_t, vmm_src_1); + h->uni_vpsrlq(vmm_dst, vmm_dst, 63); + break; + default: IE_THROW() << "jit_greater_emitter doesn't support precision '" << exec_prc_ << "'"; + } } else { - h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_gt_os); - h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + switch (exec_prc_) { + case Precision::FP32: + h->vcmpps(k_mask, vmm_src_0, vmm_src_1, _cmp_gt_os); + h->uni_vmovups(vmm_dst | k_mask | 
h->T_z, table_val("oneF")); + break; + case Precision::I32: + h->vpcmpgtd(k_mask, vmm_src_0, vmm_src_1); + h->vpmovm2d(vmm_dst, k_mask); + h->uni_vpsrld(vmm_dst, vmm_dst, 31); + break; + case Precision::I64: + h->vpcmpgtq(k_mask, vmm_src_0, vmm_src_1); + h->vpmovm2q(vmm_dst, k_mask); + h->vpsrlq(vmm_dst, vmm_dst, 63); + break; + default: IE_THROW() << "jit_greater_emitter doesn't support precision '" << exec_prc_ << "'"; + } } } void jit_greater_emitter::register_table_entries() { - push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", CONST_1_F, true); -} - -size_t jit_greater_emitter::aux_vecs_count() const { - return 2; + if (exec_prc_ == Precision::FP32) { + push_arg_entry_of("oneF", CONST_1_F, true); + } } /// GREATER_EQUAL /// jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, - Precision exec_prc) + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_greater_equal_emitter::get_inputs_num() const { return 2; } -std::set> jit_greater_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -981,31 +1287,31 @@ void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_greater_equal_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_greater_equal_emitter::emit_isa(const 
std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); if (isa == x64::sse41) { - h->movups(vmm_aux0, vmm_src0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_ge_os); - h->movups(vmm_aux1, table_val("one")); + h->movups(vmm_aux_0, vmm_src_0); + h->cmpps(vmm_aux_0, vmm_src_1, _cmp_ge_os); + h->movups(vmm_aux_1, table_val("one")); h->pxor(vmm_dst, vmm_dst); - h->blendvps(vmm_dst, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux_1); } else if (isa == x64::avx2) { - h->vcmpgeps(vmm_aux0, vmm_src0, vmm_src1); + h->vcmpgeps(vmm_aux_0, vmm_src_0, vmm_src_1); h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_ge_os); + h->vcmpps(k_mask, vmm_src_0, vmm_src_1, _cmp_ge_os); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); } @@ -1021,19 +1327,19 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const { } /// LESS /// -jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) { 
+jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_less_emitter::get_inputs_num() const { return 2; } -std::set> jit_less_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32, element::f32}}; +std::set> jit_less_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::i64, element::i64}}; } void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { @@ -1044,58 +1350,78 @@ void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, const s } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_less_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_less_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == x64::sse41) { - h->movups(vmm_aux0, vmm_src0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_lt_os); - h->movups(vmm_aux1, table_val("one")); - h->pxor(vmm_dst, vmm_dst); - h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == x64::avx2) { - h->vcmpltps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + if (isa == x64::sse41 || isa == x64::avx2) { + Vmm vmm_src0_t = vmm_src_0; + if (isa == x64::sse41 && vmm_dst.getIdx() != vmm_src_0.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src_0); + vmm_src0_t = vmm_dst; + } + switch 
(exec_prc_) { + case Precision::FP32: + h->uni_vcmpps(vmm_dst, vmm_src0_t, vmm_src_1, _cmp_lt_os); + h->uni_vandps(vmm_dst, vmm_dst, table_val("oneF")); + break; + case Precision::I32: + h->uni_vpcmpgtd(vmm_dst, vmm_src_1, vmm_src0_t); + h->uni_vpsrld(vmm_dst, vmm_dst, 31); + break; + case Precision::I64: + h->uni_vpcmpgtq(vmm_dst, vmm_src_1, vmm_src0_t); + h->uni_vpsrlq(vmm_dst, vmm_dst, 63); + break; + default: IE_THROW() << "jit_less_emitter doesn't support precision '" << exec_prc_ << "'"; + } } else { - h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_lt_os); - h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); + switch (exec_prc_) { + case Precision::FP32: + h->vcmpps(k_mask, vmm_src_0, vmm_src_1, _cmp_lt_os); + h->uni_vmovups(vmm_dst | k_mask | h->T_z, table_val("oneF")); + break; + case Precision::I32: + h->vpcmpgtd(k_mask, vmm_src_1, vmm_src_0); + h->vpmovm2d(vmm_dst, k_mask); + h->uni_vpsrld(vmm_dst, vmm_dst, 31); + break; + case Precision::I64: + h->vpcmpgtq(k_mask, vmm_src_1, vmm_src_0); + h->vpmovm2q(vmm_dst, k_mask); + h->vpsrlq(vmm_dst, vmm_dst, 63); + break; + default: IE_THROW() << "jit_less_emitter doesn't support precision '" << exec_prc_ << "'"; + } } } void jit_less_emitter::register_table_entries() { - push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", CONST_1_F, true); -} - -size_t jit_less_emitter::aux_vecs_count() const { - return 2; + if (exec_prc_ == Precision::FP32) { + push_arg_entry_of("oneF", CONST_1_F, true); + } } /// LESS_EQUAL /// -jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } 
-jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_less_equal_emitter::get_inputs_num() const { return 2; } -std::set> jit_less_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_less_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1107,32 +1433,32 @@ void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, c } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_less_equal_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); if (isa == x64::sse41) { - h->movups(vmm_aux0, vmm_src0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_le_os); - h->movups(vmm_aux1, table_val("one")); + h->movups(vmm_aux_0, vmm_src_0); + h->cmpps(vmm_aux_0, vmm_src_1, _cmp_le_os); + h->movups(vmm_aux_1, table_val("one")); h->pxor(vmm_dst, vmm_dst); - h->blendvps(vmm_dst, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux_1); } else if (isa == x64::avx2) { - h->vcmpleps(vmm_aux0, vmm_src0, vmm_src1); + h->vcmpleps(vmm_aux_0, vmm_src_0, vmm_src_1); h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), 
vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_le_os); + h->vcmpps(k_mask, vmm_src_0, vmm_src_1, _cmp_le_os); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); } @@ -1148,18 +1474,18 @@ size_t jit_less_equal_emitter::aux_vecs_count() const { } /// LOGICAL_AND /// -jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_logical_and_emitter::get_inputs_num() const { return 2; } -std::set> jit_logical_and_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_and_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1171,53 +1497,53 @@ void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_logical_and_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = 
Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); if (isa == x64::sse41) { - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_0, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_dst, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_dst, vmm_aux_1); - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_1, _cmp_eq_oq); h->movups(vmm_aux2, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_aux2, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_aux2, vmm_aux_1); h->uni_vandps(vmm_dst, vmm_dst, vmm_aux2); } else if (isa == x64::avx2) { - h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->vcmpeqps(vmm_aux_0, vmm_src_0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("one")); - h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux_0); - h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero")); - h->uni_vmovups(vmm_aux0, table_val("one")); - h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1); + h->vcmpeqps(vmm_aux_1, vmm_src_1, table_val("zero")); + h->uni_vmovups(vmm_aux_0, table_val("one")); + h->vblendvps(vmm_aux_0, vmm_aux_0, table_val("zero"), vmm_aux_1); - h->uni_vandps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vandps(vmm_dst, vmm_dst, vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); - h->uni_vmovups(vmm_aux0, table_val("one")); - h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero")); + h->vcmpps(k_mask, vmm_src_0, table_val("zero"), _cmp_eq_oq); + 
h->uni_vmovups(vmm_aux_0, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_aux_0, table_val("zero")); - h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq); - h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero")); + h->vcmpps(k_mask, vmm_src_1, table_val("zero"), _cmp_eq_oq); + h->vblendmps(vmm_aux_0 | k_mask, vmm_aux_0, table_val("zero")); - h->uni_vandps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vandps(vmm_dst, vmm_dst, vmm_aux_0); } } @@ -1232,18 +1558,18 @@ size_t jit_logical_and_emitter::aux_vecs_count() const { /// LOGICAL_OR /// -jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_logical_or_emitter::get_inputs_num() const { return 2; } -std::set> jit_logical_or_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_or_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1255,53 +1581,53 @@ void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, c } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_logical_or_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const 
{ using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); if (isa == x64::sse41) { - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_0, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_dst, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_dst, vmm_aux_1); - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_1, _cmp_eq_oq); h->movups(vmm_aux2, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_aux2, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_aux2, vmm_aux_1); h->uni_vorps(vmm_dst, vmm_dst, vmm_aux2); } else if (isa == x64::avx2) { - h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->vcmpeqps(vmm_aux_0, vmm_src_0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("one")); - h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux_0); - h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero")); - h->uni_vmovups(vmm_aux0, table_val("one")); - h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1); + h->vcmpeqps(vmm_aux_1, vmm_src_1, table_val("zero")); + h->uni_vmovups(vmm_aux_0, table_val("one")); + h->vblendvps(vmm_aux_0, vmm_aux_0, table_val("zero"), vmm_aux_1); - h->uni_vorps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vorps(vmm_dst, vmm_dst, vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); - h->uni_vmovups(vmm_aux0, 
table_val("one")); - h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero")); + h->vcmpps(k_mask, vmm_src_0, table_val("zero"), _cmp_eq_oq); + h->uni_vmovups(vmm_aux_0, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_aux_0, table_val("zero")); - h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq); - h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero")); + h->vcmpps(k_mask, vmm_src_1, table_val("zero"), _cmp_eq_oq); + h->vblendmps(vmm_aux_0 | k_mask, vmm_aux_0, table_val("zero")); - h->uni_vorps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vorps(vmm_dst, vmm_dst, vmm_aux_0); } } @@ -1315,18 +1641,18 @@ size_t jit_logical_or_emitter::aux_vecs_count() const { } /// LOGICAL_XOR /// -jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_logical_xor_emitter::get_inputs_num() const { return 2; } -std::set> jit_logical_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1338,53 +1664,53 @@ void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_logical_xor_emitter doesn't 
support ISA '" << host_isa_ << "'"; } } template void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); if (isa == x64::sse41) { - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_0, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_dst, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_dst, vmm_aux_1); - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_1, _cmp_eq_oq); h->movups(vmm_aux2, table_val("one")); - h->pxor(vmm_aux1, vmm_aux1); - h->blendvps(vmm_aux2, vmm_aux1); + h->pxor(vmm_aux_1, vmm_aux_1); + h->blendvps(vmm_aux2, vmm_aux_1); h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux2); } else if (isa == x64::avx2) { - h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->vcmpeqps(vmm_aux_0, vmm_src_0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("one")); - h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux_0); - h->vcmpeqps(vmm_aux1, vmm_src1, table_val("zero")); - h->uni_vmovups(vmm_aux0, table_val("one")); - h->vblendvps(vmm_aux0, vmm_aux0, table_val("zero"), vmm_aux1); + h->vcmpeqps(vmm_aux_1, vmm_src_1, table_val("zero")); + h->uni_vmovups(vmm_aux_0, table_val("one")); + h->vblendvps(vmm_aux_0, vmm_aux_0, table_val("zero"), vmm_aux_1); - h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux0); 
+ h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); - h->uni_vmovups(vmm_aux0, table_val("one")); - h->vblendmps(vmm_dst | k_mask, vmm_aux0, table_val("zero")); + h->vcmpps(k_mask, vmm_src_0, table_val("zero"), _cmp_eq_oq); + h->uni_vmovups(vmm_aux_0, table_val("one")); + h->vblendmps(vmm_dst | k_mask, vmm_aux_0, table_val("zero")); - h->vcmpps(k_mask, vmm_src1, table_val("zero"), _cmp_eq_oq); - h->vblendmps(vmm_aux0 | k_mask, vmm_aux0, table_val("zero")); + h->vcmpps(k_mask, vmm_src_1, table_val("zero"), _cmp_eq_oq); + h->vblendmps(vmm_aux_0 | k_mask, vmm_aux_0, table_val("zero")); - h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux_0); } } @@ -1398,18 +1724,18 @@ size_t jit_logical_xor_emitter::aux_vecs_count() const { } /// LOGICAL_NOT /// -jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_logical_not_emitter::get_inputs_num() const { return 1; } -std::set> jit_logical_not_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_not_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -1421,30 +1747,30 @@ void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, } else if (host_isa_ == 
x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_logical_not_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_logical_not_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); if (isa == x64::sse41) { - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); - h->movups(vmm_aux1, table_val("one")); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_0, _cmp_eq_oq); + h->movups(vmm_aux_1, table_val("one")); h->pxor(vmm_dst, vmm_dst); - h->blendvps(vmm_dst, vmm_aux1); + h->blendvps(vmm_dst, vmm_aux_1); } else if (isa == x64::avx2) { - h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); + h->vcmpeqps(vmm_aux_0, vmm_src_0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux_0); } else { - h->vcmpps(k_mask, vmm_src0, table_val("zero"), _cmp_eq_oq); + h->vcmpps(k_mask, vmm_src_0, table_val("zero"), _cmp_eq_oq); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendmps(vmm_dst | k_mask, vmm_dst, table_val("one")); } @@ -1460,8 +1786,8 @@ size_t jit_logical_not_emitter::aux_vecs_count() const { } /// POWER_STATIC /// -jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : 
jit_emitter(host, host_isa, node, exec_prc) { auto powerStaticNode = ov::as_type_ptr(node); if (powerStaticNode == nullptr) { IE_THROW() << "Can't cast to snippets::op::PowerStatic"; @@ -1476,14 +1802,14 @@ jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64 jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, float inpPower, float inpScale, float inpShift, - Precision exec_prc) + const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc), power(inpPower), scale(inpScale), shift(inpShift) { prepare_table(); } size_t jit_power_static_emitter::get_inputs_num() const { return 1; } -std::set> jit_power_static_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_static_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -1495,38 +1821,38 @@ void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_power_static_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); Xmm xmm0 = Xmm(0), xmm1 = Xmm(1); if (scale != 1.f || shift != 0.f) { if (isa == x64::sse41) { - h->uni_vmovups(vmm_aux0, table_val("scale")); - h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0); + h->uni_vmovups(vmm_aux_0, table_val("scale")); + h->uni_vmulps(vmm_aux_0, vmm_aux_0, vmm_src_0); h->uni_vmovups(vmm_dst, table_val("shift")); - h->uni_vaddps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_aux_0); } else { - if 
(vmm_dst.getIdx() != vmm_src0.getIdx()) { + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) { h->uni_vmovups(vmm_dst, table_val("shift")); - h->uni_vfmadd231ps(vmm_dst, vmm_src0, table_val("scale")); + h->uni_vfmadd231ps(vmm_dst, vmm_src_0, table_val("scale")); } else { - h->uni_vmovups(vmm_aux0, table_val("shift")); - h->uni_vfmadd231ps(vmm_aux0, vmm_src0, table_val("scale")); - h->uni_vmovups(vmm_dst, vmm_aux0); + h->uni_vmovups(vmm_aux_0, table_val("shift")); + h->uni_vfmadd231ps(vmm_aux_0, vmm_src_0, table_val("scale")); + h->uni_vmovups(vmm_dst, vmm_aux_0); } } } else { - if (vmm_dst.getIdx() != vmm_src0.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src0); + if (vmm_dst.getIdx() != vmm_src_0.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src_0); } if (power == 1.f) { @@ -1534,37 +1860,37 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, h->uni_vsqrtps(vmm_dst, vmm_dst); if (power < 0.f) { - h->uni_vmovups(vmm_aux0, table_val("one")); + h->uni_vmovups(vmm_aux_0, table_val("one")); if (isa == x64::sse41) { - h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst); - h->uni_vmovups(vmm_dst, vmm_aux0); + h->uni_vdivps(vmm_aux_0, vmm_aux_0, vmm_dst); + h->uni_vmovups(vmm_dst, vmm_aux_0); } else { - h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst); + h->uni_vdivps(vmm_dst, vmm_aux_0, vmm_dst); } } } else if (std::floor(power) == power && power != 0) { int ipower = std::abs(static_cast(power)); - h->uni_vmovups(vmm_aux0, vmm_dst); + h->uni_vmovups(vmm_aux_0, vmm_dst); for (int i = 1; i < ipower; i++) { - h->uni_vmulps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_aux_0); } if (power < 0.f) { - h->uni_vmovups(vmm_aux0, table_val("one")); + h->uni_vmovups(vmm_aux_0, table_val("one")); if (isa == x64::sse41) { - h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst); - h->uni_vmovups(vmm_dst, vmm_aux0); + h->uni_vdivps(vmm_aux_0, vmm_aux_0, vmm_dst); + h->uni_vmovups(vmm_dst, vmm_aux_0); } else { - h->uni_vdivps(vmm_dst, vmm_aux0, vmm_dst); + h->uni_vdivps(vmm_dst, 
vmm_aux_0, vmm_dst); } } } else { - h->uni_vmovups(vmm_aux0, table_val("power")); + h->uni_vmovups(vmm_aux_0, table_val("power")); // caller obligation to save gprs as callee may use them size_t gpr_size = 8; - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, - h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, + h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -1594,7 +1920,7 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // src - h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // beta + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux_0); // beta // save function address in gpr to pass in in call instruction h->mov(h->rbp, reinterpret_cast(powf)); @@ -1651,17 +1977,17 @@ size_t jit_power_static_emitter::aux_vecs_count() const { } /// PRELU /// -jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_prelu_emitter::get_inputs_num() const { return 2; } -std::set> jit_prelu_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> 
jit_prelu_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1673,38 +1999,38 @@ void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_prelu_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); - Vmm vmm_src1 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); if (isa == x64::sse41) { - h->pxor(vmm_aux0, vmm_aux0); - h->cmpps(vmm_aux0, vmm_src0, _cmp_gt_os); - h->movups(vmm_aux1, vmm_src1); - h->mulps(vmm_aux1, vmm_src0); - if (vmm_src0.getIdx() != vmm_dst.getIdx()) - h->movups(vmm_dst, vmm_src0); - h->blendvps(vmm_dst, vmm_aux1); + h->pxor(vmm_aux_0, vmm_aux_0); + h->cmpps(vmm_aux_0, vmm_src_0, _cmp_gt_os); + h->movups(vmm_aux_1, vmm_src_1); + h->mulps(vmm_aux_1, vmm_src_0); + if (vmm_src_0.getIdx() != vmm_dst.getIdx()) + h->movups(vmm_dst, vmm_src_0); + h->blendvps(vmm_dst, vmm_aux_1); } else if (isa == x64::avx2) { - h->vmulps(vmm_aux0, vmm_src0, vmm_src1); - h->vxorps(vmm_aux1, vmm_aux1, vmm_aux1); - h->vcmpgtps(vmm_aux1, vmm_src0, vmm_aux1); - h->vblendvps(vmm_dst, vmm_aux0, vmm_src0, vmm_aux1); + h->vmulps(vmm_aux_0, vmm_src_0, vmm_src_1); + h->vxorps(vmm_aux_1, vmm_aux_1, vmm_aux_1); + h->vcmpgtps(vmm_aux_1, vmm_src_0, vmm_aux_1); + h->vblendvps(vmm_dst, vmm_aux_0, vmm_src_0, vmm_aux_1); } else if (isa == x64::avx512_core) { - h->vxorpd(vmm_aux0, vmm_aux0, vmm_aux0); - if (vmm_src0.getIdx() != vmm_dst.getIdx()) - 
h->vmovups(vmm_dst, vmm_src0); - h->vcmpps(k_mask, vmm_src0, vmm_aux0, _cmp_lt_os); - h->vmulps(vmm_dst | k_mask, vmm_src0, vmm_src1); + h->vxorpd(vmm_aux_0, vmm_aux_0, vmm_aux_0); + if (vmm_src_0.getIdx() != vmm_dst.getIdx()) + h->vmovups(vmm_dst, vmm_src_0); + h->vcmpps(k_mask, vmm_src_0, vmm_aux_0, _cmp_lt_os); + h->vmulps(vmm_dst | k_mask, vmm_src_0, vmm_src_1); } } @@ -1713,15 +2039,19 @@ size_t jit_prelu_emitter::aux_vecs_count() const { } /// SQRT /// -jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} -jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} +jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } -std::set> jit_sqrt_emitter::get_supported_precisions(const std::shared_ptr& node) { - return {{element::f32}}; +std::set> jit_sqrt_emitter::get_supported_precisions(const std::shared_ptr& node) { + return { {element::f32}, {element::i64} }; } void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { @@ -1732,26 +2062,65 @@ void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, const s } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_sqrt_emitter doesn't support ISA '" << host_isa_ << "'"; } } template void jit_sqrt_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector 
&out_vec_idxs) const { using Vmm = typename conditional3::type; - Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - h->uni_vsqrtps(vmm_dst, vmm_src0); + switch (exec_prc_) { + case Precision::FP32: h->uni_vsqrtps(vmm_dst, vmm_src_0); break; + case Precision::I64: { + if (isa == x64::avx512_core) { + h->vcvtqq2pd(vmm_dst, vmm_src_0); + h->uni_vsqrtpd(vmm_dst, vmm_dst); + if (rounding_type != RoundType::nearest) { + h->uni_vroundpd(vmm_dst, vmm_dst, rounding_type); + } + h->vcvtpd2qq(vmm_dst, vmm_dst); + } else { + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + h->uni_vmovups(vmm_aux_0, table_val_64("dMask")); + + h->uni_vpaddq(vmm_dst, vmm_src_0, vmm_aux_0); + h->uni_vsubpd(vmm_dst, vmm_dst, vmm_aux_0); + + h->uni_vsqrtpd(vmm_dst, vmm_dst); + if (rounding_type != RoundType::nearest) { + h->uni_vroundpd(vmm_dst, vmm_dst, rounding_type); + } + + h->uni_vaddpd(vmm_dst, vmm_dst, vmm_aux_0); + h->uni_vpsubq(vmm_dst, vmm_dst, vmm_aux_0); + } + } break; + default: IE_THROW() << "jit_sqrt_emitter doesn't support precision '" << exec_prc_ << "'"; + } +} + +size_t jit_sqrt_emitter::aux_vecs_count() const { + if (host_isa_ != x64::avx512_core && exec_prc_ == Precision::I64) { + return 1; + } +} + +void jit_sqrt_emitter::register_table_entries() { + if (host_isa_ != x64::avx512_core && exec_prc_ == Precision::I64) { + push_arg_entry_of_64("dMask", 0x433800002150d000, true); + } } /// Negate /// -jit_negative_emitter::jit_negative_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_negative_emitter::jit_negative_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} size_t jit_negative_emitter::get_inputs_num() const { return 1; } -std::set> jit_negative_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> 
jit_negative_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -1763,7 +2132,7 @@ void jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, con } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_negative_emitter doesn't support ISA '" << host_isa_ << "'"; } } @@ -1777,19 +2146,19 @@ void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, cons } /// ERF /// -jit_erf_emitter::jit_erf_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_erf_emitter::jit_erf_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_erf_emitter::jit_erf_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_erf_emitter::jit_erf_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } size_t jit_erf_emitter::get_inputs_num() const { return 1; } -std::set> jit_erf_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_erf_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -1803,7 +2172,7 @@ void jit_erf_emitter::emit_impl( } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_erf_emitter doesn't support ISA '" << host_isa_ << "'"; } } @@ -1814,14 +2183,13 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_mask = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); + Vmm vmm_aux_0 = Vmm(aux_vec_idxs[0]); + Vmm vmm_aux_1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = 
Vmm(aux_vec_idxs[2]); Vmm vmm_aux3 = Vmm(aux_vec_idxs[3]); Vmm vmm_aux4 = Vmm(aux_vec_idxs[4]); - auto compute_cmp_mask = [&](const Vmm &vmm_src, - const Xbyak::Operand &compare_operand, int cmp_predicate) { + auto compute_cmp_mask = [&](const Vmm &vmm_src, const Operand &compare_operand, int cmp_predicate) { if (host_isa_ == x64::avx512_core) { h->vcmpps(k_mask, vmm_src, compare_operand, cmp_predicate); } else { @@ -1829,7 +2197,7 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std } }; - auto blend_with_mask = [&](const Vmm &vmm_dst, const Xbyak::Operand &src) { + auto blend_with_mask = [&](const Vmm &vmm_dst, const Operand &src) { if (host_isa_ == x64::avx512_core) { h->vblendmps(vmm_dst | k_mask, vmm_dst, src); } else { @@ -1843,7 +2211,7 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->uni_vminps(vmm_src, vmm_src, table_val("exp_ln_flt_max_f")); h->uni_vmaxps(vmm_src, vmm_src, table_val("exp_ln_flt_min_f")); - h->uni_vmovups(vmm_aux1, vmm_src); + h->uni_vmovups(vmm_aux_1, vmm_src); // calculate exp(x) // fx = x * log2ef + 0.5 @@ -1858,7 +2226,7 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->uni_vmovups(vmm_src, vmm_aux2); // x = x - fx * ln2 - h->uni_vfnmadd231ps(vmm_aux1, vmm_aux2, table_val("ln2f")); + h->uni_vfnmadd231ps(vmm_aux_1, vmm_aux2, table_val("ln2f")); // compute 2^n h->uni_vcvtps2dq(vmm_aux2, vmm_src); @@ -1873,11 +2241,11 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std // compute polynomial h->uni_vmovups(vmm_src, table_val("ex_pol5")); - h->uni_vfmadd213ps(vmm_src, vmm_aux1, table_val("ex_pol4")); - h->uni_vfmadd213ps(vmm_src, vmm_aux1, table_val("ex_pol3")); - h->uni_vfmadd213ps(vmm_src, vmm_aux1, table_val("ex_pol2")); - h->uni_vfmadd213ps(vmm_src, vmm_aux1, table_val("ex_pol1")); - h->uni_vfmadd213ps(vmm_src, vmm_aux1, table_val("one")); + h->uni_vfmadd213ps(vmm_src, vmm_aux_1, table_val("ex_pol4")); + h->uni_vfmadd213ps(vmm_src, 
vmm_aux_1, table_val("ex_pol3")); + h->uni_vfmadd213ps(vmm_src, vmm_aux_1, table_val("ex_pol2")); + h->uni_vfmadd213ps(vmm_src, vmm_aux_1, table_val("ex_pol1")); + h->uni_vfmadd213ps(vmm_src, vmm_aux_1, table_val("one")); // y = y * 2^n h->uni_vmulps(vmm_src, vmm_src, vmm_aux2); }; @@ -1899,17 +2267,17 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->uni_vxorps(vmm_src, vmm_src, table_val("sign_mask")); // get sign - h->uni_vmovups(vmm_aux0, vmm_aux3); - h->uni_vandps(vmm_aux0, vmm_aux0, table_val("sign_mask")); + h->uni_vmovups(vmm_aux_0, vmm_aux3); + h->uni_vandps(vmm_aux_0, vmm_aux_0, table_val("sign_mask")); // abs(x) - h->uni_vmovups(vmm_aux1, vmm_aux3); + h->uni_vmovups(vmm_aux_1, vmm_aux3); // compute abs(x) = _mm_and_ps(x, 01111..111)); - abs_compute_vector_fwd(vmm_aux1); + abs_compute_vector_fwd(vmm_aux_1); // t = 1 / (p*x + 1) h->uni_vmovups(vmm_aux2, table_val("approx_const")); - h->uni_vfmadd213ps(vmm_aux2, vmm_aux1, table_val("one")); + h->uni_vfmadd213ps(vmm_aux2, vmm_aux_1, table_val("one")); h->uni_vmovups(vmm_aux4, table_val("one")); h->uni_vdivps(vmm_aux4, vmm_aux4, vmm_aux2); @@ -1917,15 +2285,15 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->uni_vmulps(vmm_src, vmm_src, vmm_aux4); // compute polynomialial r - h->uni_vmovups(vmm_aux1, table_val("erf_pol5")); - h->uni_vfmadd213ps(vmm_aux1, vmm_aux4, table_val("erf_pol4")); - h->uni_vfmadd213ps(vmm_aux1, vmm_aux4, table_val("erf_pol3")); - h->uni_vfmadd213ps(vmm_aux1, vmm_aux4, table_val("erf_pol2")); - h->uni_vfmadd213ps(vmm_aux1, vmm_aux4, table_val("erf_pol1")); + h->uni_vmovups(vmm_aux_1, table_val("erf_pol5")); + h->uni_vfmadd213ps(vmm_aux_1, vmm_aux4, table_val("erf_pol4")); + h->uni_vfmadd213ps(vmm_aux_1, vmm_aux4, table_val("erf_pol3")); + h->uni_vfmadd213ps(vmm_aux_1, vmm_aux4, table_val("erf_pol2")); + h->uni_vfmadd213ps(vmm_aux_1, vmm_aux4, table_val("erf_pol1")); // erf = sign * (1 - r * t * exp(-x*x)) - 
h->uni_vfmadd213ps(vmm_src, vmm_aux1, table_val("one")); - h->uni_vxorps(vmm_dst, vmm_src, vmm_aux0); + h->uni_vfmadd213ps(vmm_src, vmm_aux_1, table_val("one")); + h->uni_vxorps(vmm_dst, vmm_src, vmm_aux_0); } void jit_erf_emitter::register_table_entries() { @@ -1962,18 +2330,18 @@ size_t jit_erf_emitter::aux_vecs_count() const { } /// SOFT SIGN /// -jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_soft_sign_emitter::get_inputs_num() const { return 1; } -std::set> jit_soft_sign_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_soft_sign_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -1985,7 +2353,7 @@ void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, co } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_soft_sign_emitter doesn't support ISA '" << host_isa_ << "'"; } } @@ -2181,14 +2549,14 @@ void jit_is_nan_emitter::register_table_entries() { } /// SELECT /// -jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, 
const Precision& exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) +jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const Precision& exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_select_emitter::get_inputs_num() const { return 3; } -std::set> jit_select_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_select_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } @@ -2209,16 +2577,16 @@ void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const } else if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else { - assert(!"unsupported isa"); + IE_THROW() << "jit_select_emitter doesn't support ISA '" << host_isa_ << "'"; } } -template +template void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_cond = Vmm(in_vec_idxs[0]); - Vmm vmm_src0 = Vmm(in_vec_idxs[1]); - Vmm vmm_src1 = Vmm(in_vec_idxs[2]); + Vmm vmm_src_0 = Vmm(in_vec_idxs[1]); + Vmm vmm_src_1 = Vmm(in_vec_idxs[2]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); if (isa == x64::sse41) { @@ -2229,18 +2597,18 @@ void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const if (vmm_mask.getIdx() != vmm_cond.getIdx()) { h->uni_vmovups(vmm_mask, vmm_cond); } - if (vmm_src1.getIdx() != vmm_dst.getIdx()) { - h->uni_vmovups(vmm_dst, vmm_src1); + if (vmm_src_1.getIdx() != vmm_dst.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src_1); } - h->uni_vblendvps(vmm_dst, vmm_dst, vmm_src0, vmm_mask); + h->uni_vblendvps(vmm_dst, vmm_dst, vmm_src_0, vmm_mask); } else if (isa == x64::avx2) { Vmm vmm_zero = Vmm(aux_vec_idxs[0]); h->uni_vpxor(vmm_zero, vmm_zero, vmm_zero); h->uni_vcmpps(vmm_cond, vmm_cond, vmm_zero, 0x4); - 
h->uni_vblendvps(vmm_dst, vmm_src1, vmm_src0, vmm_cond); + h->uni_vblendvps(vmm_dst, vmm_src_1, vmm_src_0, vmm_cond); } else { h->vptestmd(k_mask, vmm_cond, vmm_cond); - h->vblendmps(vmm_dst | k_mask, vmm_src1, vmm_src0); + h->vblendmps(vmm_dst | k_mask, vmm_src_1, vmm_src_0); } } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.hpp index 5c00e4584b4274..858e6ee5dd8edd 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_eltwise_emitters.hpp @@ -12,11 +12,11 @@ namespace intel_cpu { class jit_add_emitter : public jit_emitter { public: jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -28,11 +28,11 @@ class jit_add_emitter : public jit_emitter { class jit_mul_add_emitter : public jit_emitter { public: jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static 
std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -47,11 +47,11 @@ class jit_mul_add_emitter : public jit_emitter { class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -64,11 +64,13 @@ class jit_subtract_emitter : public jit_emitter { class jit_multiply_emitter : public jit_emitter { public: jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + + size_t aux_vecs_count() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -81,30 +83,34 
@@ class jit_multiply_emitter : public jit_emitter { class jit_divide_emitter : public jit_emitter { public: jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); + jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + + bool second_is_float = false; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + size_t aux_vecs_count() const override; + + void register_table_entries() override; }; class jit_floor_emitter : public jit_emitter { public: jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> 
get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -116,12 +122,12 @@ class jit_floor_emitter : public jit_emitter { class jit_ceiling_emitter : public jit_emitter { public: jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -133,12 +139,12 @@ class jit_ceiling_emitter : public jit_emitter { class jit_floor_mod_emitter : public jit_emitter { public: jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> 
get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -146,18 +152,20 @@ class jit_floor_mod_emitter : public jit_emitter { template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; size_t aux_vecs_count() const override; + + void register_table_entries() override; }; class jit_mod_emitter : public jit_emitter { public: jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -171,65 +179,70 @@ class jit_mod_emitter : public jit_emitter { class jit_maximum_emitter : public jit_emitter { public: jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> 
get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + size_t aux_vecs_count() const override; }; class jit_minimum_emitter : public jit_emitter { public: jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + size_t aux_vecs_count() const override; }; class jit_squared_difference_emitter : public jit_emitter { public: jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); + jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + const InferenceEngine::Precision &exec_prc = 
InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + size_t aux_vecs_count() const override; }; class jit_power_dynamic_emitter : public jit_emitter { public: jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -242,12 +255,12 @@ class jit_power_dynamic_emitter : public jit_emitter { class jit_equal_emitter : public jit_emitter { public: jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = 
InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -256,19 +269,18 @@ class jit_equal_emitter : public jit_emitter { void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; void register_table_entries() override; - size_t aux_vecs_count() const override; }; class jit_not_equal_emitter : public jit_emitter { public: jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -284,12 +296,12 @@ class jit_not_equal_emitter : public jit_emitter { class jit_greater_emitter : public jit_emitter { public: jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); 
jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -298,19 +310,18 @@ class jit_greater_emitter : public jit_emitter { void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; void register_table_entries() override; - size_t aux_vecs_count() const override; }; class jit_greater_equal_emitter : public jit_emitter { public: jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -326,12 +337,12 @@ class jit_greater_equal_emitter : public jit_emitter { class jit_less_emitter : public jit_emitter { public: jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -340,20 +351,19 @@ class jit_less_emitter : public jit_emitter { void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; void register_table_entries() override; - size_t aux_vecs_count() const override; }; class jit_less_equal_emitter : public jit_emitter { public: jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -369,12 +379,12 @@ class 
jit_less_equal_emitter : public jit_emitter { class jit_logical_and_emitter : public jit_emitter { public: jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -390,12 +400,12 @@ class jit_logical_and_emitter : public jit_emitter { class jit_logical_or_emitter : public jit_emitter { public: jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -411,12 
+421,12 @@ class jit_logical_or_emitter : public jit_emitter { class jit_logical_xor_emitter : public jit_emitter { public: jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -431,12 +441,12 @@ class jit_logical_xor_emitter : public jit_emitter { class jit_logical_not_emitter : public jit_emitter { public: jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const 
override; @@ -452,12 +462,12 @@ class jit_power_static_emitter : public jit_emitter { public: jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, float inpPower, float inpScale, float inpShift, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: @@ -477,12 +487,12 @@ class jit_power_static_emitter : public jit_emitter { class jit_prelu_emitter : public jit_emitter { public: jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -496,27 +506,33 @@ class jit_prelu_emitter : public jit_emitter { class 
jit_sqrt_emitter : public jit_emitter { public: jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + + RoundType rounding_type = RoundType::nearest; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + size_t aux_vecs_count() const override; + + void register_table_entries() override; }; class jit_negative_emitter : public jit_emitter { public: jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in, const std::vector& out) const override; @@ -528,13 +544,13 @@ class jit_negative_emitter : public jit_emitter { class jit_erf_emitter : public jit_emitter { public: jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t 
host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl( @@ -551,12 +567,12 @@ class jit_erf_emitter : public jit_emitter { class jit_soft_sign_emitter : public jit_emitter { public: jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; @@ -579,7 +595,7 @@ class jit_is_finite_emitter : public jit_emitter { } size_t get_inputs_num() const override { return 1; }; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { return {{element::f32}}; 
} @@ -607,7 +623,7 @@ class jit_is_inf_emitter : public jit_emitter { } size_t get_inputs_num() const override { return 1; }; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { return {{element::f32}}; } @@ -637,7 +653,7 @@ class jit_is_nan_emitter : public jit_emitter { } size_t get_inputs_num() const override { return 1; } - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { return {{element::f32}}; } @@ -655,12 +671,12 @@ class jit_is_nan_emitter : public jit_emitter { class jit_select_emitter : public jit_emitter { public: jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + const InferenceEngine::Precision &exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); size_t aux_vecs_count() const override; private: diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp index f727f8d9d1d7a5..fa18f576dcc470 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp 
@@ -52,7 +52,7 @@ emitter_in_out_map jit_emitter::get_in_out_type() const { size_t jit_emitter::aux_gprs_count() const { // We need one gpr to load table address - return entry_map_.empty() ? 0 : 1; + return entry_map_.empty() && entry_map_64.empty() ? 0 : 1; } std::set> jit_emitter::get_supported_precisions(const std::shared_ptr& node) { @@ -133,7 +133,7 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std if (aux_gpr_idxs.size() < aux_gprs_count()) IE_THROW() << "Failed to allocate required number of general-purpose registers"; - if (!entry_map_.empty()) { + if (!entry_map_.empty() || !entry_map_64.empty()) { // last aux_gpr_idx is for p_table, we can use aux_gpr_idxs from idx 0 for other purpose p_table = Reg64(aux_gpr_idxs[aux_gprs_count() - 1]); aux_gpr_idxs.erase(aux_gpr_idxs.end() - 1); @@ -149,8 +149,9 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std push_vec(h->ptr[h->rsp + i * get_vec_length()], preserved_vec_idxs[i]); } - if (!entry_map_.empty()) + if (!entry_map_.empty() || !entry_map_64.empty()) { load_table_addr(); + } } @@ -187,6 +188,13 @@ void jit_emitter::emit_data() const { for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) h->dd(te.val); } + for (auto it = entry_map_64.begin(); it != entry_map_64.end(); it++) { + const auto &te = (*it).second; // get map entry for a given key + const auto len = te.bcast ? get_vec_length() : sizeof(table_entry_val_t_64); + for (size_t d = 0; d < len; d += sizeof(table_entry_val_t_64)) { + h->dq(te.val); + } + } } void jit_emitter::prepare_table() { @@ -202,6 +210,11 @@ void jit_emitter::prepare_table() { te.off = off; off += te.bcast ? get_vec_length() : sizeof(table_entry_val_t); } + for (auto it = entry_map_64.begin(); it != entry_map_64.end(); it++) { + auto &te = (*it).second; + te.off = off; + off += te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t_64); + } } void jit_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp index d2e3a33b914406..7b42a0dcae0b95 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp @@ -36,7 +36,7 @@ class jit_emitter : public ov::snippets::Emitter { k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well } - jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : Emitter(n), h(host), host_isa_(host_isa), exec_prc_(exec_prc), l_table (new Xbyak::Label()), in_out_type_(in_out_type) { k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well @@ -55,7 +55,14 @@ class jit_emitter : public ov::snippets::Emitter { * Precisions are ordered, the first bigger bitness precision with the same type will be selected. * Empty collection means the emitter supports any input precisions. 
*/ - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + + enum RoundType { + nearest = 0, + floor, + ceil, + truncation + }; protected: virtual size_t aux_gprs_count() const; @@ -75,6 +82,7 @@ class jit_emitter : public ov::snippets::Emitter { // we accept only 32bit hexadecimal table values to avoid any rounding using table_entry_val_t = uint32_t; + using table_entry_val_t_64 = uint64_t; using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table using table_entry_bcast_t = bool; // true => bcast value @@ -82,11 +90,20 @@ class jit_emitter : public ov::snippets::Emitter { table_entry_val_t val; table_entry_bcast_t bcast; }; + struct table_entry_t_64 { + table_entry_val_t_64 val; + table_entry_bcast_t bcast; + }; struct mapped_table_entry_t { table_entry_offset_t off; table_entry_val_t val; table_entry_bcast_t bcast; }; + struct mapped_table_entry_t_64 { + table_entry_offset_t off; + table_entry_val_t_64 val; + table_entry_bcast_t bcast; + }; mutable Xbyak::Reg64 p_table; mutable std::shared_ptr l_table; @@ -118,16 +135,29 @@ class jit_emitter : public ov::snippets::Emitter { return h->ptr[p_table + off]; } + Xbyak::Address table_val_64(std::string key, size_t key_off_val_shift = 0) const { + auto off = table_off_64(key, key_off_val_shift); + return h->ptr[p_table + off]; + } + using table_t = std::multimap; + using table_t_64 = std::multimap; using mapped_table_t = std::multimap; + using mapped_table_t_64 = std::multimap; mapped_table_t entry_map_; + mapped_table_t_64 entry_map_64; void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) { mapped_table_entry_t te {0, val, broadcast}; entry_map_.insert(std::make_pair(key, te)); } + void push_arg_entry_of_64(const std::string key, const table_entry_val_t_64 val, const bool broadcast) { + mapped_table_entry_t_64 te {0, val, broadcast}; + 
entry_map_64.insert(std::make_pair(key, te)); + } + void push_entries_of(const table_t &t) { for (auto it = t.begin(); it != t.end(); it++) { auto key = (*it).first; @@ -136,6 +166,14 @@ class jit_emitter : public ov::snippets::Emitter { } } + void push_entries_of(const table_t_64 &t) { + for (auto it = t.begin(); it != t.end(); it++) { + auto key = (*it).first; + auto te = (*it).second; // copy values from table + push_arg_entry_of_64(key, te.val, te.bcast); + } + } + private: mutable std::vector preserved_vec_idxs; mutable std::vector preserved_gpr_idxs; @@ -153,6 +191,18 @@ class jit_emitter : public ov::snippets::Emitter { const auto scale = te.bcast ? get_vec_length() : sizeof(table_entry_val_t); return te.off + key_off_val_shift * scale; } + + size_t table_off_64(std::string& key, size_t key_off_val_shift = 0) const { + // assumption: all table entries sharing the same key also + // share their broadcast property + // TODO: enforce through data structure + const auto it = entry_map_64.find(key); // search an entry for a key + assert(it != entry_map_64.end()); + const auto &te = (*it).second; + const auto scale = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t_64); + return te.off + key_off_val_shift * scale; + } + virtual void validate_arguments(const std::vector&, const std::vector&) const {} }; diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp index 1d5cb7946eecba..2837219529917d 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp @@ -479,6 +479,10 @@ ScalarEmitter::ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: value = ov::as_type_ptr(n)->cast_vector()[0]; break; } + case element::i64: { + value = ov::as_type_ptr(n)->cast_vector()[0]; + break; + } case element::f32: { value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr(n)->cast_vector()[0]); break; diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index b65b582e4384b2..cf38459a4da930 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -45,7 +45,6 @@ #include "memory_desc/cpu_memory_desc_utils.h" #include -#include #include #include #include @@ -306,7 +305,10 @@ void Graph::Replicate(const CNNNetwork &network) { // change precision for input/output nodes to avoid extra data conversion when set input/output blobs // also we need to change input/output precisions for consumers/producers to avoid inserting reorder for (auto &input : inputNodesMap) { - const auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision()); + auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision()); + if (!getConfig().enableNativeI64 && precToSet == Precision::I64) { + precToSet = Precision::I32; + } input.second->setOriginalOutputPrecisionAtPort(0, precToSet); const auto childEdges = input.second->getChildEdgesAtPort(0); for (size_t i = 0; i < childEdges.size(); i++) { @@ -319,7 +321,10 @@ void 
Graph::Replicate(const CNNNetwork &network) { } for (auto &output : outputNodesMap) { - const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision()); + auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision()); + if (!getConfig().enableNativeI64 && precToSet == Precision::I64) { + precToSet = Precision::I32; + } output.second->setOriginalInputPrecisionAtPort(0, precToSet); const auto parentEdges = output.second->getParentEdgesAtPort(0); for (size_t i = 0; i < parentEdges.size(); i++) { @@ -977,7 +982,7 @@ void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob:: // todo: make sure 'name' exists in this map... if (_normalizePreprocMap.find(name) != _normalizePreprocMap.end()) { - if (inTensorDesc.getPrecision() == InferenceEngine::Precision::FP32) { + if (inTensorDesc.getPrecision() == Precision::FP32) { _normalizePreprocMap[name].NormalizeImage(outDims, reinterpret_cast(inter_data_ptr), inTensorDesc.getLayout()); } else { @@ -1424,16 +1429,16 @@ void Graph::SortTopologically() { } } -void Graph::GetPerfData(std::map &perfMap) const { +void Graph::GetPerfData(std::map &perfMap) const { unsigned i = 0; - std::function &, const NodePtr&)> - getPerfMapFor = [&](std::map &perfMap, const NodePtr& node) { - InferenceEngine::InferenceEngineProfileInfo &pc = perfMap[node->getName()]; + std::function &, const NodePtr&)> + getPerfMapFor = [&](std::map &perfMap, const NodePtr& node) { + InferenceEngineProfileInfo &pc = perfMap[node->getName()]; pc.execution_index = i++; // TODO: Why time counter is signed? pc.cpu_uSec = pc.realTime_uSec = (long long) node->PerfCounter().avg(); - pc.status = pc.cpu_uSec > 0 ? InferenceEngine::InferenceEngineProfileInfo::EXECUTED - : InferenceEngine::InferenceEngineProfileInfo::NOT_RUN; + pc.status = pc.cpu_uSec > 0 ? 
InferenceEngineProfileInfo::EXECUTED + : InferenceEngineProfileInfo::NOT_RUN; std::string pdType = node->getPrimitiveDescriptorType(); size_t typeLen = sizeof(pc.exec_type) / sizeof(pc.exec_type[0]); pdType.copy(pc.exec_type, typeLen, 0); diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 8952b09ea6f9af..f52779ad33d1e7 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -18,7 +18,6 @@ #include "nodes/mvn.h" #include "nodes/transpose.h" #include "nodes/interpolate.h" -#include "nodes/reduce.h" #include "nodes/input.h" #include "nodes/rnn.h" #include "nodes/common/cpu_convert.h" diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index cdd343c126277c..685daa25ead8c4 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -38,7 +38,6 @@ #include "nodes/memory.hpp" #include "nodes/mvn.h" #include "nodes/normalize.h" -#include "nodes/reduce.h" #include "nodes/tensoriterator.h" #include "nodes/scatter_update.h" #include "nodes/interpolate.h" @@ -52,7 +51,7 @@ #include "nodes/common/cpu_memcpy.h" #include "utils/rt_info/memory_formats_attribute.hpp" -#include +#include #include #include @@ -80,7 +79,7 @@ Node::NodesFactory & Node::factory() { return factoryInstance; } -Node::Node(const std::shared_ptr& op, +Node::Node(const std::shared_ptr& op, const GraphContext::CPtr ctx, const ShapeInferFactory& shapeInferFactory) : selectedPrimitiveDescriptorIndex(-1), @@ -95,8 +94,6 @@ Node::Node(const std::shared_ptr& op, typeStr(op->get_type_name()), type(TypeFromName(op->get_type_name())), profiling(op->get_friendly_name()) { - const std::string errorPrefix = "Ngraph operation " + std::string(op->get_type_name()) + " with name " + op->get_friendly_name(); - for (size_t i = 0; i < op->get_input_size(); i++) { const auto &shape = op->get_input_partial_shape(i); if (shape.rank().is_dynamic()) { 
@@ -104,11 +101,11 @@ Node::Node(const std::shared_ptr& op, } bool isScalar = shape.rank().get_length() == 0; - inputShapes.emplace_back(isScalar ? ngraph::PartialShape{1} : shape); + inputShapes.emplace_back(isScalar ? ov::PartialShape{1} : shape); originalInputPrecisions.emplace_back(details::convertPrecision(op->get_input_element_type(i))); } - if (typeStr != "Result" && typeStr != "Assign") { + if (type != Type::Output && type != Type::MemoryOutput) { if (op->get_output_size() == 0) { IE_THROW() << "Node with type '" << typeStr << "' and name '" << name << "' does not have any outputs."; } @@ -119,11 +116,10 @@ Node::Node(const std::shared_ptr& op, } bool isScalar = shape.rank().get_length() == 0; - outputShapes.emplace_back(isScalar ? ngraph::PartialShape{1} : shape); + outputShapes.emplace_back(isScalar ? ov::PartialShape{1} : shape); originalOutputPrecisions.emplace_back(details::convertPrecision(op->get_output_element_type(i))); } } - isDynamic = std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape){ return shape.isDynamic(); }) || std::any_of(outputShapes.begin(), outputShapes.end(), [](const Shape& shape){ return shape.isDynamic(); }); @@ -1274,7 +1270,7 @@ InferenceEngine::Precision Node::getRuntimePrecision() const { return runtimePrecision; } -Node* Node::NodesFactory::create(const std::shared_ptr& op, const GraphContext::CPtr context) { +Node* Node::NodesFactory::create(const std::shared_ptr& op, const GraphContext::CPtr& context) { // getExceptionDescWithoutStatus removes redundant information from the exception message. For instance, the NotImplemented // exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason. 
// An example for gather node: diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 4cfe9c7d708660..cf49b6e1ebf0b2 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -22,8 +22,6 @@ #include "dnnl_scratch_pad.h" #include #include "utils/ngraph_utils.hpp" -#include -#include #include #include #include "cpu_types.h" @@ -41,6 +39,8 @@ #include "nodes/executors/mvn_list.hpp" #include "nodes/executors/executor.hpp" +#define THROW_CPU_NODE_ERR IE_THROW() << getTypeStr() << " node with name '" << getName() << "' " + namespace ov { namespace intel_cpu { @@ -436,13 +436,13 @@ class Node { return originalOutputPrecisions; } - InferenceEngine::Precision getOriginalInputPrecisionAtPort(size_t port) const { + const InferenceEngine::Precision &getOriginalInputPrecisionAtPort(size_t port) const { if (originalInputPrecisions.size() <= port) { IE_THROW() << "Incorrect input port number for node " << getName(); } return originalInputPrecisions[port]; } - InferenceEngine::Precision getOriginalOutputPrecisionAtPort(size_t port) const { + const InferenceEngine::Precision &getOriginalOutputPrecisionAtPort(size_t port) const { if (originalOutputPrecisions.size() <= port) { IE_THROW() << "Incorrect output port number for node " << getName(); } @@ -584,7 +584,7 @@ class Node { std::string originalLayers; // contains names of the original layers separated by comma - Node(const std::shared_ptr& op, const GraphContext::CPtr ctx, const ShapeInferFactory& shapeInferFactory); + Node(const std::shared_ptr& op, const GraphContext::CPtr ctx, const ShapeInferFactory& shapeInferFactory); Node(const std::string& type, const std::string& name, const GraphContext::CPtr ctx); int selectedPrimitiveDescriptorIndex = -1; @@ -740,17 +740,17 @@ constexpr uint64_t PortMask(T... 
rest) { } class Node::NodesFactory : public openvino::cc::Factory& op, + Node*(const std::shared_ptr& op, const GraphContext::CPtr)> { public: NodesFactory(); - Node* create(const std::shared_ptr& op, const GraphContext::CPtr context); + Node* create(const std::shared_ptr& op, const GraphContext::CPtr& context); }; template struct NodeImpl : public NodeType { - NodeImpl(const std::shared_ptr& op, const GraphContext::CPtr context) + NodeImpl(const std::shared_ptr& op, const GraphContext::CPtr context) : NodeType(op, context) { NodeType::perfCounters().template buildClassCounters(NameFromType(NodeType::getType())); } diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.cpp b/src/plugins/intel_cpu/src/nodes/broadcast.cpp index 2293e36850aada..26f889d9f04e71 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.cpp +++ b/src/plugins/intel_cpu/src/nodes/broadcast.cpp @@ -2,17 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include "utils/bfloat16.hpp" -#include #include "broadcast.h" -#include -#include + #include "common/cpu_memcpy.h" +#include "ie_parallel.hpp" +#include +#include using namespace InferenceEngine; @@ -22,12 +17,12 @@ namespace node { bool Broadcast::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!ov::is_type(op)) { + if (!ov::is_type(op)) { errorMessage = "Only Broadcast operations from opset1 are supported."; return false; } - if (!one_of(ov::as_type_ptr(op)->get_broadcast_spec().m_type, - ov::op::AutoBroadcastType::NUMPY, ov::op::AutoBroadcastType::EXPLICIT)) { + if (!one_of(ov::as_type_ptr(op)->get_broadcast_spec().m_type, + op::AutoBroadcastType::NUMPY, op::AutoBroadcastType::EXPLICIT)) { errorMessage = "Only NUMPY and EXPLICIT broadcast types are supported."; return false; } @@ -37,9 +32,9 @@ bool Broadcast::isSupportedOperation(const std::shared_ptr& op, return false; } if (!isDynamicNgraphNode(op) && - 
(!ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)) || + (!ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)) || (op->get_input_size() > AXES_MAPPING_IDX && - !ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))))) { + !ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))))) { errorMessage = "Only constant target shapes and axis mapping inputs are supported for static shapes."; return false; } @@ -50,7 +45,7 @@ bool Broadcast::isSupportedOperation(const std::shared_ptr& op, } Broadcast::Broadcast(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, NgraphShapeInferFactory(op, PortMask(TARGET_SHAPE_IDX, AXES_MAPPING_IDX))) { + : Node(op, context, NgraphShapeInferFactory(op, PortMask(TARGET_SHAPE_IDX, AXES_MAPPING_IDX))) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; @@ -62,10 +57,10 @@ Broadcast::Broadcast(const std::shared_ptr& op, const GraphContext::CP if (op->get_output_size() == 0) IE_THROW() << errorPrefix << "has no output edges."; - auto broadcastOp = ov::as_type_ptr(op); - if (broadcastOp->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY) { + auto broadcastOp = ov::as_type_ptr(op); + if (broadcastOp->get_broadcast_spec().m_type == op::AutoBroadcastType::NUMPY) { broadcastType = NUMPY; - } else if (broadcastOp->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::EXPLICIT) { + } else if (broadcastOp->get_broadcast_spec().m_type == op::AutoBroadcastType::EXPLICIT) { if (op->get_input_size() <= AXES_MAPPING_IDX) IE_THROW() << errorPrefix << " and EXPLICIT mode must have tree input edges: " << getParentEdges().size(); broadcastType = EXPLICIT; @@ -73,14 +68,16 @@ Broadcast::Broadcast(const std::shared_ptr& op, const GraphContext::CP IE_THROW() << errorPrefix << "has unexpected broadcast type: " << broadcastOp->get_broadcast_spec().m_type; } - if (ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX))) { + if (auto shapeOp = 
ov::as_type(op->get_input_node_ptr(TARGET_SHAPE_IDX))) { constMap[TARGET_SHAPE_IDX] = true; - targetShape = (ov::as_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)))->get_vector(); + targetShape = shapeOp->cast_vector(); } - if (broadcastType == EXPLICIT && - ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))) { - constMap[AXES_MAPPING_IDX] = true; - axesMapping = ov::as_type(op->get_input_node_ptr(AXES_MAPPING_IDX))->get_vector(); + + if (broadcastType == EXPLICIT) { + if (auto axesOp = ov::as_type(op->get_input_node_ptr(AXES_MAPPING_IDX))) { + constMap[AXES_MAPPING_IDX] = true; + axesMapping = axesOp->cast_vector(); + } } } @@ -117,13 +114,29 @@ bool Broadcast::needPrepareParams() const { void Broadcast::prepareParams() { if (!constMap[TARGET_SHAPE_IDX]) { const auto& targetShapeMem = getParentEdgesAtPort(TARGET_SHAPE_IDX)[0]->getMemory(); - const int32_t* targetShapeData = reinterpret_cast(targetShapeMem.getData()); - targetShape.assign(targetShapeData, targetShapeData + targetShapeMem.getStaticDims()[0]); + if (targetShapeMem.getDataType() == dnnl::memory::data_type::s64) { + const auto *targetShapeData = reinterpret_cast(targetShapeMem.getData()); + targetShape.assign(targetShapeData, targetShapeData + targetShapeMem.getStaticDims()[0]); + } else if (targetShapeMem.getDataType() == dnnl::memory::data_type::s32) { + const auto *targetShapeData = reinterpret_cast(targetShapeMem.getData()); + targetShape.assign(targetShapeData, targetShapeData + targetShapeMem.getStaticDims()[0]); + } else { + IE_THROW() << errorPrefix << " does not support precision '" << int(targetShapeMem.getDataType()) + << "' for the Target shape input."; + } } if (broadcastType == EXPLICIT && !constMap[AXES_MAPPING_IDX]) { const auto& axesMapMem = getParentEdgesAtPort(AXES_MAPPING_IDX)[0]->getMemory(); - const int32_t* axesMapData = reinterpret_cast(axesMapMem.getData()); - axesMapping.assign(axesMapData, axesMapData + axesMapMem.getStaticDims()[0]); + if (axesMapMem.getDataType() == 
dnnl::memory::data_type::s64) { + const auto axesMapData = reinterpret_cast(axesMapMem.getData()); + axesMapping.assign(axesMapData, axesMapData + axesMapMem.getStaticDims()[0]); + } else if (axesMapMem.getDataType() == dnnl::memory::data_type::s32) { + const auto axesMapData = reinterpret_cast(axesMapMem.getData()); + axesMapping.assign(axesMapData, axesMapData + axesMapMem.getStaticDims()[0]); + } else { + IE_THROW() << errorPrefix << " does not support precision '" << int(axesMapMem.getDataType()) + << "' for the Axes mapping input."; + } } const auto& srcDims = getParentEdgesAtPort(INPUT_DATA_IDX)[0]->getMemory().getShape().getStaticDims(); @@ -162,22 +175,48 @@ bool Broadcast::needShapeInfer() const { if (targetShape.empty()) { return true; } - const int32_t* targetShapeData = reinterpret_cast(getParentEdgesAtPort(TARGET_SHAPE_IDX)[0]->getMemory().getData()); - for (size_t i = 0lu; i < targetShape.size(); i++) { - if (targetShape[i] != targetShapeData[i]) { - return true; + const auto& targetShapeMem = getParentEdgesAtPort(TARGET_SHAPE_IDX)[0]->getMemory(); + if (targetShapeMem.getDataType() == dnnl::memory::data_type::s64) { + const auto *targetShapeData = reinterpret_cast(targetShapeMem.getData()); + for (size_t i = 0lu; i < targetShape.size(); i++) { + if (targetShape[i] != targetShapeData[i]) { + return true; + } + } + } else if (targetShapeMem.getDataType() == dnnl::memory::data_type::s32) { + const auto *targetShapeData = reinterpret_cast(targetShapeMem.getData()); + for (size_t i = 0lu; i < targetShape.size(); i++) { + if (targetShape[i] != targetShapeData[i]) { + return true; + } } + } else { + IE_THROW() << errorPrefix << " does not support precision '" << int(targetShapeMem.getDataType()) + << "' for the Target shape input."; } } if (broadcastType == EXPLICIT && !constMap[AXES_MAPPING_IDX]) { if (axesMapping.empty()) { return true; } - const int32_t* axesMappingData = reinterpret_cast(getParentEdgesAtPort(AXES_MAPPING_IDX)[0]->getMemory().getData()); 
- for (size_t i = 0lu; i < axesMapping.size(); i++) { - if (axesMapping[i] != axesMappingData[i]) { - return true; + const auto& axesMapMem = getParentEdgesAtPort(AXES_MAPPING_IDX)[0]->getMemory(); + if (axesMapMem.getDataType() == dnnl::memory::data_type::s64) { + const auto *axesMappingData = reinterpret_cast(axesMapMem.getData()); + for (size_t i = 0lu; i < axesMapping.size(); i++) { + if (axesMapping[i] != axesMappingData[i]) { + return true; + } + } + } else if (axesMapMem.getDataType() == dnnl::memory::data_type::s32) { + const auto *axesMappingData = reinterpret_cast(axesMapMem.getData()); + for (size_t i = 0lu; i < axesMapping.size(); i++) { + if (axesMapping[i] != axesMappingData[i]) { + return true; + } } + } else { + IE_THROW() << errorPrefix << " does not support precision '" << int(axesMapMem.getDataType()) + << "' for the Axes mapping input."; } } needPrepareParamsVar = false; diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.h b/src/plugins/intel_cpu/src/nodes/broadcast.h index 4ab3201365e05f..34ac289eef2c85 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.h +++ b/src/plugins/intel_cpu/src/nodes/broadcast.h @@ -6,10 +6,6 @@ #include "common/tile_broadcast_utils.h" -#include -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -45,8 +41,8 @@ class Broadcast : public Node, public TileBroadcastCommon { static constexpr size_t TARGET_SHAPE_IDX = 1; static constexpr size_t AXES_MAPPING_IDX = 2; - std::vector targetShape; - std::vector axesMapping; + VectorDims targetShape; + VectorDims axesMapping; std::string errorPrefix; }; diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index d8322c709e2288..f4bd26dbfdafd2 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -224,7 +224,12 @@ const std::tuple & Range::fit(const Precision & prec) { IE_THROW() << "Unsupported 
precision"; } std::get<0>(_range) = static_cast(std::max(static_cast(std::get<0>(_range)), lbound)); - std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), ubound)); + + auto v1 = static_cast(std::min(static_cast(std::get<1>(_range)), ubound)); + if (v1 < U(0)) { // WA for convertion double->int64: 9.2233720368547758e+18 -> -9223372036854775808 + v1 -= U(1); + } + std::get<1>(_range) = v1; } else { int64_t lbound; uint64_t ubound; diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp index e7921c24abd8e0..9f3c031714b4fa 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp @@ -92,8 +92,12 @@ bool TileBroadcastCommon::canBeExecutedInNSPCLayout(VectorDims srcBlockedDims, V std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node) { std::vector supportedPrimitiveDescriptors; - auto precision = node->getOriginalInputPrecisionAtPort(0); + const auto &precision = node->getOriginalInputPrecisionAtPort(0); auto dataType = DnnlExtensionUtils::IEPrecisionToDataType(precision); + auto secPrecision = node->getOriginalInputPrecisionAtPort(1); + if (!one_of(secPrecision, Precision::I32, Precision::I64)) { + secPrecision = Precision::I32; + } const auto& srcDims = node->getInputShapeAtPort(0).getDims(); const auto& inDataShape = node->getInputShapeAtPort(0); @@ -109,11 +113,15 @@ std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node) config.inConfs[0].constant(constMap[0]); config.inConfs[1].inPlace(-1); config.inConfs[1].constant(constMap[1]); - config.inConfs[1].setMemDesc(std::make_shared(Precision::I32, node->getInputShapeAtPort(1))); + config.inConfs[1].setMemDesc(std::make_shared(secPrecision, node->getInputShapeAtPort(1))); if (config.inConfs.size() == 3) { + auto thrdPrecision = node->getOriginalInputPrecisionAtPort(2); + if 
(!one_of(thrdPrecision, Precision::I32, Precision::I64)) { + thrdPrecision = Precision::I32; + } config.inConfs[2].inPlace(-1); config.inConfs[2].constant(constMap[2]); - config.inConfs[2].setMemDesc(std::make_shared(Precision::I32, node->getInputShapeAtPort(2))); + config.inConfs[2].setMemDesc(std::make_shared(thrdPrecision, node->getInputShapeAtPort(2))); } config.outConfs.resize(node->getChildEdges().size()); diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 633f40cea00fa5..7a306f35c82cc6 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -3,26 +3,11 @@ // #include "concat.h" - -#include -#include -#include -#include - -#include -#include -#include -#include #include "ie_parallel.hpp" -#include "conv.h" -#include "fake_quantize.h" -#include "pooling.h" -#include "eltwise.h" -#include #include "common/cpu_memcpy.h" -#include "common/blocked_desc_creator.h" -#include +#include #include + using namespace dnnl; using namespace InferenceEngine; @@ -37,10 +22,9 @@ bool Concat::isExecutable() const { return !isInPlace() && !hasEmptyOutputTensors(); } -bool Concat::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Concat::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto concatOp = ngraph::as_type_ptr(op); - if (!concatOp) { + if (op->get_type_info() != op::v0::Concat::get_type_info_static()) { errorMessage = "Node is not an instance of the Concat operation."; return false; } @@ -50,7 +34,7 @@ bool Concat::isSupportedOperation(const std::shared_ptr& op, return true; } -Concat::Concat(const std::shared_ptr& op, const GraphContext::CPtr context) +Concat::Concat(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { @@ 
-58,13 +42,13 @@ Concat::Concat(const std::shared_ptr& op, const GraphContext::CPtr } const auto inRank = getInputShapeAtPort(0).getRank(); - auto concatOp = ngraph::as_type_ptr(op); + auto concatOp = ov::as_type_ptr(op); auto axis = concatOp->get_axis(); if (axis < 0) { axis += inRank; } if (axis >= static_cast(inRank) || axis < 0) { - IE_THROW() << "Concat node with name '" << getName() << "' has invalid value of axis parameter: " << axis; + THROW_CPU_NODE_ERR << "has invalid value of axis parameter: " << axis; } this->axis = axis; } @@ -83,7 +67,7 @@ void Concat::getSupportedDescriptors() { } } if (incorrectDims || firstParentDims.size() == 0) { - IE_THROW() << "Incorrect input dimensions for concat node " << getName(); + THROW_CPU_NODE_ERR << " has incorrect input dimensions."; } } @@ -195,8 +179,14 @@ void Concat::selectOptimalPrimitiveDescriptor() { // be replicated. Inplace approach is not applicable // for that case. for (size_t i = 0; i < getParentEdges().size(); i++) { + if (!canBeInPlace) { + break; + } for (size_t j = i + 1; j < getParentEdges().size(); j++) { - if (getParentEdgeAt(i) == getParentEdgeAt(j)) canBeInPlace = false; + if (getParentEdgeAt(i) == getParentEdgeAt(j)) { + canBeInPlace = false; + break; + } } } @@ -324,7 +314,7 @@ void Concat::prepareParams() { IE_THROW() << "Destination memory didn't allocate."; auto dstMemDesc = dstMemPtr->getDescWithType(); if (getSelectedPrimitiveDescriptor() == nullptr) - IE_THROW() << "Preferable primitive descriptor is not set."; + THROW_CPU_NODE_ERR << "does not have preferable primitive descriptor."; const auto& outputStrides = dstMemDesc->getStrides(); size_t curConcatOffset = 0; @@ -348,8 +338,7 @@ void Concat::prepareParams() { const auto& srcMemPtr = getParentEdgesAtPort(i)[0]->getMemoryPtr(); if (!srcMemPtr || !srcMemPtr->isAllocated()) { auto parent = getParentEdgeAt(i)->getParent(); - IE_THROW() << "Source memory from " << parent->getName() << " didn't allocate for node " - << getName() << "."; + 
THROW_CPU_NODE_ERR << "has input '" << parent->getName() << "' with not allocated memory."; } if (canExecRef) { @@ -413,7 +402,7 @@ size_t Concat::inverseOrder(const SizeVector& order, size_t axis) { void Concat::initOptimalPrimitiveDescriptor() { auto selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) - IE_THROW() << "Preferable primitive descriptor is not set."; + THROW_CPU_NODE_ERR << "does not have preferable primitive descriptor."; if (!isInPlace()) { Node::initOptimalPrimitiveDescriptor(); diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index e9a4c9e764a7b3..1504cb92f38eb2 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -4,11 +4,7 @@ #pragma once -#include #include -#include -#include -#include namespace ov { namespace intel_cpu { @@ -16,9 +12,9 @@ namespace node { class Concat : public Node { public: - Concat(const std::shared_ptr& op, const GraphContext::CPtr context); + Concat(const std::shared_ptr& op, const GraphContext::CPtr context); - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; void initOptimalPrimitiveDescriptor() override; diff --git a/src/plugins/intel_cpu/src/nodes/convert.cpp b/src/plugins/intel_cpu/src/nodes/convert.cpp index 2f3fa0d1b675b1..10b010959c2a29 100644 --- a/src/plugins/intel_cpu/src/nodes/convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/convert.cpp @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +// #include #include "convert.h" -#include "common/blocked_desc_creator.h" -#include +// #include "common/blocked_desc_creator.h" +#include #include -#include +// #include #include using namespace dnnl; @@ -17,10 +17,9 @@ namespace ov { namespace 
intel_cpu { namespace node { -bool Convert::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Convert::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto convert = std::dynamic_pointer_cast(op); - if (!convert) { + if (op->get_type_info() != op::v0::Convert::get_type_info_static()) { errorMessage = "Only opset1 Convert operation is supported"; return false; } @@ -30,21 +29,19 @@ bool Convert::isSupportedOperation(const std::shared_ptr& op return true; } -Convert::Convert(const std::shared_ptr& op, const GraphContext::CPtr context) +Convert::Convert(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, PassThroughShapeInferFactory()) { std::string errorMessage; - if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "Convert node with name '" + getName() + "'"; - } else { + if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - auto convert = ov::as_type_ptr(op); + auto convert = ov::as_type_ptr(op); convertParams.origPrc = details::convertPrecision(convert->get_destination_type()); } Convert::Convert(const Shape &shape, const InferenceEngine::Precision &inPrc, const InferenceEngine::Precision &outPrc, - const std::string &nodeName, const GraphContext::CPtr context) + const std::string &nodeName, const GraphContext::CPtr& context) : Node("Convert", nodeName, context) { convertParams.origPrc = outPrc; inputShapes.push_back(shape); @@ -56,8 +53,6 @@ Convert::Convert(const Shape &shape, const InferenceEngine::Precision &inPrc, co if (isDynamicNode()) { shapeInference = std::make_shared(); } - - errorPrefix = "Convert node with name '" + getName() + "'"; } void Convert::getSupportedDescriptors() { @@ -68,9 +63,9 @@ void Convert::getSupportedDescriptors() { if (inputShapes.empty()) inputShapes.push_back(input->getShape()); if (getParentEdges().size() != 1) - IE_THROW() << errorPrefix << " has incorrect 
number of input edges"; + THROW_CPU_NODE_ERR << " has incorrect number of input edges"; if (getChildEdges().empty()) - IE_THROW() << errorPrefix << " has incorrect number of output edges"; + THROW_CPU_NODE_ERR << " has incorrect number of output edges"; } bool Convert::isSupportedDesc(const MemoryDesc &desc) { @@ -117,25 +112,25 @@ void Convert::initSupportedPrimitiveDescriptors() { config.outConfs.push_back(dataConfigOut); supportedPrimitiveDescriptorsBuilder(config); } else if (inputShapes.size() == 1 && outputShapes.size() == 1) { - const Shape& insShape = getInputShapeAtPort(0); - auto insPrecision = getOriginalInputPrecisionAtPort(0); - const Shape& outputShape = getOutputShapeAtPort(0); - auto outPrecision = getOriginalOutputPrecisionAtPort(0); + const auto& inShape = getInputShapeAtPort(0); + const auto& inPrecision = getOriginalInputPrecisionAtPort(0); + const auto& outputShape = getOutputShapeAtPort(0); + const auto& outPrecision = getOriginalOutputPrecisionAtPort(0); config.inConfs.push_back(dataIn); config.outConfs.push_back(dataConfigOut); auto creators = BlockedDescCreator::getCommonCreators(); - auto range = BlockedDescCreator::makeFilteredRange(creators, insShape.getRank()); + auto range = BlockedDescCreator::makeFilteredRange(creators, inShape.getRank()); for (auto itr = range.first; itr != range.second; ++itr) { - config.inConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(insPrecision, insShape))); + config.inConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(inPrecision, inShape))); config.outConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(outPrecision, outputShape))); supportedPrimitiveDescriptorsBuilder(config); } } else { - IE_THROW() << errorPrefix << " has incorrect number of input/output edges"; + THROW_CPU_NODE_ERR << " has incorrect number of input/output edges"; } } @@ -165,7 +160,7 @@ void Convert::execute(dnnl::stream strm) { const auto childPaddElemCount = 
childMem.getDescWithType()->getPaddedElementsCount(); if (parentPaddElemCount != childPaddElemCount) - IE_THROW() << errorPrefix << " has different elements number in input and output buffers"; + THROW_CPU_NODE_ERR << " has different elements number in input and output buffers"; MemoryCPtr srcMemory = getParentEdgeAt(0)->getMemoryPtr(); MemoryPtr dstMemory = getChildEdgeAt(0)->getMemoryPtr(); diff --git a/src/plugins/intel_cpu/src/nodes/convert.h b/src/plugins/intel_cpu/src/nodes/convert.h index 3fd65ebb20a5d6..d8676ec0217fff 100644 --- a/src/plugins/intel_cpu/src/nodes/convert.h +++ b/src/plugins/intel_cpu/src/nodes/convert.h @@ -16,9 +16,9 @@ namespace node { class Convert : public Node { public: - Convert(const std::shared_ptr& op, const GraphContext::CPtr context); + Convert(const std::shared_ptr& op, const GraphContext::CPtr& context); Convert(const Shape &shape, const InferenceEngine::Precision &inPrc, const InferenceEngine::Precision &outPrc, - const std::string &nodeName, const GraphContext::CPtr context); + const std::string &nodeName, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -54,8 +54,6 @@ class Convert : public Node { ConvertParams convertParams; std::shared_ptr execPtr = nullptr; NodeConfig config; - - std::string errorPrefix; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/cum_sum.cpp b/src/plugins/intel_cpu/src/nodes/cum_sum.cpp index 65d3a55ada0cb4..20563df8667e3d 100644 --- a/src/plugins/intel_cpu/src/nodes/cum_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/cum_sum.cpp @@ -2,15 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include +#include "cum_sum.h" -#include -#include #include "ie_parallel.hpp" -#include "ie_precision.hpp" -#include -#include "cum_sum.h" +#include #include "utils/bfloat16.hpp" using namespace InferenceEngine; @@ -19,10 +14,9 @@ namespace ov { namespace intel_cpu { namespace node { -bool 
CumSum::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CumSum::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto cumsum = std::dynamic_pointer_cast(op); - if (!cumsum) { + if (op->get_type_info() != op::v0::CumSum::get_type_info_static()) { errorMessage = "Only opset3 CumSum operation is supported"; return false; } @@ -32,7 +26,7 @@ bool CumSum::isSupportedOperation(const std::shared_ptr& op, return true; } -CumSum::CumSum(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { +CumSum::CumSum(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; @@ -49,7 +43,7 @@ CumSum::CumSum(const std::shared_ptr& op, const GraphContext::CPtr IE_THROW() << errorPrefix << " doesn't support 'data' input tensor with rank: " << numOfDims; } - const auto cumsum = std::dynamic_pointer_cast(op); + const auto cumsum = ov::as_type_ptr(op); if (cumsum == nullptr) IE_THROW() << "Operation with name '" << op->get_friendly_name() << "' is not an instance of CumSum from opset3."; @@ -59,7 +53,7 @@ CumSum::CumSum(const std::shared_ptr& op, const GraphContext::CPtr if (getOriginalInputsNumber() == numOfInputs) { const auto axis_shape = cumsum->get_input_partial_shape(AXIS); - if (axis_shape.is_dynamic() || !ngraph::is_scalar(axis_shape.to_shape())) + if (axis_shape.is_dynamic() || !ov::is_scalar(axis_shape.to_shape())) IE_THROW() << errorPrefix << " doesn't support 'axis' input tensor with non scalar rank"; } diff --git a/src/plugins/intel_cpu/src/nodes/cum_sum.h b/src/plugins/intel_cpu/src/nodes/cum_sum.h index eee2da8c085472..961ae5362a15f0 100644 --- a/src/plugins/intel_cpu/src/nodes/cum_sum.h +++ 
b/src/plugins/intel_cpu/src/nodes/cum_sum.h @@ -4,7 +4,6 @@ #pragma once -#include #include namespace ov { @@ -13,7 +12,7 @@ namespace node { class CumSum : public Node { public: - CumSum(const std::shared_ptr& op, const GraphContext::CPtr context); + CumSum(const std::shared_ptr& op, const GraphContext::CPtr context); void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; @@ -23,7 +22,7 @@ class CumSum : public Node { bool needPrepareParams() const override; void executeDynamicImpl(dnnl::stream strm) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: template diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.cpp b/src/plugins/intel_cpu/src/nodes/def_conv.cpp index d8dd6bb1a6b586..d90b0c8ab42e71 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/def_conv.cpp @@ -16,6 +16,7 @@ #include #include #include +#include using namespace InferenceEngine; using namespace dnnl; @@ -673,8 +674,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ bool DeformableConvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (!one_of(op->get_type_info(), - ngraph::op::v1::DeformableConvolution::get_type_info_static(), - ngraph::op::v8::DeformableConvolution::get_type_info_static())) { + op::v1::DeformableConvolution::get_type_info_static(), + op::v8::DeformableConvolution::get_type_info_static())) { errorMessage = "Node is not an instance of DeformableConvolution form the operation set v1 or v8."; return false; } @@ -749,7 +750,7 @@ DeformableConvolution::DeformableConvolution(const std::shared_ptr IE_THROW(NotImplemented) << errorMessage; } errorPrefix = "Deformable convolution with name '" + op->get_friendly_name() + "'"; - auto defConvNodeBase = 
std::dynamic_pointer_cast(op); + auto defConvNodeBase = std::dynamic_pointer_cast(op); if (defConvNodeBase == nullptr) IE_THROW() << errorPrefix << " is not an instance of DeformableConvolutionBase."; @@ -769,8 +770,8 @@ DeformableConvolution::DeformableConvolution(const std::shared_ptr autoPadding = one_of(defConvNodeBase->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER); - if (op->get_type_info() == ngraph::op::v8::DeformableConvolution::get_type_info_static()) { - auto defConvNode = std::dynamic_pointer_cast(op); + if (op->get_type_info() == op::v8::DeformableConvolution::get_type_info_static()) { + auto defConvNode = std::dynamic_pointer_cast(op); if (defConvNode == nullptr) IE_THROW() << errorPrefix << " is not an instance of DeformableConvolution from opset8."; defConvAttr.with_bilinear_pad = defConvNode->get_bilinear_interpolation_pad(); diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 1f74c4f70a2c1a..a46ff1fefe63a0 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -1,4 +1,3 @@ - // Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,7 +9,6 @@ #include -#include "cpu_types.h" #include "utils/bfloat16.hpp" #include "ie_ngraph_utils.hpp" #include @@ -18,8 +16,6 @@ #include #include -#include "fake_quantize.h" -#include "pooling.h" #include "input.h" #include "common/cpu_convert.h" @@ -29,11 +25,9 @@ #include "emitters/x64/jit_bf16_emitters.hpp" #include #include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" #include -#include "ngraph/ngraph.hpp" -#include +#include #include "transformations/cpu_opset/common/op/power_static.hpp" #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" @@ -43,9 +37,9 @@ #include #include #include -#include #include #include "memory_desc/dnnl_blocked_memory_desc.h" +#include 
"executors/eltwise_list.hpp" using namespace InferenceEngine; using namespace dnnl::impl::utils; @@ -73,7 +67,7 @@ struct EltwiseEmitterContext { jit_generator *host; cpu_isa_t host_isa; const Eltwise::EltwiseData& opData; - InferenceEngine::Precision exec_prc; + Precision exec_prc; }; template @@ -137,7 +131,7 @@ InferenceEngine::Precision eltwise_precision_helper::get_precision(const size_t // for element-wise operations all inputs must to have the same precisions auto has_same_precision = [](const std::vector& precisions) { - return std::all_of(precisions.begin(), precisions.end(), [&precisions](const element::Type precision) { + return std::all_of(precisions.begin(), precisions.end(), [&precisions](const element::Type& precision) { return precision == precisions[0]; }); }; @@ -165,15 +159,17 @@ InferenceEngine::Precision eltwise_precision_helper::get_precision(const size_t element::i16, element::bf16, element::i32, + element::i64, element::f32 }; for (const auto prc : exec_precisions_priority) { if (std::any_of( - supported_precision_intersection.begin(), - supported_precision_intersection.end(), - [&prc](const std::vector& precisions) { return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); })) { - exec_prc = InferenceEngine::details::convertPrecision(prc); + supported_precision_intersection.begin(), + supported_precision_intersection.end(), + [&prc](const std::vector& precisions) { + return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); })) { + exec_prc = details::convertPrecision(prc); break; } } @@ -185,6 +181,24 @@ InferenceEngine::Precision eltwise_precision_helper::get_precision(const size_t } } + bool allInpI64 = true; + for (size_t i = 0lu; i < inputs_number; i++) { + if (src_prc[i] != Precision::I64) { + allInpI64 = false; + break; + } + } + if (allInpI64) { + for (const auto &prcs : supported_precision_intersection) { + if (prcs[0] == element::i64) { + exec_prc = Precision::I64; + break; + } else 
if (prcs[0] == element::f64) { + exec_prc = Precision::FP64; + } + } + } + if (exec_prc == Precision::UNSPECIFIED) { IE_THROW() << "Eltwise jitter failed to specify execution precision for Eltwise node"; } @@ -203,7 +217,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseExp, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), @@ -340,12 +354,12 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener mov(reg_post_op_ptrs, ptr[reg_const_params + GET_OFF(post_op_data)]); - Xbyak::Label unroll_loop_label; - Xbyak::Label unroll_loop_end_label; - Xbyak::Label main_loop_label; - Xbyak::Label main_loop_end_label; - Xbyak::Label tail_loop_label; - Xbyak::Label tail_loop_end_label; + Label unroll_loop_label; + Label unroll_loop_end_label; + Label main_loop_label; + Label main_loop_end_label; + Label tail_loop_label; + Label tail_loop_end_label; if (isa == x64::avx512_core) vpxord(vmm_zero, vmm_zero, vmm_zero); @@ -577,7 +591,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseExp, jit_dnnl_aux_emitter), OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), @@ -682,17 +696,40 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } } - 
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc, bool broadcast) { + inline void load_vector(const Vmm &vmm_src, const Address &op, const Precision &src_prc, const Precision &dst_prc, bool broadcast) { Xmm xmm_src = Xmm(vmm_src.getIdx()); + Ymm ymm_src = Ymm(vmm_src.getIdx()); if (broadcast) { - load_scalar(xmm_src, op, src_prc, dst_prc); - uni_vbroadcastss(vmm_src, xmm_src); + load_scalar(xmm_src, op, src_prc, dst_prc); + if (src_prc.size() == 8) { + uni_vbroadcastsd(vmm_src, xmm_src); + } else { + uni_vbroadcastss(vmm_src, xmm_src); + } } else { switch (src_prc) { + case Precision::I64: + if (dst_prc == Precision::I64 || dst_prc == Precision::I32) { + uni_vmovups(vmm_src, op); + } else if (dst_prc == Precision::FP64) { + if (x64::mayiuse(x64::avx512_core)) { + vcvtqq2pd(vmm_src, op); + } else { + // Do conversion inside the emitter. + uni_vmovups(vmm_src, op); + } + } + break; case Precision::FP32: + if (dst_prc == Precision::FP32) { + uni_vmovups(vmm_src, op); + } + break; case Precision::I32: - uni_vmovups(vmm_src, op); + if (dst_prc == Precision::I32) { + uni_vmovups(vmm_src, op); + } break; case Precision::BF16: vpmovzxwd(vmm_src, op); @@ -714,29 +751,53 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vpmovzxbd(vmm_src, op); break; default: - assert(!"unknown src_prc"); + IE_THROW() << "Unknown src_prc: " << src_prc; } switch (dst_prc) { case Precision::FP32: - if (!src_prc.is_float()) + if (src_prc == Precision::I64) { + vcvtqq2ps(ymm_src, op); + } else if (one_of(src_prc, Precision::U8, Precision::I8, Precision::I16, Precision::U16)) { uni_vcvtdq2ps(vmm_src, vmm_src); + } else if (src_prc == Precision::I32) { + uni_vcvtdq2ps(vmm_src, op); + } break; case Precision::I32: - if (src_prc.is_float()) - uni_vcvtps2dq(vmm_src, vmm_src); + if (src_prc == Precision::I64) { + vpmovsqd(ymm_src, vmm_src); + } else if (src_prc == Precision::FP32 || src_prc == Precision::BF16 || 
src_prc == Precision::FP16) { + uni_vcvtps2dq(vmm_src, op); + } + break; + case Precision::I64: + case Precision::FP64: break; default: - assert(!"unknown dst_prc"); + IE_THROW() << "Unsupported destination precision: " << dst_prc; } } } - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, Precision src_prc, Precision dst_prc) { + inline void load_scalar(const Xmm &xmm_src, const Address &op, const Precision &src_prc, const Precision &dst_prc, bool broadcast = false) { + Address srcAdrBcst(op.getBit(), true, op.getRegExp()); switch (src_prc) { + case Precision::I64: + if (dst_prc == Precision::I64) { + uni_vmovsd(xmm_src, op); + } else if (dst_prc == Precision::FP64) { + if (x64::mayiuse(x64::avx512_core)) { + vcvtqq2pd(xmm_src, srcAdrBcst); + } else { + // Do conversion inside the emitter. + uni_vmovsd(xmm_src, op); + } + } + break; case Precision::FP32: case Precision::I32: - uni_vmovss(xmm_src, op); + uni_vmovss(xmm_src, op); // TODO: AVX512 uni_vcvtdq2ps with bct break; case Precision::BF16: uni_vpinsrw(xmm_src, xmm_src, op, 0); @@ -762,45 +823,88 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vmovq(xmm_src, reg_tmp_64); break; default: - assert(!"unknown src_prc"); + IE_THROW() << "Unkown source precision '" << src_prc << "'"; } switch (dst_prc) { case Precision::FP32: - if (!src_prc.is_float()) + if (src_prc == Precision::I64) { + vcvtqq2ps(xmm_src, xmm_src); + } else if (src_prc != Precision::FP32 && src_prc != Precision::BF16 && src_prc != Precision::FP16) { uni_vcvtdq2ps(xmm_src, xmm_src); + } break; case Precision::I32: - if (src_prc.is_float()) + if (src_prc == Precision::I64) { + vpmovsqd(xmm_src, xmm_src); + } else if (src_prc == Precision::FP32 || src_prc == Precision::BF16 || src_prc == Precision::FP16) { uni_vcvtps2dq(xmm_src, xmm_src); + } + break; + case Precision::I64: + case Precision::FP64: break; default: - assert(!"unknown dst_prc"); + IE_THROW() << "Unsupported destination precision: " << 
dst_prc; } } - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, Precision src_prc, Precision dst_prc) { + inline void store_vector(const Address &op, const Vmm &vmm_dst, const Precision &src_prc, const Precision &dst_prc) { Xmm xmm_dst = Xmm(vmm_dst.getIdx()); Ymm ymm_dst = Ymm(vmm_dst.getIdx()); switch (src_prc) { + case Precision::FP64: + if (dst_prc == Precision::FP32) { + uni_vcvtpd2ps(x64::mayiuse(x64::avx512_core) ? ymm_dst : xmm_dst, vmm_dst); + } else if (dst_prc == Precision::I64) { + if (x64::mayiuse(x64::avx512_core)) { + vcvtpd2qq(vmm_dst, vmm_dst); + } else { + // Do conversion inside the emitter. + } + } else if (dst_prc == Precision::I32) { + vcvtpd2dq(ymm_dst, vmm_dst); + } + break; case Precision::FP32: - if (!dst_prc.is_float()) + if (dst_prc == Precision::I64) { + vcvtps2qq(vmm_dst, ymm_dst); + } else if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16 && dst_prc != Precision::FP16) { uni_vcvtps2dq(vmm_dst, vmm_dst); + } break; case Precision::I32: - if (dst_prc.is_float()) + if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16 || dst_prc == Precision::FP16) uni_vcvtdq2ps(vmm_dst, vmm_dst); break; + case Precision::I64: + if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16 || dst_prc == Precision::FP16) { + vcvtqq2ps(ymm_dst, vmm_dst); + } + break; default: - assert(!"unknown src_prc"); + IE_THROW() << "Unsupported source precision: " << src_prc; } switch (dst_prc) { case Precision::FP32: - case Precision::I32: + if (src_prc == Precision::I64) { + uni_vmovups(op, ymm_dst); + } else { + uni_vmovups(op, vmm_dst); + } + break; + case Precision::I64: uni_vmovups(op, vmm_dst); break; + case Precision::I32: + if (src_prc == Precision::I64) { + vpmovsqd(op, vmm_dst); + } else { + uni_vmovups(op, vmm_dst); + } + break; case Precision::BF16: uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); @@ -837,7 +941,11 @@ struct jit_uni_eltwise_generic : 
public jit_uni_eltwise_kernel, public jit_gener break; case Precision::I8: if (isa == x64::avx512_core) { - vpmovsdb(op, vmm_dst); + if (src_prc == Precision::I64) { + vpmovsqb(xmm_dst, vmm_dst); + } else { + vpmovsdb(op, vmm_dst); + } } else { uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); if (isa != x64::sse41) @@ -851,8 +959,12 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener break; case Precision::U8: if (isa == x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdb(op, vmm_dst); + if (src_prc == Precision::I64) { + vpmovusqb(xmm_dst, vmm_dst); + } else { + vpmaxsd(vmm_dst, vmm_zero, vmm_dst); + vpmovusdb(op, vmm_dst); + } } else { uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); if (isa != x64::sse41) @@ -865,25 +977,52 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } break; default: - assert(!"unknown dst_prc"); + IE_THROW() << "Unsupported destination precision: " << dst_prc; } } - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, Precision src_prc, Precision dst_prc) { + inline void store_scalar(const Address &op, const Xmm &xmm_dst, const Precision &src_prc, const Precision &dst_prc) { switch (src_prc) { + case Precision::FP64: + if (dst_prc == Precision::FP32) { + uni_vcvtpd2ps(xmm_dst, xmm_dst); + } else if (dst_prc == Precision::I64) { + if (x64::mayiuse(x64::avx512_core)) { + vcvtpd2qq(xmm_dst, xmm_dst); + } else { + // Do conversion inside the emitter. 
+ } + } else if (dst_prc == Precision::I32) { + uni_vcvtpd2dq(xmm_dst, xmm_dst); + } + break; case Precision::FP32: - if (!dst_prc.is_float()) + if (dst_prc == Precision::I64) { + vcvtps2qq(xmm_dst, xmm_dst); + } else if (dst_prc != Precision::FP32 && dst_prc != Precision::BF16) { uni_vcvtps2dq(xmm_dst, xmm_dst); + } break; case Precision::I32: - if (dst_prc.is_float()) + if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16 || dst_prc == Precision::FP16) uni_vcvtdq2ps(xmm_dst, xmm_dst); break; + case Precision::I64: + if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16 || dst_prc == Precision::FP16) { + vcvtqq2ps(xmm_dst, xmm_dst); + } else if (dst_prc == Precision::I32) { + vpmovsqd(xmm_dst, xmm_dst); + } + break; default: - assert(!"unknown src_prc"); + IE_THROW() << "Unsupported source precision: " << src_prc; } switch (dst_prc) { + case Precision::FP64: + case Precision::I64: + uni_vmovsd(op, xmm_dst); + break; case Precision::FP32: case Precision::I32: uni_vmovss(op, xmm_dst); @@ -914,13 +1053,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener mov(op, reg_tmp_8); break; case Precision::U8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); + uni_vpextrb(op, xmm_dst, 0); break; default: - assert(!"unknown dst_prc"); + IE_THROW() << "Unsupported destination precision: " << dst_prc; } } }; @@ -985,9 +1121,9 @@ class EltwiseShapeInferFactory : public ShapeInferFactory { } // namespace -Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shared_ptr& op) { - const auto const1 = ov::as_type_ptr(op->get_input_node_shared_ptr(0)); - const auto const2 = ov::as_type_ptr(op->get_input_node_shared_ptr(1)); +Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shared_ptr& op) { + const auto const1 = ov::as_type_ptr(op->get_input_node_shared_ptr(0)); + const auto const2 = 
ov::as_type_ptr(op->get_input_node_shared_ptr(1)); int constPort = -1; if (const2) { constPort = 1; @@ -998,48 +1134,48 @@ Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shar } auto const_shape = op->get_input_shape(constPort); - if (ngraph::shape_size(const_shape) == 1) + if (ov::shape_size(const_shape) == 1) return PerTensor; else return PerChannel; } -const std::map Eltwise::initializers = { - {ngraph::op::v1::Add::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { +const std::map Eltwise::initializers = { + {op::v1::Add::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseAdd; node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, - {ngraph::op::v1::Subtract::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Subtract::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSubtract; node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, - {ngraph::op::v1::Multiply::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Multiply::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseMultiply; node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, - {ngraph::op::v1::Divide::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Divide::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseDivide; node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, - {ngraph::op::v0::SquaredDifference::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::SquaredDifference::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSquaredDifference; }}, - {ngraph::op::v1::Maximum::get_type_info_static(), [](const std::shared_ptr& op, 
Eltwise& node) { + {op::v1::Maximum::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseMaximum; }}, - {ngraph::op::v1::Minimum::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Minimum::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseMinimum; }}, - {ngraph::op::v1::Mod::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Mod::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseMod; }}, - {ngraph::op::v1::FloorMod::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::FloorMod::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseFloorMod; }}, - {ngraph::op::v1::Power::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Power::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwisePowerDynamic; }}, - {PowerStaticNode::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {PowerStaticNode::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { auto powerStatic = getNgraphOpAs(op); node.algorithm = Algorithm::EltwisePowerStatic; node.alpha = powerStatic->get_power(); @@ -1047,100 +1183,100 @@ const std::map Eltwise::in node.gamma = powerStatic->get_shift(); node.broadcastingPolicy = PerTensor; }}, - {ngraph::op::v1::Equal::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Equal::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseEqual; }}, - {ngraph::op::v1::NotEqual::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::NotEqual::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = 
Algorithm::EltwiseNotEqual; }}, - {ov::op::v10::IsFinite::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v10::IsFinite::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseIsFinite; }}, - {ov::op::v10::IsInf::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v10::IsInf::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseIsInf; - const auto& attributes = ov::as_type_ptr(op)->get_attributes(); + const auto& attributes = ov::as_type_ptr(op)->get_attributes(); node.alpha = attributes.detect_negative; node.beta = attributes.detect_positive; }}, - {ov::op::v10::IsNaN::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v10::IsNaN::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseIsNaN; }}, - {ngraph::op::v1::Greater::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Greater::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseGreater; }}, - {ngraph::op::v1::GreaterEqual::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::GreaterEqual::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseGreaterEqual; }}, - {ngraph::op::v1::Less::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Less::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLess; }}, - {ngraph::op::v1::LessEqual::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::LessEqual::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLessEqual; }}, - {ngraph::op::v1::LogicalAnd::get_type_info_static(), [](const std::shared_ptr& op, 
Eltwise& node) { + {op::v1::LogicalAnd::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLogicalAnd; }}, - {ngraph::op::v1::LogicalOr::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::LogicalOr::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLogicalOr; }}, - {ngraph::op::v1::LogicalXor::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::LogicalXor::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLogicalXor; }}, - {ngraph::op::v1::LogicalNot::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::LogicalNot::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLogicalNot; }}, - {ngraph::op::v0::Relu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Relu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseRelu; node.onednnAlgorithm = dnnl::algorithm::eltwise_relu; }}, - {LeakyReluNode::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {LeakyReluNode::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { auto leakyRelu = getNgraphOpAs(op); node.algorithm = Algorithm::EltwiseRelu; node.onednnAlgorithm = dnnl::algorithm::eltwise_relu; node.alpha = leakyRelu->get_slope(); node.beta = 0.0f; }}, - {ngraph::op::v0::Gelu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Gelu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseGeluErf; node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_erf; }}, - {ngraph::op::v7::Gelu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { - auto gelu = getNgraphOpAs(op); - 
ngraph::op::GeluApproximationMode approximationMode = gelu->get_approximation_mode(); - if (approximationMode == ngraph::op::GeluApproximationMode::ERF) { + {op::v7::Gelu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + auto gelu = getNgraphOpAs(op); + op::GeluApproximationMode approximationMode = gelu->get_approximation_mode(); + if (approximationMode == op::GeluApproximationMode::ERF) { node.algorithm = Algorithm::EltwiseGeluErf; node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_erf; - } else if (approximationMode == ngraph::op::GeluApproximationMode::TANH) { + } else if (approximationMode == op::GeluApproximationMode::TANH) { node.algorithm = Algorithm::EltwiseGeluTanh; node.onednnAlgorithm = dnnl::algorithm::eltwise_gelu_tanh; } else { IE_THROW(NotImplemented) << "CPU Eltwise node doesn't support ngraph operation Gelu with approximation mode: " << approximationMode; } }}, - {ngraph::op::v0::Elu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { - auto eluOp = getNgraphOpAs(op); + {op::v0::Elu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + auto eluOp = getNgraphOpAs(op); node.alpha = static_cast(eluOp->get_alpha()); node.algorithm = Algorithm::EltwiseElu; node.onednnAlgorithm = dnnl::algorithm::eltwise_elu; }}, - {ngraph::op::v0::Tanh::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Tanh::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseTanh; node.onednnAlgorithm = dnnl::algorithm::eltwise_tanh; }}, - {ngraph::op::v0::Sigmoid::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Sigmoid::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSigmoid; node.onednnAlgorithm = dnnl::algorithm::eltwise_logistic; }}, - {ngraph::op::v0::Abs::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + 
{op::v0::Abs::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseAbs; node.onednnAlgorithm = dnnl::algorithm::eltwise_abs; }}, - {ngraph::op::v0::Sqrt::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Sqrt::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSqrt; node.onednnAlgorithm = dnnl::algorithm::eltwise_sqrt; }}, - {ngraph::op::v0::Clamp::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { - auto clampOp = getNgraphOpAs(op); + {op::v0::Clamp::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + auto clampOp = getNgraphOpAs(op); float alpha_ = static_cast(clampOp->get_min()); float beta_ = static_cast(clampOp->get_max()); @@ -1154,64 +1290,64 @@ const std::map Eltwise::in node.algorithm = Algorithm::EltwiseClamp; node.onednnAlgorithm = dnnl::algorithm::eltwise_clip; }}, - {ngraph::op::v0::Exp::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Exp::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseExp; node.onednnAlgorithm = dnnl::algorithm::eltwise_exp; }}, - {SwishNode::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {SwishNode::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { auto swishOp = getNgraphOpAs(op); node.algorithm = Algorithm::EltwiseSwish; node.onednnAlgorithm = dnnl::algorithm::eltwise_swish; node.alpha = swishOp->get_alpha(); }}, - {ngraph::op::v4::HSwish::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v4::HSwish::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { // since v3.0 version, oneDNN has flexible implementation of hardswish, ov still uses the one with hardcoded alpha and beta node.alpha = 1.f / 6.f; node.beta = 0.5f; node.algorithm = Algorithm::EltwiseHswish; 
node.onednnAlgorithm = dnnl::algorithm::eltwise_hardswish; }}, - {ngraph::op::v4::Mish::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v4::Mish::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseMish; node.onednnAlgorithm = dnnl::algorithm::eltwise_mish; }}, - {ngraph::op::v5::HSigmoid::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v5::HSigmoid::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseHsigmoid; node.onednnAlgorithm = dnnl::algorithm::eltwise_hsigmoid; }}, - {ngraph::op::v5::Round::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { - auto roundOp = getNgraphOpAs(op); + {op::v5::Round::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + auto roundOp = getNgraphOpAs(op); switch (roundOp->get_mode()) { - case ngraph::op::v5::Round::RoundMode::HALF_TO_EVEN: + case op::v5::Round::RoundMode::HALF_TO_EVEN: node.algorithm = Algorithm::EltwiseRoundHalfToEven; node.onednnAlgorithm = dnnl::algorithm::eltwise_round_half_to_even; break; - case ngraph::op::v5::Round::RoundMode::HALF_AWAY_FROM_ZERO: + case op::v5::Round::RoundMode::HALF_AWAY_FROM_ZERO: node.algorithm = Algorithm::EltwiseRoundHalfAwayFromZero; node.onednnAlgorithm = dnnl::algorithm::eltwise_round_half_away_from_zero; break; } }}, - {ngraph::op::v0::PRelu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::PRelu::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwisePrelu; node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, - {ngraph::op::v0::Erf::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Erf::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseErf; }}, - {ngraph::op::v4::SoftPlus::get_type_info_static(), 
[](const std::shared_ptr& op, Eltwise& node) { + {op::v4::SoftPlus::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSoftRelu; node.alpha = 1.f; node.onednnAlgorithm = dnnl::algorithm::eltwise_soft_relu; }}, - {ngraph::op::v9::SoftSign::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v9::SoftSign::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSoftSign; }}, - {ngraph::op::v1::Select::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v1::Select::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseSelect; }}, - {ngraph::op::v0::Log::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { + {op::v0::Log::get_type_info_static(), [](const std::shared_ptr& op, Eltwise& node) { node.algorithm = Algorithm::EltwiseLog; }}, }; @@ -1224,8 +1360,8 @@ struct EltwiseKey { VectorDims outBlkDims; VectorDims outOrder; std::vector inpDims; - std::vector inpPrc; - InferenceEngine::Precision outPrc; + std::vector inpPrc; + Precision outPrc; dnnl::post_ops postOps; EltwiseImplType implType; @@ -1323,8 +1459,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { const VectorDims& outBlkDims, const VectorDims& outOrder, std::vector inpDims, - const std::vector& inpPrc, - const InferenceEngine::Precision& outPrc, + const std::vector& inpPrc, + const Precision& outPrc, const dnnl::post_ops& post_ops, bool useRuntimePtrs) { auto collapseLastDims = [](std::vector& dims, int dimsToCollapse) { @@ -1544,6 +1680,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { args.indexes[3] = i3; args.indexes[4] = i4; + (*_pKernel)(&args_ptrs, &args); }); } else { @@ -1834,23 +1971,23 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) { return execPtr; } -bool Eltwise::isSupportedOperation(const std::shared_ptr& op, 
std::string& errorMessage) noexcept { +bool Eltwise::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (initializers.find(op->get_type_info()) == initializers.end()) { errorMessage = "Doesn't support Eltwise algorithm: " + std::string(op->get_type_name()); return false; } - if (const auto binOp = ov::as_type_ptr(op)) { - if (binOp->get_autob().m_type != ngraph::op::AutoBroadcastType::NONE && - binOp->get_autob().m_type != ngraph::op::AutoBroadcastType::NUMPY) { - errorMessage = "Doesn't support broadcast type: " + ngraph::as_string(binOp->get_autob().m_type); + if (const auto binOp = ov::as_type_ptr(op)) { + if (binOp->get_autob().m_type != op::AutoBroadcastType::NONE && + binOp->get_autob().m_type != op::AutoBroadcastType::NUMPY) { + errorMessage = "Doesn't support broadcast type: " + ov::as_string(binOp->get_autob().m_type); return false; } } - if (const auto select = ov::as_type_ptr(op)) { - if (select->get_auto_broadcast().m_type != ngraph::op::AutoBroadcastType::NONE && - select->get_auto_broadcast().m_type != ngraph::op::AutoBroadcastType::NUMPY) { - errorMessage = "Doesn't support broadcast type: " + ngraph::as_string(select->get_autob().m_type); + if (const auto select = ov::as_type_ptr(op)) { + if (select->get_auto_broadcast().m_type != op::AutoBroadcastType::NONE && + select->get_auto_broadcast().m_type != op::AutoBroadcastType::NUMPY) { + errorMessage = "Doesn't support broadcast type: " + ov::as_string(select->get_autob().m_type); return false; } } @@ -1860,8 +1997,8 @@ bool Eltwise::isSupportedOperation(const std::shared_ptr& op return true; } -Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, EltwiseShapeInferFactory()), broadcastingPolicy(Undefined) { +Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr& context) : + Node(op, context, EltwiseShapeInferFactory()), broadcastingPolicy(Undefined) { std::string errorMessage; if 
(!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; @@ -1953,7 +2090,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() { Precision::I16, Precision::BF16, Precision::FP16, - Precision::I32 + Precision::I32, + Precision::I64 }; if (!supportedPrimitiveDescriptors.empty()) @@ -1984,7 +2122,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { IE_THROW() << "Eltwise node with name `" << getName() << "` has invalid input number of inputs: expected = " << expectedInputsNum << " (actual = " << getParentEdges().size() << ")"; - std::vector inputPrecisions; + std::vector inputPrecisions; for (const auto &prec : getOriginalInputPrecisions()) { inputPrecisions.push_back(prec); } @@ -2006,7 +2144,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { if (inputPrecisions.size() != getParentEdges().size()) IE_THROW() << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration."; - InferenceEngine::Precision outputPrecision = getOriginalOutputPrecisionAtPort(0); + Precision outputPrecision = getOriginalOutputPrecisionAtPort(0); if (!fusedWith.empty()) { outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0); } @@ -2025,8 +2163,10 @@ void Eltwise::initSupportedPrimitiveDescriptors() { if (implType == EltwiseImplType::reference) { return Precision(Precision::FP32); } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) { - if (prc == Precision::U32 || prc == Precision::I64 || prc == Precision::U64) { + if (prc == Precision::U32) { return Precision(Precision::I32); + } else if (prc == Precision::U64) { + return Precision(Precision::I64); } else { IE_THROW() << "Eltwise node with name `" << getName() << "` doesn't support " << prc << " precision."; } @@ -2746,8 +2886,8 @@ bool Eltwise::canFuse(const NodePtr& node) const { return false; } -InferenceEngine::Precision Eltwise::getRuntimePrecision() const { - std::vector 
inputPrecisions; +Precision Eltwise::getRuntimePrecision() const { + std::vector inputPrecisions; // Don't take bias precision into account for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.h b/src/plugins/intel_cpu/src/nodes/eltwise.h index ec3ff99ea545a7..3aa73c56f04c01 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.h +++ b/src/plugins/intel_cpu/src/nodes/eltwise.h @@ -4,12 +4,7 @@ #pragma once -#include #include -#include -#include -#include -#include #include "executors/eltwise_list.hpp" namespace ov { @@ -103,7 +98,7 @@ class Eltwise : public Node { using executorPtr = std::shared_ptr; public: - Eltwise(const std::shared_ptr& op, const GraphContext::CPtr context); + Eltwise(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp index 71997a495d50e0..ea196005021bb3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp @@ -124,7 +124,8 @@ void RefOptimizedTransposeExecutor::exec(const std::vector& src, con OV_SWITCH(intel_cpu, TransposeOptimizedEmitter, ctx, dataSize, OV_CASE(1u, InferenceEngine::PrecisionTrait::value_type), OV_CASE(2u, InferenceEngine::PrecisionTrait::value_type), - OV_CASE(4u, InferenceEngine::PrecisionTrait::value_type)); + OV_CASE(4u, InferenceEngine::PrecisionTrait::value_type), + OV_CASE(8u, InferenceEngine::PrecisionTrait::value_type)); } bool RefOptimizedTransposeExecutor::init(const TransposeParams &transposeParams, diff --git a/src/plugins/intel_cpu/src/nodes/eye.cpp b/src/plugins/intel_cpu/src/nodes/eye.cpp index 747e89bdc1ed11..6f7ce1f8e92ea8 100644 --- 
a/src/plugins/intel_cpu/src/nodes/eye.cpp +++ b/src/plugins/intel_cpu/src/nodes/eye.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #define THROW_ERROR IE_THROW() << NameFromType(getType()) << " node with name '" << getName() << "' " @@ -20,7 +21,7 @@ using namespace InferenceEngine::details; bool Eye::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (op->get_type_info() != ngraph::op::v9::Eye::get_type_info_static()) { + if (op->get_type_info() != op::v9::Eye::get_type_info_static()) { errorMessage = "Node is not an instance of Eye form the operation set v9."; return false; } diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index 06314ca17c6f5e..f6f86114ecc0de 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -2,23 +2,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - -#include "ie_parallel.hpp" #include "gather.h" -#include + #include "common/cpu_memcpy.h" -#include +#include "ie_parallel.hpp" #include "kernels/x64/gather_uni_kernel.hpp" -#include "utils/shape_inference/shape_inference_cpu.hpp" +#include +#include #include using namespace InferenceEngine; using namespace dnnl::impl::cpu; -#define THROW_ERROR IE_THROW() << getTypeStr() << " node with name '" << getName() << "' " - namespace ov { namespace intel_cpu { namespace node { @@ -26,13 +21,13 @@ namespace node { bool Gather::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (!one_of(op->get_type_info(), - ov::op::v7::Gather::get_type_info_static(), - ov::op::v8::Gather::get_type_info_static())) { + op::v7::Gather::get_type_info_static(), + op::v8::Gather::get_type_info_static())) { errorMessage = "Not supported Gather operation version. 
CPU plug-in supports only 7 and 8 versions."; return false; } - if (!isDynamicNgraphNode(op) && !ov::is_type(op->get_input_node_ptr(GATHER_AXIS))) { + if (!isDynamicNgraphNode(op) && !ov::is_type(op->get_input_node_ptr(GATHER_AXIS))) { errorMessage = "Only Constant operation on 'axis' input is supported for static node."; return false; } @@ -58,11 +53,15 @@ class GatherShapeInfer : public ShapeInferEmptyPads { const auto& indices_shape = m_isIndicesScalar ? VectorDims{} : input_shapes[GATHER_INDICES].get(); if (!m_isAxisInputConst) { - if (data_dependency.at(GATHER_AXIS)->getDesc().getPrecision() != Precision::I32) { + auto axPrc = data_dependency.at(GATHER_AXIS)->getDesc().getPrecision(); + if (axPrc == Precision::I32) { + m_axis = reinterpret_cast(data_dependency.at(GATHER_AXIS)->getData())[0]; + } else if (axPrc == Precision::I64) { + m_axis = reinterpret_cast(data_dependency.at(GATHER_AXIS)->getData())[0]; + } else { IE_THROW() << "Unsupported precision " << data_dependency.at(GATHER_AXIS)->getDesc().getPrecision() << " for axis tensor."; } - m_axis = reinterpret_cast(data_dependency.at(GATHER_AXIS)->getData())[0]; } if (m_axis < 0) @@ -85,7 +84,7 @@ class GatherShapeInfer : public ShapeInferEmptyPads { private: bool m_isAxisInputConst = false; bool m_isIndicesScalar = false; - int m_axis = 0; + int64_t m_axis = 0; int m_batchDims = 0; }; @@ -95,15 +94,15 @@ class GatherShapeInferFactory : public ShapeInferFactory { ShapeInferPtr makeShapeInfer() const override { static constexpr size_t GATHER_INDICES = 1, GATHER_AXIS = 2; - bool isAxisInputConst = ov::is_type(m_op->get_input_node_ptr(GATHER_AXIS)); + bool isAxisInputConst = ov::is_type(m_op->get_input_node_ptr(GATHER_AXIS)); const auto& indicesShape = m_op->get_input_partial_shape(GATHER_INDICES); if (!indicesShape.rank().is_static()) IE_THROW() << "indicesShape do not support dynamic rank."; bool isIndicesScalar = indicesShape.rank().get_length() == 0; - int axis = isAxisInputConst ? 
ov::as_type(m_op->get_input_node_ptr(GATHER_AXIS))->cast_vector()[0] : 0; - int batchDims = ov::is_type(m_op) ? static_cast(ov::as_type_ptr(m_op)->get_batch_dims()) : ( - ov::is_type(m_op) ? static_cast(ov::as_type_ptr(m_op)->get_batch_dims()) : 0); + int axis = isAxisInputConst ? ov::as_type(m_op->get_input_node_ptr(GATHER_AXIS))->cast_vector()[0] : 0; + int batchDims = ov::is_type(m_op) ? static_cast(ov::as_type_ptr(m_op)->get_batch_dims()) : ( + ov::is_type(m_op) ? static_cast(ov::as_type_ptr(m_op)->get_batch_dims()) : 0); return std::make_shared(isAxisInputConst, isIndicesScalar, axis, batchDims); } @@ -114,15 +113,14 @@ class GatherShapeInferFactory : public ShapeInferFactory { } // namespace Gather::Gather(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, GatherShapeInferFactory(op)), - batchDims(0) { + : Node(op, context, GatherShapeInferFactory(op)), batchDims(0) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } if (op->get_input_size() != 3 || op->get_output_size() != 1) - THROW_ERROR << "has incorrect number of input/output edges!"; + THROW_CPU_NODE_ERR << "has incorrect number of input/output edges!"; const auto& dataShape = getInputShapeAtPort(GATHER_DATA); isDataShapeStat = dataShape.isStatic(); @@ -132,10 +130,10 @@ Gather::Gather(const std::shared_ptr& op, const GraphContext::CPtr con isIdxShapeStat = idxShape.isStatic(); const auto indicesRank = idxShape.getRank(); if (dataSrcRank == 0lu || indicesRank == 0lu) - THROW_ERROR << "has incorrect input parameters ranks."; + THROW_CPU_NODE_ERR << "has incorrect input parameters ranks."; - if (ov::is_type(op)) { - batchDims = static_cast(ov::as_type_ptr(op)->get_batch_dims()); + if (ov::is_type(op)) { + batchDims = static_cast(ov::as_type_ptr(op)->get_batch_dims()); // WA for NMS->Gather construction. NMS fills part of the output blob by the -1 if these values // must not be taken into account. 
There is appropriate pass that looks for such subgraphs // and sets the dontReverseIndices flag. @@ -145,23 +143,23 @@ Gather::Gather(const std::shared_ptr& op, const GraphContext::CPtr con reverseIndexing = true; else reverseIndexing = false; - } else if (ov::is_type(op)) { - batchDims = static_cast(ov::as_type_ptr(op)->get_batch_dims()); + } else if (ov::is_type(op)) { + batchDims = static_cast(ov::as_type_ptr(op)->get_batch_dims()); reverseIndexing = false; } if (batchDims < 0) batchDims += indicesRank; if (batchDims < 0 || batchDims > std::min(static_cast(dataSrcRank), static_cast(indicesRank))) - THROW_ERROR << "has incorrect batch_dims " << batchDims << "!"; + THROW_CPU_NODE_ERR << "has incorrect batch_dims " << batchDims << "!"; - if (ov::is_type(op->get_input_node_ptr(GATHER_AXIS))) { + if (ov::is_type(op->get_input_node_ptr(GATHER_AXIS))) { isAxisInputConst = true; - axis = ov::as_type(op->get_input_node_ptr(GATHER_AXIS))->cast_vector()[0]; + axis = ov::as_type(op->get_input_node_ptr(GATHER_AXIS))->cast_vector()[0]; if (axis < 0) axis += dataSrcRank; if (axis < 0 || axis >= dataSrcRank || batchDims > axis) - THROW_ERROR << "has incorrect input parameter axis value: " << axis; + THROW_CPU_NODE_ERR << "has incorrect input parameter axis value: " << axis; } if (auto indices = ov::as_type(op->get_input_node_ptr(GATHER_INDICES))) { @@ -173,7 +171,17 @@ void Gather::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - dataTypeSize = getOriginalInputPrecisionAtPort(GATHER_DATA).size(); + const auto &dataPrecision = getOriginalInputPrecisionAtPort(GATHER_DATA); + dataTypeSize = dataPrecision.size(); + idxPrecision = getOriginalInputPrecisionAtPort(GATHER_INDICES); + if (!one_of(idxPrecision, Precision::I32, Precision::I64)) { + idxPrecision = Precision::I32; + } + idxTypeSize = idxPrecision.size(); + auto axisPrecision = getOriginalInputPrecisionAtPort(GATHER_AXIS); + if (!one_of(axisPrecision, Precision::I32, 
Precision::I64)) { + axisPrecision = Precision::I32; + } const auto& dataDims = getInputShapeAtPort(GATHER_DATA).getDims(); if (isAxisInputConst && isDataShapeStat) { @@ -200,10 +208,9 @@ void Gather::initSupportedPrimitiveDescriptors() { } // Implementation desc type will be redefined in the fn prepareParams if a kernel will be created. - Precision dataPrecision = getOriginalInputPrecisionAtPort(GATHER_DATA); addSupportedPrimDesc({{LayoutType::ncsp, dataPrecision}, - {LayoutType::ncsp, Precision::I32}, - {LayoutType::ncsp, Precision::I32, isAxisInputConst}}, + {LayoutType::ncsp, idxPrecision}, + {LayoutType::ncsp, axisPrecision, isAxisInputConst}}, {{LayoutType::ncsp, dataPrecision}}, ref_any); @@ -232,10 +239,10 @@ void Gather::createPrimitive() { uint64_t idxElPerVec = 1; if (!isDynamicNode()) { idxElPerVec = x64::mayiuse(x64::avx512_core) ? x64::cpu_isa_traits::vlen / idxTypeSize : - x64::mayiuse(x64::avx2) ? x64::cpu_isa_traits::vlen / idxTypeSize : 1; + x64::mayiuse(x64::avx2) ? x64::cpu_isa_traits::vlen / idxTypeSize : 1; } // Gather instruction is not supported by SSE. 
- if ((x64::mayiuse(x64::avx512_core) || x64::mayiuse(x64::avx2)) && + if ((x64::mayiuse(x64::avx512_core) || x64::mayiuse(x64::avx2)) && dataTypeSize <= 4 && idxTypeSize == 4 && (isDynamicNode() || afterAxisSize == 1 || (afterAxisSize <= idxElPerVec && (x64::mayiuse(x64::avx512_core) || (x64::mayiuse(x64::avx2) && dataTypeSize == 4))))) { jGatherConfParams jcp; @@ -298,31 +305,44 @@ void Gather::createPrimitive() { } bool Gather::needPrepareParams() const { - if (isInPlace()) { - return false; + if (inputShapesModified()) { + return true; + } else if (!isAxisInputConst) { + auto mem = getParentEdgeAt(GATHER_AXIS)->getMemoryPtr(); + int64_t newAxis = axis; + if (mem->getDesc().getPrecision() == Precision::I64) { + newAxis = (reinterpret_cast(mem->getData()))[0]; + } else if (mem->getDesc().getPrecision() == Precision::I32) { + newAxis = (reinterpret_cast(mem->getData()))[0]; + } + if (newAxis != axis) { + return true; + } } - bool result = inputShapesModified(); - if (!isAxisInputConst) - result = result || axis != (reinterpret_cast(getParentEdgeAt(GATHER_AXIS)->getMemoryPtr()->getData()))[0]; - return result; + return false; } void Gather::prepareParams() { auto dataMemPtr = getParentEdgeAt(GATHER_DATA)->getMemoryPtr(); if (!dataMemPtr || !dataMemPtr->isAllocated()) - THROW_ERROR << " has not allocated input data memory."; + THROW_CPU_NODE_ERR << " has not allocated input data memory."; auto idxMemPtr = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr(); if (!idxMemPtr || !idxMemPtr->isAllocated()) - THROW_ERROR << " has not allocated input indices memory."; + THROW_CPU_NODE_ERR << " has not allocated input indices memory."; if (getSelectedPrimitiveDescriptor() == nullptr) - THROW_ERROR << " has unidentified preferable primitive descriptor."; + THROW_CPU_NODE_ERR << " has unidentified preferable primitive descriptor."; if (!isAxisInputConst) { - axis = (reinterpret_cast(getParentEdgeAt(GATHER_AXIS)->getMemoryPtr()->getData()))[0]; + auto mem = 
getParentEdgeAt(GATHER_AXIS)->getMemoryPtr(); + if (mem->getDesc().getPrecision() == Precision::I64) { + axis = (reinterpret_cast(mem->getData()))[0]; + } else if (mem->getDesc().getPrecision() == Precision::I32) { + axis = (reinterpret_cast(mem->getData()))[0]; + } if (axis < 0) axis += dataSrcRank; if (axis < 0 || axis >= dataSrcRank || batchDims > axis) - THROW_ERROR << "has incorrect input parameter axis value: " << axis; + THROW_CPU_NODE_ERR << "has incorrect input parameter axis value: " << axis; } if (!isDataShapeStat || !isAxisInputConst) { @@ -358,6 +378,9 @@ void Gather::prepareParams() { } else if (x64::mayiuse(x64::avx2)) { selectedPD->setImplementationType(jit_avx2); } + } else { + // TODO: Add tests + selectedPD->setImplementationType(ref_any); } #endif } @@ -415,7 +438,9 @@ void Gather::execute(dnnl::stream strm) { return; } #endif - execReference(); + OV_SWITCH(intel_cpu, refExec, this, idxPrecision, + OV_CASE(Precision::I32, int32_t), + OV_CASE(Precision::I64, int64_t)) } void Gather::executeDynamicImpl(dnnl::stream strm) { @@ -477,12 +502,14 @@ void Gather::executeDynamicImpl(dnnl::stream strm) { return; } #endif - execReference(); + OV_SWITCH(intel_cpu, refExec, this, idxPrecision, + OV_CASE(Precision::I32, int32_t), + OV_CASE(Precision::I64, int64_t)) } void Gather::initShortParams(threadExecParams& p, const uint64_t start) { if (!jitKernel) - THROW_ERROR << "has uninitialized kernel in function initShortParams."; + THROW_CPU_NODE_ERR << "has uninitialized kernel in function initShortParams."; const uint64_t idxElPerVec = jitKernel->getIdxElPerVec(); if (afterAxisSize == 1) { // Elementwise gather. 
@@ -547,8 +574,9 @@ void Gather::initShortParams(threadExecParams& p, const uint64_t start) { } } +template void Gather::execReference() { - const int32_t* srcIndices = reinterpret_cast(getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->getData()); + const idxType* srcIndices = reinterpret_cast(getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->getData()); const uint8_t* srcData = reinterpret_cast(getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->getData()); uint8_t* dstData = reinterpret_cast(getChildEdgeAt(0)->getMemoryPtr()->getData()); @@ -579,6 +607,13 @@ void Gather::execReference() { }); } +template +struct Gather::refExec { + void operator()(Gather *node) { + node->execReference(); + } +}; + bool Gather::created() const { return getType() == Type::Gather; } diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index f03a08832a66f5..0ec20a0587008d 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -7,17 +7,13 @@ #include #include "kernels/x64/gather_uni_kernel.hpp" -#include -#include -#include - namespace ov { namespace intel_cpu { namespace node { class Gather : public Node { public: - Gather(const std::shared_ptr& op, const GraphContext::CPtr context); + Gather(const std::shared_ptr& op, const GraphContext::CPtr context); void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; @@ -27,7 +23,7 @@ class Gather : public Node { bool isExecutable() const override; void resolveInPlaceEdges(Edge::LOOK look) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; struct threadExecParams { std::vector specIdxInBytes; @@ -53,7 +49,12 @@ class Gather : public Node { void prepareParams() override; private: + template + struct refExec; + void initShortParams(threadExecParams& p, uint64_t start); 
+ + template void execReference(); bool isDataShapeStat = false; @@ -63,7 +64,8 @@ class Gather : public Node { bool reverseIndexing = false; uint64_t dataTypeSize = 1lu; - static constexpr uint64_t idxTypeSize = sizeof(int); + uint64_t idxTypeSize = sizeof(int32_t); + InferenceEngine::Precision idxPrecision; int axis = 0; int axisDim = 0; diff --git a/src/plugins/intel_cpu/src/nodes/gather_nd.cpp b/src/plugins/intel_cpu/src/nodes/gather_nd.cpp index c029869faec4bb..79412e7fa7f33e 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_nd.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_nd.cpp @@ -2,28 +2,21 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include -#include -#include "ie_parallel.hpp" #include "gather_nd.h" -#include -#include -#include + +#include "ie_parallel.hpp" #include "common/cpu_memcpy.h" +#include using namespace InferenceEngine; -#define THROW_ERROR IE_THROW() << "GatherND layer with name '" << getName() << "' " - namespace ov { namespace intel_cpu { namespace node { -bool GatherND::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool GatherND::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), ngraph::op::v5::GatherND::get_type_info_static(), ngraph::op::v8::GatherND::get_type_info_static())) { + if (!one_of(op->get_type_info(), op::v5::GatherND::get_type_info_static(), op::v8::GatherND::get_type_info_static())) { errorMessage = "Node is not an instance of the GatherND operation from operation set v5 and v8."; return false; } @@ -34,51 +27,51 @@ bool GatherND::isSupportedOperation(const std::shared_ptr& o return true; } -GatherND::GatherND(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)) { +GatherND::GatherND(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, NgraphShapeInferFactory(op, 
EMPTY_PORT_MASK)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } if (inputShapes.size() != 2 && outputShapes.size() != 1) - THROW_ERROR << "has invalid number of input/output edges."; + THROW_CPU_NODE_ERR << "has invalid number of input/output edges."; const size_t dataInputRank = getInputShapeAtPort(GATHERND_DATA).getRank(); const size_t indicesInputRank = getInputShapeAtPort(GATHERND_INDEXES).getRank(); - if (auto gatherNdOp = ngraph::as_type_ptr(op)) { - attrs.batchDims = gatherNdOp->get_batch_dims(); - } else if (auto gatherNdOp = ngraph::as_type_ptr(op)) { + if (auto gatherNdOp = ov::as_type(op.get())) { attrs.batchDims = gatherNdOp->get_batch_dims(); } else { - THROW_ERROR << "has support only opset5."; + THROW_CPU_NODE_ERR << "has support only opset5."; } if (attrs.batchDims >= std::min(dataInputRank, indicesInputRank)) - THROW_ERROR << "has invalid batch_dims attribute: " << attrs.batchDims; + THROW_CPU_NODE_ERR << "has invalid batch_dims attribute: " << attrs.batchDims; } void GatherND::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - Precision inDataPrecision = getOriginalInputPrecisionAtPort(GATHERND_DATA); + auto inDataPrecision = getOriginalInputPrecisionAtPort(GATHERND_DATA); if (!one_of(inDataPrecision.size(), + sizeof(PrecisionTrait::value_type), sizeof(PrecisionTrait::value_type), sizeof(PrecisionTrait::value_type), sizeof(PrecisionTrait::value_type))) { - THROW_ERROR << "has unsupported 'data' input precision: " << inDataPrecision; + THROW_CPU_NODE_ERR << "has unsupported 'data' input precision: " << inDataPrecision; } attrs.dataSize = inDataPrecision.size(); - Precision indicesPrecision = getOriginalInputPrecisionAtPort(GATHERND_INDEXES); - if (!one_of(indicesPrecision, - Precision::I32, Precision::I64, Precision::I16, Precision::U16, Precision::I8, Precision::U8)) { - THROW_ERROR << "has unsupported 'indices' input precision: " << 
indicesPrecision; + auto indicesPrecision = getOriginalInputPrecisionAtPort(GATHERND_INDEXES); + if (indicesPrecision == Precision::U64) { + indicesPrecision = Precision::I64; + } else if (!one_of(indicesPrecision, Precision::I32, Precision::I64)) { + indicesPrecision = Precision::I32; } addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, Precision::I32}}, + {LayoutType::ncsp, indicesPrecision}}, {{LayoutType::ncsp, inDataPrecision}}, impl_desc_type::ref_any); } @@ -88,13 +81,13 @@ void GatherND::prepareParams() { auto idxMemPtr = getParentEdgeAt(GATHERND_INDEXES)->getMemoryPtr(); auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); if (!srcMemPtr || !srcMemPtr->isAllocated()) - THROW_ERROR << " has not allocated input memory of 'data'."; + THROW_CPU_NODE_ERR << " has not allocated input memory of 'data'."; if (!idxMemPtr || !idxMemPtr->isAllocated()) - THROW_ERROR << " has not allocated input memory of 'indices'."; + THROW_CPU_NODE_ERR << " has not allocated input memory of 'indices'."; if (!dstMemPtr || !dstMemPtr->isAllocated()) - THROW_ERROR << " has not allocated output memory."; + THROW_CPU_NODE_ERR << " has not allocated output memory."; if (getSelectedPrimitiveDescriptor() == nullptr) - THROW_ERROR << " has unidentified preferable primitive descriptor."; + THROW_CPU_NODE_ERR << " has unidentified preferable primitive descriptor."; attrs.srcDims = srcMemPtr->getStaticDims(); attrs.srcStrides = srcMemPtr->getDescWithType()->getStrides(); @@ -129,7 +122,7 @@ GatherND::GatherNDExecutor::GatherNDExecutor(const GatherNDAttributes& attrs) : void GatherND::execute(dnnl::stream strm) { if (!execPtr) - THROW_ERROR << "has not compiled executor."; + THROW_CPU_NODE_ERR << "has not compiled executor."; execPtr->exec(getParentEdgeAt(GATHERND_DATA)->getMemoryPtr(), getParentEdgeAt(GATHERND_INDEXES)->getMemoryPtr(), @@ -144,15 +137,16 @@ void GatherND::GatherNDExecutor::exec(const MemoryPtr& srcMemPtr, const MemoryPt GatherNDContext ctx { this, 
srcMemPtr, idxMemPtr, dstMemPtr }; OV_SWITCH(intel_cpu, GatherNDEmitter, ctx, dataSize, + OV_CASE(sizeof(PrecisionTrait::value_type), PrecisionTrait::value_type), OV_CASE(sizeof(PrecisionTrait::value_type), PrecisionTrait::value_type), OV_CASE(sizeof(PrecisionTrait::value_type), PrecisionTrait::value_type), OV_CASE(sizeof(PrecisionTrait::value_type), PrecisionTrait::value_type)); } void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { - const uint8_t* srcData = reinterpret_cast(srcMemPtr->getData()); - const int32_t* indices = reinterpret_cast(idxMemPtr->getData()); - uint8_t* dstData = reinterpret_cast(dstMemPtr->getData()); + auto srcData = reinterpret_cast(srcMemPtr->getData()); + auto indices = idxMemPtr->getData(); + auto dstData = reinterpret_cast(dstMemPtr->getData()); parallel_nt(0, [&](const int ithr, const int nthr) { size_t start(0lu), end(0lu); @@ -164,32 +158,55 @@ void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, const size_t workCounter = start; const uint8_t* shiftedSrcData = srcData + bStart * srcBatchStride; - const int32_t* shiftedIndices = indices + bStart * idxBatchStride + cStart * sliceRank; uint8_t* shiftedDstData = dstData + bStart * dstBatchStride + cStart * dataLength; - for (size_t b = bStart; b < batchSize; b++) { - for (size_t j = cStart; j < cycles; j++) { - size_t dataIdx = 0lu; - for (size_t i = 0; i < sliceRank; i++) - dataIdx += srcShifts[i] * shiftedIndices[i]; - cpu_memcpy(shiftedDstData, &(shiftedSrcData[dataIdx]), dataLength); - shiftedDstData += dataLength; - shiftedIndices += sliceRank; - if (++workCounter == end) { - return; + if (idxMemPtr->getDataType() == dnnl::memory::data_type::s32) { + const int32_t* shiftedIndices = reinterpret_cast(indices) + + bStart * idxBatchStride + cStart * sliceRank; + + for (size_t b = bStart; b < batchSize; b++) { + for (size_t j = cStart; j < cycles; j++) { + size_t dataIdx = 0lu; + for 
(size_t i = 0; i < sliceRank; i++) + dataIdx += srcShifts[i] * shiftedIndices[i]; + cpu_memcpy(shiftedDstData, &(shiftedSrcData[dataIdx]), dataLength); + shiftedDstData += dataLength; + shiftedIndices += sliceRank; + if (++workCounter == end) { + return; + } } + cStart = 0; + shiftedSrcData += srcBatchStride; + } + } else { + const int64_t* shiftedIndices = reinterpret_cast(indices) + + bStart * idxBatchStride + cStart * sliceRank; + + for (size_t b = bStart; b < batchSize; b++) { + for (size_t j = cStart; j < cycles; j++) { + size_t dataIdx = 0lu; + for (size_t i = 0; i < sliceRank; i++) + dataIdx += srcShifts[i] * shiftedIndices[i]; + cpu_memcpy(shiftedDstData, &(shiftedSrcData[dataIdx]), dataLength); + shiftedDstData += dataLength; + shiftedIndices += sliceRank; + if (++workCounter == end) { + return; + } + } + cStart = 0; + shiftedSrcData += srcBatchStride; } - cStart = 0; - shiftedSrcData += srcBatchStride; } }); } template void GatherND::GatherNDExecutor::gatherElementwise(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { - const dataType* srcData = reinterpret_cast(srcMemPtr->getData()); - const int32_t* indices = reinterpret_cast(idxMemPtr->getData()); - dataType* dstData = reinterpret_cast(dstMemPtr->getData()); + auto srcData = reinterpret_cast(srcMemPtr->getData()); + auto indices = idxMemPtr->getData(); + auto dstData = reinterpret_cast(dstMemPtr->getData()); parallel_nt(0, [&](const int ithr, const int nthr) { size_t start(0lu), end(0lu); @@ -201,23 +218,46 @@ void GatherND::GatherNDExecutor::gatherElementwise(const MemoryPtr& srcMemPtr, c size_t workCounter = start; const dataType* shiftedSrcData = srcData + bStart * srcBatchStride; - const int32_t* shiftedIndices = indices + bStart * idxBatchStride + cStart * sliceRank; dataType* shiftedDstData = dstData + bStart * dstBatchStride + cStart * dataLength; - for (size_t b = bStart; b < batchSize; b++) { - for (size_t j = cStart; j < cycles; j++) { - size_t dataIdx = 
0lu; - for (size_t i = 0lu; i < sliceRank; i++) - dataIdx += srcShifts[i] * shiftedIndices[i]; - shiftedDstData[0] = shiftedSrcData[dataIdx]; - shiftedDstData++; - shiftedIndices += sliceRank; - if (++workCounter == end) { - return; + if (idxMemPtr->getDataType() == dnnl::memory::data_type::s32) { + const int32_t* shiftedIndices = reinterpret_cast(indices) + + bStart * idxBatchStride + cStart * sliceRank; + + for (size_t b = bStart; b < batchSize; b++) { + for (size_t j = cStart; j < cycles; j++) { + size_t dataIdx = 0lu; + for (size_t i = 0lu; i < sliceRank; i++) + dataIdx += srcShifts[i] * shiftedIndices[i]; + shiftedDstData[0] = shiftedSrcData[dataIdx]; + shiftedDstData++; + shiftedIndices += sliceRank; + if (++workCounter == end) { + return; + } + } + cStart = 0lu; + shiftedSrcData += srcBatchStride; + } + } else { + const int64_t* shiftedIndices = reinterpret_cast(indices) + + bStart * idxBatchStride + cStart * sliceRank; + + for (size_t b = bStart; b < batchSize; b++) { + for (size_t j = cStart; j < cycles; j++) { + size_t dataIdx = 0lu; + for (size_t i = 0lu; i < sliceRank; i++) + dataIdx += srcShifts[i] * shiftedIndices[i]; + shiftedDstData[0] = shiftedSrcData[dataIdx]; + shiftedDstData++; + shiftedIndices += sliceRank; + if (++workCounter == end) { + return; + } } + cStart = 0lu; + shiftedSrcData += srcBatchStride; } - cStart = 0lu; - shiftedSrcData += srcBatchStride; } }); } diff --git a/src/plugins/intel_cpu/src/nodes/gather_nd.h b/src/plugins/intel_cpu/src/nodes/gather_nd.h index 0fec5e23337354..1d7fae2beae4d5 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_nd.h +++ b/src/plugins/intel_cpu/src/nodes/gather_nd.h @@ -4,11 +4,7 @@ #pragma once -#include #include -#include -#include -#include namespace ov { namespace intel_cpu { @@ -16,14 +12,14 @@ namespace node { class GatherND : public Node { public: - GatherND(const std::shared_ptr& op, const GraphContext::CPtr context); + GatherND(const std::shared_ptr& op, const GraphContext::CPtr context); void 
getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; protected: void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp index 798b04078352bf..af5c9fbe50d78f 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp @@ -11,8 +11,8 @@ using namespace InferenceEngine; using namespace dnnl::impl::cpu; -using namespace ov::intel_cpu; using namespace ov::intel_cpu::node; +using namespace ov::intel_cpu::kernel; #define THROW_ERROR IE_THROW() << getTypeStr() << " node with name '" << getName() << "' " @@ -145,7 +145,7 @@ void GridSample::createPrimitive() { if (!jitKernel) { THROW_ERROR << " could not create JIT kernel."; } - jitKernel->create_ker(); + jitKernel->create_kernel(); nthr = parallel_get_max_threads(); execParamsPerThread.resize(nthr); diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp index 89a1a409764615..774f85f3b69a73 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp @@ -58,8 +58,8 @@ class GridSample : public Node { private: bool alignCorners = false; - GridSampleInterpolationMode interpolationMode = GridSampleInterpolationMode::BILINEAR; - GridSamplePaddingMode paddingMode = GridSamplePaddingMode::ZEROS; + kernel::GridSampleInterpolationMode interpolationMode = kernel::GridSampleInterpolationMode::BILINEAR; + kernel::GridSamplePaddingMode paddingMode = kernel::GridSamplePaddingMode::ZEROS; uint64_t dataTypeSize = 1lu; uint64_t gridTypeSize = 1lu; @@ -72,7 +72,7 @@ 
class GridSample : public Node { static constexpr size_t IN_DATA = 0; static constexpr size_t IN_GRID = 1; - std::shared_ptr jitKernel; + std::shared_ptr jitKernel; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index e153a55b011ace..fe3f1609882b87 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -26,7 +26,7 @@ using namespace dnnl; using namespace InferenceEngine; using namespace details; -using namespace ngraph::op; +using namespace ov::op; using namespace dnnl::impl::cpu::x64; using namespace Xbyak; @@ -232,7 +232,7 @@ jit_has_subnormals_base::fn_t jit_has_subnormals_function() { } // namespace #endif -Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context) +Input::Input(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, PassThroughShapeInferFactory()) { if (!one_of(op->get_type_info(), v0::Parameter::get_type_info_static(), @@ -244,7 +244,7 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr c constant = ConstantType::NoConst; - constOp = ngraph::as_type_ptr(op); + constOp = ov::as_type_ptr(op); if (constOp) { constant = ConstantType::Const; cloneBlobIfRequired(); @@ -252,7 +252,7 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr c } void Input::cloneBlobIfRequired() { - Shape shape(constOp->get_shape().empty() ? ngraph::Shape(1, 1) : constOp->get_shape()); + Shape shape(constOp->get_shape().empty() ? 
ov::Shape(1, 1) : constOp->get_shape()); const auto prec = convertPrecision(constOp->get_element_type()); const size_t size = shape.getElementsCount(); DnnlBlockedMemoryDesc memDesc(prec, shape); @@ -379,7 +379,7 @@ Input::Input(const Shape& shape, const InferenceEngine::Precision& prc, const std::string& name, const std::string& type, - const GraphContext::CPtr context) + const GraphContext::CPtr& context) : Node(type, name, context) { constant = ConstantType::NoConst; if (getType() == Type::Input) { @@ -391,7 +391,7 @@ Input::Input(const Shape& shape, } } -Input::Input(MemoryDescPtr memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr context) +Input::Input(const MemoryDescPtr& memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr& context) : Input(memDesc->getShape(), memDesc->getPrecision(), name, type, context) { extMemDesc = memDesc; } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 71ae6b91e7660c..a1adb15a6244e4 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -15,13 +15,13 @@ namespace node { class Input : public Node { public: - Input(const std::shared_ptr& op, const GraphContext::CPtr context); + Input(const std::shared_ptr& op, const GraphContext::CPtr& context); Input(const Shape& shape, const InferenceEngine::Precision& prc, const std::string& name, const std::string& type, - const GraphContext::CPtr context); - Input(MemoryDescPtr memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr context); + const GraphContext::CPtr& context); + Input(const MemoryDescPtr& memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp 
b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp index 4f24e7ac2d7a34..31372bdf65e69c 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp @@ -1025,6 +1025,9 @@ void jitUniGatherKernel::fillVlenVector() { template bool jitUniGatherKernel::isSupportedConfiguration(uint64_t afterAxisSize) { + if (jcp.dataTypeSize > 4 || jcp.idxPrc != InferenceEngine::Precision::I32) { + return false; + } if (!jcp.dynamicShapes && afterAxisSize <= idxElPerVec) { if (afterAxisSize > 1 && isa == x64::avx2 && (jcp.dataTypeSize == 1 || jcp.dataTypeSize == 2)) // There are no enough registers for these cases. diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp index aec991ba26360c..0548108948dbcf 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp @@ -24,12 +24,14 @@ #include "cpu/x64/jit_generator.hpp" #include +#include namespace ov { namespace intel_cpu { struct jGatherConfParams { uint64_t dataTypeSize = 1lu; + InferenceEngine::Precision idxPrc = InferenceEngine::Precision::I32; bool reverseIndexing = true; bool dynamicShapes = false; uint64_t batchDims = 0lu; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp index 7501dd606427ce..ea9bb8a41105fe 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp @@ -1,19 +1,17 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "grid_sample.hpp" using namespace dnnl::impl::cpu; - -namespace ov { -namespace intel_cpu { +using namespace ov::intel_cpu::kernel; #define GET_OFF(field) 
offsetof(GridSamplesKernelExecArgs, field) template -GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) : - GridSampleKernelBase(jit_name(), jcp) { +GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) + : GridSampleKernelBase(jit_name(), jcp, isa) { vlen = x64::cpu_isa_traits::vlen; dataTypeSize = jcp.inDataPrc.size(); gridTypeSize = jcp.gridPrc.size(); @@ -25,14 +23,6 @@ GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) : dataTypeShift = 2; } -template -void GridSampleKernel::create_ker() { - auto code = x64::jit_generator::create_kernel(); - if (code != dnnl::impl::status::success) - IE_THROW() << "Could not create GridSample kernel. Error code: " << std::to_string(code); - ker_ = (decltype(ker_))jit_ker(); -} - template void GridSampleKernel::generate() { this->preamble(); @@ -2084,6 +2074,3 @@ void GridSampleKernel::hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, co template class GridSampleKernel; template class GridSampleKernel; template class GridSampleKernel; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp index c24100259cd5bb..3883c2b1b7de0f 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp @@ -10,6 +10,7 @@ namespace ov { namespace intel_cpu { +namespace kernel { enum class GridSampleInterpolationMode { BILINEAR, BICUBIC, NEAREST }; enum class GridSamplePaddingMode { ZEROS, BORDER, REFLECTION }; @@ -59,28 +60,16 @@ enum coord { w, h }; -class GridSampleKernelBase: public JitKernelBase { +class GridSampleKernelBase: public JitKernel { public: - void (*ker_)(const GridSamplesKernelExecArgs *); - void operator()(const GridSamplesKernelExecArgs *args) { - assert(ker_); - ker_(args); - } - explicit GridSampleKernelBase(const char* name, const GridSampleKernelConfParams& 
jcp) : JitKernelBase(name), ker_(nullptr), jcp(jcp) {} + explicit GridSampleKernelBase(const char* name, const GridSampleKernelConfParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t isa) + : JitKernel(name, jcp, isa) {} - virtual void create_ker() = 0; - uint64_t getVecLen() { - return vlen; - } uint64_t getDataElPerVec() { return dataElPerVec; } - uint64_t getGridElPerVec() { - return gridElPerVec; - } protected: - GridSampleKernelConfParams jcp; uint64_t vlen = 16lu; uint64_t dataTypeSize = 1lu; uint64_t gridTypeSize = 1lu; @@ -95,7 +84,6 @@ class GridSampleKernel : public GridSampleKernelBase { explicit GridSampleKernel(const GridSampleKernelConfParams& jcp); - void create_ker() override; void generate() override; using Vmm = typename dnnl::impl::utils::conditional3::vlen; paddd(xmmDst, op.getAddress()); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); paddd(xmmDst, ptr[op.getAddress().getRegExp() + vlen]); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); } else { IE_THROW() << "Not supported operand type."; } } else if (isValidIsa(x64::sse41)) { - assert(vDst.getIdx() != vSrc.getIdx()); - paddd(vDst, op); + assert(vmm_dst.getIdx() != vmm_src.getIdx()); + paddd(vmm_dst, op); } else { IE_THROW() << "Not defined behavior for instruction 'vpaddd' in current instructions set."; } } -void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& vDst, - const Xbyak::Ymm& vSrc, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vaddpd(const Xmm& vmm_dst, const Operand &op1, const Operand &op2) { + if (isValidIsa(x64::avx)) { + vaddpd(vmm_dst, op1, op2); + } else { + if (vmm_dst.getIdx() != op1.getIdx()) { + movupd(vmm_dst, op1); + } + addpd(vmm_dst, op2); + } +} + +void JitKernelBase::uni_vpsubd(const Ymm& vmm_dst, + const Ymm& vmm_src, + const Operand& op) { if (isValidIsa(x64::avx2)) { - vpsubd(vDst, vSrc, op); + vpsubd(vmm_dst, vmm_src, op); } else if (isValidIsa(x64::avx)) { - Xbyak::Xmm xmmDst(vDst.getIdx()); - 
vmovups(vDst, vSrc); + Xmm xmmDst(vmm_dst.getIdx()); + vmovups(vmm_dst, vmm_src); if (op.isYMM()) { - Xbyak::Ymm ymmOp(op.getIdx()); - Xbyak::Xmm xmmOp(op.getIdx()); + Ymm ymmOp(op.getIdx()); + Xmm xmmOp(op.getIdx()); psubd(xmmDst, xmmOp); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); vperm2f128(ymmOp, ymmOp, ymmOp, 0x1); psubd(xmmDst, xmmOp); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); vperm2f128(ymmOp, ymmOp, ymmOp, 0x1); } else if (op.isMEM()) { const int vlen = x64::cpu_isa_traits::vlen; psubd(xmmDst, op.getAddress()); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); psubd(xmmDst, ptr[op.getAddress().getRegExp() + vlen]); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); } else { IE_THROW() << "Not supported operand type."; } } else if (isValidIsa(x64::sse41)) { - assert(vDst.getIdx() != vSrc.getIdx()); - psubd(vDst, op); + if (vmm_dst.getIdx() != vmm_src.getIdx()) { + movups(vmm_dst, vmm_src); + } + psubd(vmm_dst, op); } else { IE_THROW() << "Not defined behavior for instruction 'vpsubd' in current instructions set."; } } -void JitKernelBase::uni_vdivps(const Xbyak::Xmm& vDst, - const Xbyak::Operand& op1, - const Xbyak::Operand& op2) { +void JitKernelBase::uni_vmulpd(const Xmm& vmm_dst, + const Operand& op1, + const Operand& op2) { + if (isValidIsa(x64::avx)) { + vmulpd(vmm_dst, op1, op2); + } else { + if (vmm_dst.getIdx() != op1.getIdx()) { + movupd(vmm_dst, op1); + } + mulpd(vmm_dst, op2); + } +} + +void JitKernelBase::uni_vdivps(const Xmm& vmm_dst, + const Operand& op1, + const Operand& op2) { + if (isValidIsa(x64::avx)) { + vdivps(vmm_dst, op1, op2); + } else { + if (!vmm_dst.isEqualIfNotInherited(op1)) { + movups(vmm_dst, op1); + } + divps(vmm_dst, op2); + } +} + +void JitKernelBase::uni_vdivpd(const Xmm& vmm_dst, + const Operand& op1, + const Operand& op2) { + if (isValidIsa(x64::avx)) { + vdivpd(vmm_dst, op1, 
op2); + } else { + if (vmm_dst.getIdx() != op1.getIdx()) { + movupd(vmm_dst, op1); + } + divpd(vmm_dst, op2); + } +} + +void JitKernelBase::uni_vandps(const Xmm& vmm_dst, + const Xmm& vmm_src, + const Operand &op) { + if (isValidIsa(x64::avx)) { + vandps(vmm_dst, vmm_src, op); + } else { + if (vmm_dst.getIdx() != vmm_src.getIdx()) { + movups(vmm_dst, vmm_src); + } + andps(vmm_dst, op); + } +} + +void JitKernelBase::uni_vandpd(const Xmm& vmm_dst, + const Xmm& vmm_src, + const Operand &op) { + if (isValidIsa(x64::avx)) { + vandpd(vmm_dst, vmm_src, op); + } else { + if (vmm_dst.getIdx() != vmm_src.getIdx()) { + movupd(vmm_dst, vmm_src); + } + andpd(vmm_dst, op); + } +} + +void JitKernelBase::uni_vandnps(const Xmm& vmm_dst, + const Xmm& vmm_src, + const Operand &op) { + if (isValidIsa(x64::avx)) { + vandnps(vmm_dst, vmm_src, op); + } else { + if (!vmm_dst.isEqualIfNotInherited(vmm_src)) { + movups(vmm_dst, vmm_src); + } + andnps(vmm_dst, op); + } +} + +void JitKernelBase::uni_vorpd(const Xmm& vmm_dst, + const Xmm& vmm_src, + const Operand &op) { if (isValidIsa(x64::avx)) { - vdivps(vDst, op1, op2); + vorpd(vmm_dst, vmm_src, op); } else { - if (!vDst.isEqualIfNotInherited(op1)) { - movups(vDst, op1); + if (vmm_dst.getIdx() != vmm_src.getIdx()) { + movupd(vmm_dst, vmm_src); } - divps(vDst, op2); + orpd(vmm_dst, op); } } -void JitKernelBase::uni_vandps(const Xbyak::Xmm& vDst, - const Xbyak::Xmm& vSrs, - const Xbyak::Operand &op) { +void JitKernelBase::uni_vcmppd(const Xmm& vmm_dst, + const Xmm &vmm_src, + const Operand &op, + const uint8_t imm) { if (isValidIsa(x64::avx)) { - vandps(vDst, vSrs, op); + vcmppd(vmm_dst, vmm_src, op, imm); } else { - if (!vDst.isEqualIfNotInherited(vSrs)) { - movups(vDst, vSrs); + if (vmm_dst.getIdx() != vmm_src.getIdx()) { + movupd(vmm_dst, vmm_src); } - andps(vDst, op); + cmppd(vmm_dst, op, imm); } } -void JitKernelBase::uni_vandnps(const Xbyak::Xmm& vDst, - const Xbyak::Xmm& vSrs, - const Xbyak::Operand &op) { +void 
JitKernelBase::uni_vmaxpd(const Xmm& vmm_dst, const Operand &op1, const Operand &op2) { if (isValidIsa(x64::avx)) { - vandnps(vDst, vSrs, op); + vmaxpd(vmm_dst, op1, op2); } else { - if (!vDst.isEqualIfNotInherited(vSrs)) { - movups(vDst, vSrs); + if (vmm_dst.getIdx() != op1.getIdx()) { + movupd(vmm_dst, op1); } - andnps(vDst, op); + maxpd(vmm_dst, op2); } } -void JitKernelBase::gatherdd(const Xbyak::Xmm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Opmask& kReadMask, +void JitKernelBase::uni_vminpd(const Xmm& vmm_dst, const Operand &op1, const Operand &op2) { + if (isValidIsa(x64::avx)) { + vminpd(vmm_dst, op1, op2); + } else { + if (vmm_dst.getIdx() != op1.getIdx()) { + movupd(vmm_dst, op1); + } + minpd(vmm_dst, op2); + } +} + +void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm &vmm_dst, const Xbyak::Operand &op) { + if (isValidIsa(x64::avx)) { + vcvtpd2dq(vmm_dst, op); + } else { + cvtpd2dq(vmm_dst, op); + } +} + +void JitKernelBase::uni_vcvtpd2ps(const Xbyak::Xmm &vmm_dst, const Xbyak::Operand &op) { + if (isValidIsa(x64::avx)) { + vcvtpd2ps(vmm_dst, op); + } else { + cvtpd2ps(vmm_dst, op); + } +} + +void JitKernelBase::gatherdd(const Xmm& vmm_dst, + const Reg64& rSrcPtr, + const Xmm& vSrcShift, + const Opmask& kReadMask, const bool useMask, const bool zeroFill) { if (kReadMask.getIdx() == 0) { @@ -178,42 +299,42 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& vDst, if (!useMask) kxnord(kReadMask, kReadMask, kReadMask); if (zeroFill) - uni_vpxor(vDst, vDst, vDst); + uni_vpxor(vmm_dst, vmm_dst, vmm_dst); - vpgatherdd(vDst | kReadMask, ptr[rSrcPtr + vSrcShift]); + vpgatherdd(vmm_dst | kReadMask, ptr[rSrcPtr + vSrcShift]); } -void JitKernelBase::gatherdd(const Xbyak::Xmm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Xmm& vReadMask, +void JitKernelBase::gatherdd(const Xmm& vmm_dst, + const Reg64& rSrcPtr, + const Xmm& vSrcShift, + const Xmm& vReadMask, const bool useMask, const bool 
zeroFill) { - if (vDst.getIdx() == vSrcShift.getIdx() || vDst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { + if (vmm_dst.getIdx() == vSrcShift.getIdx() || vmm_dst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { IE_THROW() << "Any pair of the index, mask, or destination registers cannot be the same."; } if (zeroFill) - pxor(vDst, vDst); // Don't use vpxor. It zeros the rest of the YMM register. + pxor(vmm_dst, vmm_dst); // Don't use vpxor. It zeros the rest of the YMM register. if (isValidIsa(x64::avx2)) { if (!useMask) uni_vpcmpeqd(vReadMask, vReadMask, vReadMask); - vpgatherdd(vDst, ptr[rSrcPtr + vSrcShift], vReadMask); + vpgatherdd(vmm_dst, ptr[rSrcPtr + vSrcShift], vReadMask); } else { auto rAux = getReg64(); - Xbyak::Reg32 r32Aux = Xbyak::Reg32(rAux.getIdx()); + Reg32 r32Aux = Reg32(rAux.getIdx()); const uint8_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(int); for (uint8_t i = 0; i < elPerVec; i++) { - Xbyak::Label lLoopNext; + Label lLoopNext; if (useMask) { uni_vpextrd(r32Aux, vReadMask, i); cmp(r32Aux, 0); // TODO: check significant bit je(lLoopNext, T_NEAR); } uni_vpextrd(r32Aux, vSrcShift, i); - pinsrd(vDst, ptr[rSrcPtr + rAux], i); + pinsrd(vmm_dst, ptr[rSrcPtr + rAux], i); if (useMask) L(lLoopNext); @@ -221,30 +342,30 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& vDst, } } -void JitKernelBase::gatherdd(const Xbyak::Ymm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Ymm& vReadMask, +void JitKernelBase::gatherdd(const Ymm& vmm_dst, + const Reg64& rSrcPtr, + const Ymm& vSrcShift, + const Ymm& vReadMask, const bool useMask, const bool zeroFill) { - if (vDst.getIdx() == vSrcShift.getIdx() || vDst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { + if (vmm_dst.getIdx() == vSrcShift.getIdx() || vmm_dst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { IE_THROW() << "Any pair of the index, mask, or 
destination registers cannot be the same."; } if (isValidIsa(x64::avx2)) { if (!useMask) uni_vpcmpeqd(vReadMask, vReadMask, vReadMask); if (zeroFill) - uni_vpxor(vDst, vDst, vDst); + uni_vpxor(vmm_dst, vmm_dst, vmm_dst); - vpgatherdd(vDst, ptr[rSrcPtr + vSrcShift], vReadMask); + vpgatherdd(vmm_dst, ptr[rSrcPtr + vSrcShift], vReadMask); } else { - Xbyak::Xmm xmmDst = Xbyak::Xmm(vDst.getIdx()), - xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()), - xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()); + Xmm xmmDst = Xmm(vmm_dst.getIdx()), + xmmSrcShft = Xmm(vSrcShift.getIdx()), + xmmReadMask = Xmm(vReadMask.getIdx()); for (uint8_t i = 0; i < 2; i++) { gatherdd(xmmDst, rSrcPtr, xmmSrcShft, xmmReadMask, useMask, zeroFill); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); vperm2f128(vSrcShift, vSrcShift, vSrcShift, 0x1); if (useMask) vperm2f128(vReadMask, vReadMask, vReadMask, 0x1); @@ -252,7 +373,7 @@ void JitKernelBase::gatherdd(const Xbyak::Ymm& vDst, } } -void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastd(const Xmm &x, const Operand &op) { if (isValidIsa(x64::avx2)) { vpbroadcastd(x, op); } else if (isValidIsa(x64::avx)) { @@ -268,14 +389,14 @@ void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand & } } -void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastd(const Ymm &x, const Operand &op) { if (isValidIsa(x64::avx2)) { vpbroadcastd(x, op); } else { if (op.isMEM()) { vbroadcastss(x, op.getAddress()); } else { - const Xbyak::Xmm t(x.getIdx()); + const Xmm t(x.getIdx()); if (!t.isEqualIfNotInherited(op)) { vmovss(t, t, op); } @@ -285,8 +406,8 @@ void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand & } } -void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, - const Xbyak::Reg64& rWorkRest) { +void JitKernelBase::fillRestWorkMask(const Opmask& 
dstMask, + const Reg64& rWorkRest) { auto rOnes = getReg64(); mov(rOnes, 0xFFFFFFFFFFFFFFFF); @@ -295,15 +416,15 @@ void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, kmovq(dstMask, rOnes); } -void JitKernelBase::fillRestWorkMask(const Xbyak::Xmm& xmmDstMask, - const Xbyak::Reg64& rWorkRest, +void JitKernelBase::fillRestWorkMask(const Xmm& xmmDstMask, + const Reg64& rWorkRest, const uint64_t typeSize) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { IE_THROW() << "Could not fill data with type size " << typeSize; } - Xbyak::Label lEnd; + Label lEnd; auto r32Ones = getReg32(); - Xbyak::Reg64 r64Ones(r32Ones.getIdx()); + Reg64 r64Ones(r32Ones.getIdx()); auto elPerVec = x64::cpu_isa_traits::vlen / typeSize; mov(r64Ones, 0xFFFFFFFFFFFFFFFF); @@ -324,22 +445,22 @@ void JitKernelBase::fillRestWorkMask(const Xbyak::Xmm& xmmDstMask, L(lEnd); } -void JitKernelBase::fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, +void JitKernelBase::fillRestWorkMask(const Ymm& ymmDstMask, + const Reg64& rWorkRest, const uint64_t typeSize) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { IE_THROW() << "Could not fill data with type size " << typeSize; } - Xbyak::Label lEnd; + Label lEnd; auto elPerVec = x64::cpu_isa_traits::vlen / typeSize; auto r32Ones = getReg32(); - Xbyak::Reg64 r64Ones(r32Ones.getIdx()); - Xbyak::Xmm xmmDstMask(ymmDstMask.getIdx()); + Reg64 r64Ones(r32Ones.getIdx()); + Xmm xmmDstMask(ymmDstMask.getIdx()); mov(r64Ones, 0xFFFFFFFFFFFFFFFF); uni_vpxor(ymmDstMask, ymmDstMask, ymmDstMask); for (uint8_t i = 0; i < 2; i++) { - Xbyak::Label lPerm; + Label lPerm; for (uint8_t j = 0; j < elPerVec; j++) { cmp(rWorkRest, i * elPerVec + j); jle(i == 0 ? 
lEnd : lPerm, T_NEAR); @@ -362,18 +483,18 @@ void JitKernelBase::fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, L(lEnd); } -void JitKernelBase::load(const Xbyak::Xmm& vDst, - const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFilling) { - if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { +void JitKernelBase::load(const Xmm& vmm_dst, + const Address& srcAddr, + const Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFilling) { + if (!one_of(typeSize, 1lu, 2lu, 4lu, 8lu)) { IE_THROW() << "Could not load data with type size " << typeSize; } const uint8_t elPerVec = x64::cpu_isa_traits::vlen / typeSize; - Xbyak::Label lEnd; + Label lEnd; if (zeroFilling) - pxor(vDst, vDst); + pxor(vmm_dst, vmm_dst); for (uint8_t i = 0; i < elPerVec; i++) { cmp(rLoadNum, i); @@ -381,33 +502,33 @@ void JitKernelBase::load(const Xbyak::Xmm& vDst, const size_t offset = i * typeSize; if (typeSize == 1) - pinsrb(vDst, ptr[srcAddr.getRegExp() + offset], i); + pinsrb(vmm_dst, ptr[srcAddr.getRegExp() + offset], i); else if (typeSize == 2) - pinsrw(vDst, ptr[srcAddr.getRegExp() + offset], i); + pinsrw(vmm_dst, ptr[srcAddr.getRegExp() + offset], i); else if (typeSize == 4) - pinsrd(vDst, ptr[srcAddr.getRegExp() + offset], i); + pinsrd(vmm_dst, ptr[srcAddr.getRegExp() + offset], i); else if (typeSize == 8) - pinsrq(vDst, ptr[srcAddr.getRegExp() + offset], i); + pinsrq(vmm_dst, ptr[srcAddr.getRegExp() + offset], i); } L(lEnd); } -void JitKernelBase::load(const Xbyak::Ymm& vDst, - const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFilling) { - if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { +void JitKernelBase::load(const Ymm& vmm_dst, + const Address& srcAddr, + const Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFilling) { + if (!one_of(typeSize, 1lu, 2lu, 4lu, 8lu)) { IE_THROW() << "Could not load data with type size " << typeSize; } const size_t elPerXmm = x64::cpu_isa_traits::vlen / 
typeSize; - Xbyak::Label lEnd; + Label lEnd; if (zeroFilling) - uni_vpxor(vDst, vDst, vDst); - Xbyak::Xmm xmmDst(vDst.getIdx()); + uni_vpxor(vmm_dst, vmm_dst, vmm_dst); + Xmm xmmDst(vmm_dst.getIdx()); for (size_t i = 0lu; i < 2lu; i++) { - Xbyak::Label lPerm; + Label lPerm; const size_t idx = i * elPerXmm; const size_t offset0 = idx * typeSize; @@ -427,19 +548,19 @@ void JitKernelBase::load(const Xbyak::Ymm& vDst, } L(lPerm); - vperm2f128(vDst, vDst, vDst, 0x1); + vperm2f128(vmm_dst, vmm_dst, vmm_dst, 0x1); } L(lEnd); } -void JitKernelBase::store(const Xbyak::Address& dstAddr, - const Xbyak::Xmm& vSrc, - const Xbyak::Reg64& rToStoreNum, +void JitKernelBase::store(const Address& dstAddr, + const Xmm& vmm_src, + const Reg64& rToStoreNum, const size_t typeSize) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { IE_THROW() << "Could not store data with type size " << typeSize; } - Xbyak::Label lEnd; + Label lEnd; const size_t elPerVec = x64::cpu_isa_traits::vlen / typeSize; for (size_t i = 0; i < elPerVec; i++) { @@ -448,31 +569,31 @@ void JitKernelBase::store(const Xbyak::Address& dstAddr, const size_t offset = i * typeSize; if (typeSize == 1) { - uni_vpextrb(ptr[dstAddr.getRegExp() + offset], vSrc, i); + uni_vpextrb(ptr[dstAddr.getRegExp() + offset], vmm_src, i); } else if (typeSize == 2) { - uni_vpextrw(ptr[dstAddr.getRegExp() + offset], vSrc, i); + uni_vpextrw(ptr[dstAddr.getRegExp() + offset], vmm_src, i); } else if (typeSize == 4) { - uni_vpextrd(ptr[dstAddr.getRegExp() + offset], vSrc, i); + uni_vpextrd(ptr[dstAddr.getRegExp() + offset], vmm_src, i); } else if (typeSize == 8) { - uni_vpextrq(ptr[dstAddr.getRegExp() + offset], vSrc, i); + uni_vpextrq(ptr[dstAddr.getRegExp() + offset], vmm_src, i); } } L(lEnd); } -void JitKernelBase::store(const Xbyak::Address& dstAddr, - const Xbyak::Ymm& vSrc, - const Xbyak::Reg64& rToStoreNum, +void JitKernelBase::store(const Address& dstAddr, + const Ymm& vmm_src, + const Reg64& rToStoreNum, const size_t typeSize) { if 
(!one_of(typeSize, 1u, 2u, 4u, 8u)) { IE_THROW() << "Could not store data with type size " << typeSize; } - Xbyak::Label lEnd; - Xbyak::Xmm xmmSrc(vSrc.getIdx()); + Label lEnd; + Xmm xmm_src(vmm_src.getIdx()); const size_t elPerXmm = x64::cpu_isa_traits::vlen / typeSize; for (int i = 0; i < 2; i++) { - Xbyak::Label lPerm; + Label lPerm; const size_t idx = i * elPerXmm; const size_t offset0 = idx * typeSize; @@ -482,32 +603,32 @@ void JitKernelBase::store(const Xbyak::Address& dstAddr, const size_t offset = offset0 + j * typeSize; if (typeSize == 8) { - uni_vpextrq(ptr[dstAddr.getRegExp() + offset], xmmSrc, j); + uni_vpextrq(ptr[dstAddr.getRegExp() + offset], xmm_src, j); } else if (typeSize == 4) { - uni_vpextrd(ptr[dstAddr.getRegExp() + offset], xmmSrc, j); + uni_vpextrd(ptr[dstAddr.getRegExp() + offset], xmm_src, j); } else if (typeSize == 2) { - uni_vpextrw(ptr[dstAddr.getRegExp() + offset], xmmSrc, j); + uni_vpextrw(ptr[dstAddr.getRegExp() + offset], xmm_src, j); } else if (typeSize == 1) { - uni_vpextrb(ptr[dstAddr.getRegExp() + offset], xmmSrc, j); + uni_vpextrb(ptr[dstAddr.getRegExp() + offset], xmm_src, j); } } L(lPerm); - vperm2f128(vSrc, vSrc, vSrc, 0x1); + vperm2f128(vmm_src, vmm_src, vmm_src, 0x1); } L(lEnd); } -void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, - const Xbyak::Reg64& rSrc, - const Xbyak::Xmm& vReadMask, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Reg64& rToStoreNum, +void JitKernelBase::memMovDD(const Reg64& rDst, + const Reg64& rSrc, + const Xmm& vReadMask, + const Xmm& vSrcShift, + const Reg64& rToStoreNum, const bool useMask, const bool zeroFill) { - Xbyak::Label lEnd; + Label lEnd; auto rAux = getReg64(); - Xbyak::Reg32 r32Aux = Xbyak::Reg32(rAux.getIdx()); + Reg32 r32Aux = Reg32(rAux.getIdx()); const uint8_t typeSize = sizeof(int); const uint8_t elPerVec = x64::cpu_isa_traits::vlen / typeSize; @@ -515,12 +636,12 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, cmp(rToStoreNum, i); jle(lEnd, T_NEAR); - Xbyak::Label 
lLoopNext; + Label lLoopNext; if (useMask) { uni_vpextrd(r32Aux, vReadMask, i); cmp(r32Aux, 0); if (zeroFill) { - Xbyak::Label lNotZero; + Label lNotZero; jne(lNotZero, T_NEAR); mov(ptr[rDst.getReg() + i * typeSize], r32Aux); jmp(lLoopNext, T_NEAR); @@ -538,23 +659,23 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, L(lEnd); } -void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, - const Xbyak::Reg64& rSrc, - const Xbyak::Ymm& vReadMask, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Reg64& rToStoreNum, +void JitKernelBase::memMovDD(const Reg64& rDst, + const Reg64& rSrc, + const Ymm& vReadMask, + const Ymm& vSrcShift, + const Reg64& rToStoreNum, const bool useMask, const bool zeroFill) { - Xbyak::Label lEnd; + Label lEnd; if (isValidIsa(x64::avx2)) { - auto vAux = RegistersPool::Reg(registersPool); + auto vAux = RegistersPool::Reg(registersPool); gatherdd(vAux, rSrc, vSrcShift, vReadMask, useMask, zeroFill); store(ptr[rDst], vAux, rToStoreNum, sizeof(int)); } else if (isValidIsa(x64::avx)) { const uint8_t typeSize = sizeof(int); const uint8_t elPerXmm = x64::cpu_isa_traits::vlen / typeSize; - Xbyak::Xmm xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()), - xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()); + Xmm xmmReadMask = Xmm(vReadMask.getIdx()), + xmmSrcShft = Xmm(vSrcShift.getIdx()); for (uint8_t i = 0; i < 2; i++) { memMovDD(rDst, rSrc, xmmReadMask, xmmSrcShft, rToStoreNum, useMask, zeroFill); @@ -575,3 +696,582 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, } L(lEnd); } + +void JitKernelBase::load_vector(const Xmm& vmm_dst, + const Address &adr_src, + const ov::element::Type& dst_prc, + const ov::element::Type& src_prc) { + Xmm xmmDst = Xmm(vmm_dst.getIdx()); + Ymm ymmDst = Ymm(vmm_dst.getIdx()); + + switch (src_prc) { + case ov::element::f64: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::i64, ov::element::i32, ov::element::f32)) { + if (dst_prc == ov::element::i64) { + vcvtpd2qq(vmm_dst, adr_src); + } else if (dst_prc == 
ov::element::i32) { + uni_vcvtpd2dq(vmm_dst.isZMM() ? ymmDst : vmm_dst, adr_src); + } else if (dst_prc == ov::element::f32) { + uni_vcvtpd2ps(vmm_dst.isZMM() ? ymmDst : vmm_dst, adr_src); + } + } else if (!x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f32, ov::element::i32)) { + if (dst_prc == ov::element::f32) { + uni_vcvtpd2ps(xmmDst, adr_src); + } else if (dst_prc == ov::element::i32) { + uni_vcvtpd2dq(xmmDst, adr_src); + } + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::i64: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f64, ov::element::f32)) { + if (dst_prc == ov::element::f64) { + vcvtqq2pd(vmm_dst, adr_src); + } else if (dst_prc == ov::element::f32) { + vcvtqq2ps(vmm_dst.isZMM() ? ymmDst : vmm_dst, adr_src); + } + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::f32: + if (dst_prc == ov::element::i32) { + uni_vcvtps2dq(vmm_dst, adr_src); + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::i32: + if (dst_prc == ov::element::f64) { + uni_vcvtdq2pd(vmm_dst, adr_src); + } else if (dst_prc == ov::element::f32) { + uni_vcvtdq2ps(vmm_dst, adr_src); + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::bf16: + uni_vpmovzxwd(vmm_dst, adr_src); + uni_vpslld(vmm_dst, vmm_dst, 16); + break; + case ov::element::u16: + if (one_of(dst_prc, ov::element::f32, ov::element::i32)) { + uni_vpmovzxwd(vmm_dst, adr_src); + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::i16: + if (one_of(dst_prc, ov::element::f32, ov::element::i32)) { + uni_vpmovsxwd(vmm_dst, adr_src); + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::i8: + if (one_of(dst_prc, ov::element::f32, ov::element::i32)) { + uni_vpmovsxbd(vmm_dst, adr_src); + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + case ov::element::u8: + if (one_of(dst_prc, ov::element::f32, ov::element::i32)) { + uni_vpmovzxbd(vmm_dst, 
adr_src); + } else { + uni_vmovups(vmm_dst, adr_src); + } + break; + default: + IE_THROW() << "Unsupported source precision: " << src_prc; + } + + switch (dst_prc) { + case ov::element::f32: + if (!x64::mayiuse(x64::avx512_core) && (src_prc == ov::element::i64)) { + // Do conversion later. + } + if (one_of(src_prc, ov::element::u8, ov::element::i8, ov::element::i16, ov::element::u16)) { + uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + if (x64::mayiuse(x64::avx512_core)) { + if (src_prc == ov::element::i64) { + vpmovsqd(vmm_dst, vmm_dst); + } + } else { + if (src_prc == ov::element::i64) { + // Do conversion later. + } + } + if (one_of(src_prc, ov::element::bf16)) { + uni_vcvtps2dq(vmm_dst, vmm_dst); + } + break; + case ov::element::i64: + case ov::element::f64: + break; + default: + IE_THROW() << "Unsupported destination precision: " << dst_prc; + } +} + +void JitKernelBase::load_scalar(const Xmm& vmm_dst, + const Address &adr_src, + const ov::element::Type& dst_prc, + const ov::element::Type& src_prc) { + Address src_adr_bcst(adr_src.getBit(), true, adr_src.getRegExp()); + + switch (src_prc) { + case ov::element::f64: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::i64, ov::element::i32, ov::element::f32)) { + if (dst_prc == ov::element::i64) { + vcvtpd2qq(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::i32) { + vcvtpd2dq(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::f32) { + vcvtpd2ps(vmm_dst, src_adr_bcst); + } + } else { + uni_vmovsd(vmm_dst, adr_src); + } + break; + case ov::element::i64: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f64, ov::element::f32)) { + if (dst_prc == ov::element::f64) { + vcvtqq2pd(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::f32) { + vcvtqq2ps(vmm_dst, src_adr_bcst); + } + } else { + uni_vmovsd(vmm_dst, adr_src); + } + break; + case ov::element::f32: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, 
ov::element::f64, ov::element::i32)) { + if (dst_prc == ov::element::f64) { + vcvtps2pd(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::i32) { + vcvtps2dq(vmm_dst, src_adr_bcst); + } + } else { + uni_vmovss(vmm_dst, adr_src); + } + break; + case ov::element::i32: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f32, ov::element::f64)) { + if (dst_prc == ov::element::f32) { + vcvtdq2ps(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::f64) { + vcvtdq2pd(vmm_dst, src_adr_bcst); + } + } else { + uni_vmovss(vmm_dst, adr_src); + } + break; + case ov::element::bf16: + uni_vpinsrw(vmm_dst, vmm_dst, adr_src, 0); + uni_vpslld(vmm_dst, vmm_dst, 16); + break; + case ov::element::i16: + uni_vpinsrw(vmm_dst, vmm_dst, adr_src, 0); + uni_vpmovsxwd(vmm_dst, adr_src); + break; + case ov::element::u16: + uni_vpinsrw(vmm_dst, vmm_dst, adr_src, 0); + uni_vpmovzxwd(vmm_dst, adr_src); + break; + case ov::element::i8: + pinsrb(vmm_dst, adr_src, 0); + uni_vpmovsxbd(vmm_dst, vmm_dst); + break; + case ov::element::u8: + pinsrb(vmm_dst, adr_src, 0); + uni_vpmovzxbd(vmm_dst, vmm_dst); + break; + default: + IE_THROW() << "Unsupported source precision: " << src_prc; + } + + switch (dst_prc) { + case ov::element::f32: + if (x64::mayiuse(x64::avx512_core)) { + if (one_of(src_prc, ov::element::u8, ov::element::i8, ov::element::u16, ov::element::i16)) { + uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + } else { + if (src_prc == ov::element::f64) { + uni_vcvtpd2ps(vmm_dst, vmm_dst); + } else if (src_prc == ov::element::i64) { + // Do conversion later. + } else if (one_of(src_prc, ov::element::u8, ov::element::i8, ov::element::u16, ov::element::i16, ov::element::i32)) { + uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + } + break; + case ov::element::i32: + if (!x64::mayiuse(x64::avx512_core)) { + if (src_prc == ov::element::i64) { + // Do conversion later. 
+ } else if (one_of(src_prc, ov::element::f32, ov::element::bf16)) { + uni_vcvtps2dq(vmm_dst, vmm_dst); + } + } else if (src_prc == ov::element::i64) { + vpmovsqd(vmm_dst, vmm_dst); + } + break; + case ov::element::i64: + case ov::element::f64: + break; + default: + IE_THROW() << "Unsupported destination precision: " << dst_prc; + } +} + +void JitKernelBase::load_with_bcst(const Xmm &vmm_dst, + const Address &adr_src, + const ov::element::Type& dst_prc, + const ov::element::Type& src_prc) { + Address src_adr_bcst(adr_src.getBit(), true, adr_src.getRegExp()); + + switch (src_prc) { + case ov::element::f64: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::i64, ov::element::i32, ov::element::f32)) { + if (dst_prc == ov::element::i64) { + vcvtpd2qq(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::i32) { + vcvtpd2dq(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::f32) { + vcvtpd2ps(vmm_dst, src_adr_bcst); + } + } else { + uni_vbroadcastsd(vmm_dst, adr_src); // does not work with XMM, use vpbroadcastq instead + } + break; + case ov::element::i64: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f64, ov::element::f32)) { + if (dst_prc == ov::element::f64) { + vcvtqq2pd(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::f32) { + vcvtqq2ps(vmm_dst, src_adr_bcst); + } + } else { + uni_vbroadcastsd(vmm_dst, adr_src); + } + break; + case ov::element::f32: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f64, ov::element::i32)) { + if (dst_prc == ov::element::f64) { + vcvtps2pd(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::i32) { + vcvtps2dq(vmm_dst, src_adr_bcst); + } + } else { + uni_vbroadcastss(vmm_dst, adr_src); + } + break; + case ov::element::i32: + if (x64::mayiuse(x64::avx512_core) && one_of(dst_prc, ov::element::f32, ov::element::f64)) { + if (dst_prc == ov::element::f32) { + vcvtdq2ps(vmm_dst, src_adr_bcst); + } else if (dst_prc == ov::element::f64) { + 
vcvtdq2pd(vmm_dst, src_adr_bcst); + } + } else { + uni_vbroadcastss(vmm_dst, adr_src); + } + break; + case ov::element::bf16: + uni_vpinsrw(vmm_dst, vmm_dst, adr_src, 0); + uni_vpslld(vmm_dst, vmm_dst, 16); + break; + case ov::element::i16: + uni_vpinsrw(vmm_dst, vmm_dst, adr_src, 0); + uni_vpmovsxwd(vmm_dst, adr_src); + break; + case ov::element::u16: + uni_vpinsrw(vmm_dst, vmm_dst, adr_src, 0); + uni_vpmovzxwd(vmm_dst, adr_src); + break; + case ov::element::i8: + if (dst_prc == ov::element::i32) { + pinsrb(vmm_dst, adr_src, 0); + uni_vpmovsxbd(vmm_dst, vmm_dst); + } else { + vpbroadcastb(vmm_dst, adr_src); + } + break; + case ov::element::u8: + if (dst_prc == ov::element::i32) { + pinsrb(vmm_dst, adr_src, 0); + uni_vpmovzxbd(vmm_dst, vmm_dst); + } else { + vpbroadcastb(vmm_dst, adr_src); + } + break; + default: + IE_THROW() << "Unsupported source precision: " << src_prc; + } + + switch (dst_prc) { + case ov::element::f32: + if (x64::mayiuse(x64::avx512_core)) { + if (one_of(src_prc, ov::element::u8, ov::element::i8, ov::element::u16, ov::element::i16)) { + uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + } else { + if (src_prc == ov::element::f64) { + uni_vcvtpd2ps(vmm_dst, vmm_dst); + } else if (src_prc == ov::element::i64) { + // Do conversion later. + } else if (one_of(src_prc, ov::element::u8, ov::element::i8, ov::element::u16, ov::element::i16, ov::element::i32)) { + uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + } + break; + case ov::element::i32: + if (!x64::mayiuse(x64::avx512_core)) { + if (src_prc == ov::element::i64) { + // Do conversion later. 
+ } else if (one_of(src_prc, ov::element::f32, ov::element::bf16)) { + uni_vcvtps2dq(vmm_dst, vmm_dst); + } + } else if (src_prc == ov::element::i64) { + vpmovsqd(vmm_dst, vmm_dst); + } + break; + case ov::element::i64: + case ov::element::f64: + break; + default: + IE_THROW() << "Unsupported destination precision: " << dst_prc; + } +} + +void JitKernelBase::store_vector(const Address &adr_dst, + const Xmm &vmm_src, + const ov::element::Type& dst_prc, + const ov::element::Type& src_prc) { + auto xmm_src = Xmm(vmm_src.getIdx()); + auto ymm_src = Ymm(vmm_src.getIdx()); + + switch (src_prc) { + case ov::element::f64: + if (dst_prc == ov::element::f32) { + uni_vcvtpd2ps(x64::mayiuse(x64::avx512_core) ? ymm_src : xmm_src, vmm_src); + } else if (dst_prc == ov::element::i64) { + vcvtpd2qq(vmm_src, vmm_src); + } else if (dst_prc == ov::element::i32) { + vcvtpd2dq(ymm_src, vmm_src); + } + break; + case ov::element::i64: + if (dst_prc == ov::element::f32 || dst_prc == ov::element::bf16) { + vcvtqq2ps(ymm_src, vmm_src); + } else if (dst_prc == ov::element::f64) { + vcvtqq2pd(vmm_src, vmm_src); + } + break; + case ov::element::f32: + if (dst_prc == ov::element::i64) { + vcvtps2qq(vmm_src, ymm_src); + } else if ((dst_prc == ov::element::u8 || dst_prc == ov::element::u16) && x64::mayiuse(x64::avx512_core)) { + vcvtps2udq(vmm_src, vmm_src); + } else if (dst_prc != ov::element::f32 && dst_prc != ov::element::bf16) { + uni_vcvtps2dq(vmm_src, vmm_src); + } + break; + case ov::element::i32: + if (dst_prc == ov::element::f32 || dst_prc == ov::element::bf16) { + uni_vcvtdq2ps(vmm_src, vmm_src); + } + break; + default: + IE_THROW() << "Unsupported source precision: " << src_prc; + } + + switch (dst_prc) { + case ov::element::f64: + uni_vmovups(adr_dst, vmm_src); + break; + case ov::element::f32: + if (src_prc.size() == 8) { + uni_vmovups(adr_dst, ymm_src); + } else { + uni_vmovups(adr_dst, vmm_src); + } + break; + case ov::element::i64: + uni_vmovups(adr_dst, vmm_src); + break; + case 
ov::element::i32: + if (src_prc == ov::element::i64) { + vpmovsqd(adr_dst, vmm_src); + } else if (src_prc == ov::element::f64) { + uni_vmovups(adr_dst, ymm_src); + } else { + uni_vmovups(adr_dst, vmm_src); + } + break; + case ov::element::bf16: + if (!vcvtneps2bf16) { + IE_THROW() << "Converter for bf16 was not initialized!"; + } + vcvtneps2bf16->emit_code({static_cast(ymm_src.getIdx())}, {static_cast(ymm_src.getIdx())}); + vmovdqu16(adr_dst, ymm_src); + break; + case ov::element::i16: + if (x64::mayiuse(x64::avx512_core)) { + vpmovsdw(adr_dst, vmm_src); + } else { + uni_vpackssdw(vmm_src, vmm_src, vmm_src); + if (x64::mayiuse(x64::avx)) { + vpermq(ymm_src, ymm_src, 0x08); + uni_vmovdqu(adr_dst, xmm_src); + } else { + movq(adr_dst, xmm_src); + } + } + break; + case ov::element::u16: + if (x64::mayiuse(x64::avx512_core)) { + vpmovusdw(adr_dst, xmm_src); + } else { + uni_vpackusdw(vmm_src, vmm_src, vmm_src); + if (x64::mayiuse(x64::avx)) { + vpermq(ymm_src, ymm_src, 0x08); + uni_vmovdqu(adr_dst, xmm_src); + } else { + movq(adr_dst, xmm_src); + } + } + break; + case ov::element::i8: + if (x64::mayiuse(x64::avx512_core)) { + if (src_prc == ov::element::i64) { + vpmovsqb(adr_dst, vmm_src); + } else { + vpmovsdb(adr_dst, vmm_src); + } + } else { + uni_vpackssdw(vmm_src, vmm_src, vmm_src); + if (x64::mayiuse(x64::avx)) { + vpermq(ymm_src, ymm_src, 0x08); + } + uni_vpacksswb(vmm_src, vmm_src, vmm_src); + if (x64::mayiuse(x64::avx)) { + vmovq(adr_dst, xmm_src); + } else { + movd(adr_dst, xmm_src); + } + } + break; + case ov::element::u8: + if (x64::mayiuse(x64::avx512_core)) { + if (src_prc == ov::element::i64) { + vpmovusqb(adr_dst, vmm_src); + } else { + vpmovusdb(adr_dst, vmm_src); + } + } else { + uni_vpackusdw(vmm_src, vmm_src, vmm_src); + if (x64::mayiuse(x64::avx)) { + vpermq(ymm_src, ymm_src, 0x08); + } + uni_vpackuswb(vmm_src, vmm_src, vmm_src); + if (x64::mayiuse(x64::avx)) { + vmovq(adr_dst, xmm_src); + } else { + movd(adr_dst, xmm_src); + } + } + break; + 
default: + IE_THROW() << "Unsupported destination precision: " << dst_prc; + } +} + +void JitKernelBase::store_scalar(const Address &adr_dst, + const Xmm &vmm_src, + const ov::element::Type& dst_prc, + const ov::element::Type& src_prc) { + switch (src_prc) { + case ov::element::f64: + if (dst_prc == ov::element::f32) { + uni_vcvtpd2ps(vmm_src, vmm_src); + } else if (dst_prc == ov::element::i64) { + vcvtpd2qq(vmm_src, vmm_src); + } else if (dst_prc == ov::element::i32) { + uni_vcvtpd2dq(vmm_src, vmm_src); + } + break; + case ov::element::i64: + if (dst_prc == ov::element::f32 || dst_prc == ov::element::bf16) { + vcvtqq2ps(vmm_src, vmm_src); + } else if (dst_prc == ov::element::i32) { + vpmovsqd(vmm_src, vmm_src); + } + break; + case ov::element::f32: + if (dst_prc == ov::element::i64) { + vcvtps2qq(vmm_src, vmm_src); + } else if (dst_prc == ov::element::u8 && x64::mayiuse(x64::avx512_core)) { + vcvtps2udq(vmm_src, vmm_src); + } else if (dst_prc != ov::element::f32 && dst_prc != ov::element::bf16) { + uni_vcvtps2dq(vmm_src, vmm_src); + } + break; + case ov::element::i32: + if (dst_prc == ov::element::f32 || dst_prc == ov::element::bf16) { + uni_vcvtdq2ps(vmm_src, vmm_src); + } + break; + default: + IE_THROW() << "Unsupported source precision: " << src_prc; + } + + switch (dst_prc) { + case ov::element::f64: + case ov::element::i64: + uni_vmovsd(adr_dst, vmm_src); + break; + case ov::element::f32: + case ov::element::i32: + uni_vmovss(adr_dst, vmm_src); + break; + case ov::element::bf16: + uni_vpsrld(vmm_src, vmm_src, 16); + uni_vpextrw(adr_dst, vmm_src, 0x0); + break; + case ov::element::i16: + uni_vpackssdw(vmm_src, vmm_src, vmm_src); + uni_vpextrw(adr_dst, vmm_src, 0x0); + break; + case ov::element::u16: + uni_vpackusdw(vmm_src, vmm_src, vmm_src); + uni_vpextrw(adr_dst, vmm_src, 0x0); + break; + case ov::element::i8: + if (x64::mayiuse(x64::avx512_core)) { + vpmovsdb(vmm_src, vmm_src); + } else { + uni_vpackssdw(vmm_src, vmm_src, vmm_src); + uni_vpacksswb(vmm_src, 
vmm_src, vmm_src); + } + uni_vpextrb(adr_dst, vmm_src, 0x0); + break; + case ov::element::u8: + if (x64::mayiuse(x64::avx512_core)) { + vpmovusdb(vmm_src, vmm_src); + } else { + uni_vpackusdw(vmm_src, vmm_src, vmm_src); + uni_vpackuswb(vmm_src, vmm_src, vmm_src); + } + uni_vpextrb(adr_dst, vmm_src, 0); + break; + default: + IE_THROW() << "Unsupported destination precision: " << dst_prc; + } +} diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp index e39efde753bbbc..ed5b6e02ea354c 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,49 +6,67 @@ #include "cpu/x64/jit_generator.hpp" #include "registers_pool.hpp" +#include "emitters/x64/jit_bf16_emitters.hpp" namespace ov { namespace intel_cpu { +namespace kernel { -#define getReg64() RegistersPool::Reg(registersPool) -#define getReg32() RegistersPool::Reg(registersPool) -#define getVmm() RegistersPool::Reg(registersPool) -#define getMask() RegistersPool::Reg(registersPool) +#define getReg64() RegistersPool::Reg(this->registersPool) +#define getReg32() RegistersPool::Reg(this->registersPool) +#define getVmm() RegistersPool::Reg(this->registersPool) +#define getMask() RegistersPool::Reg(this->registersPool) class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { public: - JitKernelBase(const char* name) : dnnl::impl::cpu::x64::jit_generator(name) {} + JitKernelBase(const char* name, dnnl::impl::cpu::x64::cpu_isa_t max_cpu_isa); - void uni_vfmsub132ps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc, const Xbyak::Operand& op); + void uni_vfmsub132ps(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand& op); - void uni_vfnmadd132ps(const Xbyak::Xmm& vDst, const 
Xbyak::Xmm& vSrc, const Xbyak::Operand& op); + void uni_vfnmadd132ps(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand& op); - void uni_vfmsub231ps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc, const Xbyak::Operand& op); + void uni_vfmsub231ps(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand& op); - void uni_vpaddd(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc, const Xbyak::Operand& op) { - jit_generator::uni_vpaddd(vDst, vSrc, op); + void uni_vpaddd(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand& op) { + jit_generator::uni_vpaddd(vmm_dst, vmm_src, op); } - void uni_vpaddd(const Xbyak::Ymm& vDst, const Xbyak::Ymm& vSrc, const Xbyak::Operand& op); + void uni_vpaddd(const Xbyak::Ymm& vmm_dst, const Xbyak::Ymm& vmm_src, const Xbyak::Operand& op); - void uni_vpsubd(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc, const Xbyak::Operand& op) { - jit_generator::uni_vpsubd(vDst, vSrc, op); + void uni_vaddpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand &op1, const Xbyak::Operand &op2); + + void uni_vpsubd(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand& op) { + jit_generator::uni_vpsubd(vmm_dst, vmm_src, op); } - void uni_vpsubd(const Xbyak::Ymm& vDst, const Xbyak::Ymm& vSrc, const Xbyak::Operand& op); + void uni_vpsubd(const Xbyak::Ymm& vmm_dst, const Xbyak::Ymm& vmm_src, const Xbyak::Operand& op); + + void uni_vmulpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand& op1, const Xbyak::Operand& op2); + + void uni_vdivps(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand& op1, const Xbyak::Operand& op2); + + void uni_vdivpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand& op1, const Xbyak::Operand& op2); + + void uni_vandps(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand &op); - void uni_vdivps(const Xbyak::Xmm& vDst, const Xbyak::Operand& op1, const Xbyak::Operand& op2); + void uni_vandpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, 
const Xbyak::Operand &op); - void uni_vandps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand &op); + void uni_vandnps(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand &op); - void uni_vandnps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand &op); + void uni_vorpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand &op); + + void uni_vcmppd(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src, const Xbyak::Operand &op, const uint8_t imm); + + void uni_vmaxpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand &op1, const Xbyak::Operand &op2); + + void uni_vminpd(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand &op1, const Xbyak::Operand &op2); void uni_kmovd(const Xbyak::Opmask& kDst, const Xbyak::Opmask& kSrc) { kmovd(kDst, kSrc); } - void uni_kmovd(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc) { - uni_vmovups(vDst, vSrc); + void uni_kmovd(const Xbyak::Xmm& vmm_dst, const Xbyak::Xmm& vmm_src) { + uni_vmovups(vmm_dst, vmm_src); } void uni_kandd(const Xbyak::Opmask& kDst, const Xbyak::Opmask& kSrc1, const Xbyak::Opmask& kSrc2) { @@ -59,81 +77,127 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { uni_vandps(kDst, kSrc1, kSrc2); } - void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op); - - void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op); - - void gatherdd(const Xbyak::Xmm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Opmask& kReadMask, - const bool useMask = true, - const bool zeroFill = false); - - void gatherdd(const Xbyak::Xmm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Xmm& vReadMask, - const bool useMask = true, - const bool zeroFill = false); - - void gatherdd(const Xbyak::Ymm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Ymm& vReadMask, - const bool useMask = true, - const bool zeroFill = false); - - void 
fillRestWorkMask(const Xbyak::Opmask& kDstMask, - const Xbyak::Reg64& rWorkRest); - - void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, - const uint64_t typeSize = 4); - - void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, - const uint64_t typeSize = 4); - - void load(const Xbyak::Xmm& vDst, - const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFill = false); - - void load(const Xbyak::Ymm& vDst, - const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFill = false); - - void store(const Xbyak::Address& dstAddr, - const Xbyak::Xmm& vSrc, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize); - - void store(const Xbyak::Address& dstAddr, - const Xbyak::Ymm& vSrc, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize); + void uni_vpbroadcastd(const Xbyak::Xmm &vmm_dst, const Xbyak::Operand &op); + + void uni_vpbroadcastd(const Xbyak::Ymm &vmm_dst, const Xbyak::Operand &op); + + void uni_vcvtpd2dq(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand &op); + + void uni_vcvtpd2ps(const Xbyak::Xmm& vmm_dst, const Xbyak::Operand &op); + + void gatherdd( + const Xbyak::Xmm& vmm_dst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Opmask& kReadMask, + const bool useMask = true, + const bool zeroFill = false); + + void gatherdd( + const Xbyak::Xmm& vmm_dst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const bool useMask = true, + const bool zeroFill = false); + + void gatherdd( + const Xbyak::Ymm& vmm_dst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const bool useMask = true, + const bool zeroFill = false); + + void fillRestWorkMask( + const Xbyak::Opmask& kDstMask, + const Xbyak::Reg64& rWorkRest); + + void fillRestWorkMask( + const Xbyak::Xmm& ymmDstMask, + const 
Xbyak::Reg64& rWorkRest, + const uint64_t typeSize = 4); + + void fillRestWorkMask( + const Xbyak::Ymm& ymmDstMask, + const Xbyak::Reg64& rWorkRest, + const uint64_t typeSize = 4); + + void load( + const Xbyak::Xmm& vmm_dst, + const Xbyak::Address& adr_src, + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFill = false); + + void load( + const Xbyak::Ymm& vmm_dst, + const Xbyak::Address& adr_src, + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFill = false); + + void store( + const Xbyak::Address& dstAddr, + const Xbyak::Xmm& vmm_src, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize); + + void store( + const Xbyak::Address& dstAddr, + const Xbyak::Ymm& vmm_src, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize); // Makes gather from memory under the vReadMask and writes to the memory m128. - void memMovDD(const Xbyak::Reg64& rDst, - const Xbyak::Reg64& rSrc, - const Xbyak::Xmm& vReadMask, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Reg64& rToStoreCounter, - const bool useMask = true, - const bool zeroFill = false); + void memMovDD( + const Xbyak::Reg64& rDst, + const Xbyak::Reg64& rSrc, + const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Reg64& rToStoreCounter, + const bool useMask = true, + const bool zeroFill = false); // Makes gather from the memory under the vReadMask and writes to the memory m256. 
- void memMovDD(const Xbyak::Reg64& rDst, - const Xbyak::Reg64& rSrc, - const Xbyak::Ymm& vReadMask, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Reg64& rToStoreCounter, - const bool useMask = true, - const bool zeroFill = false); + void memMovDD( + const Xbyak::Reg64& rDst, + const Xbyak::Reg64& rSrc, + const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, + const Xbyak::Reg64& rToStoreCounter, + const bool useMask = true, + const bool zeroFill = false); + + void load_vector( + const Xbyak::Xmm& vmm_dst, + const Xbyak::Address &srcAdr, + const ov::element::Type& dstPrc, + const ov::element::Type& srcPrc); + + void load_scalar( + const Xbyak::Xmm& vmm_dst, + const Xbyak::Address &srcAdr, + const ov::element::Type& dstPrc, + const ov::element::Type& srcPrc); + + void load_with_bcst( + const Xbyak::Xmm& vmm_dst, + const Xbyak::Address &srcAdr, + const ov::element::Type& dstPrc, + const ov::element::Type& srcPrc); + + void store_vector( + const Xbyak::Address &dstAdr, + const Xbyak::Xmm& vmm_src, + const ov::element::Type& dstPrc, + const ov::element::Type& srcPrc); + + void store_scalar( + const Xbyak::Address &dstAdr, + const Xbyak::Xmm& vmm_src, + const ov::element::Type& dstPrc, + const ov::element::Type& srcPrc); protected: inline bool isValidIsa(dnnl::impl::cpu::x64::cpu_isa_t isa) { @@ -142,6 +206,8 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { RegistersPool::Ptr registersPool; + std::shared_ptr vcvtneps2bf16; + enum { // Comparison predicate operand (immediate byte) for single-precision floating-point values. 
CMP_EQ_PS = 0, // Equal (ordered, non-signaling) @@ -155,5 +221,41 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { }; }; +template +class JitKernel : public JitKernelBase { +public: + using KernelFunc = void (*)(const CallArgs *); + + explicit JitKernel(const char* name, const CompileParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t max_cpu_isa) + : JitKernelBase{name, max_cpu_isa}, jcp{jcp}, func{nullptr} {} + ~JitKernel() override = default; + + dnnl::impl::status_t create_kernel() override { + const dnnl::impl::status_t code = jit_generator::create_kernel(); + if (code != dnnl::impl::status::success) { + IE_THROW() << "Could not create kernel. Error code: " << std::to_string(code) << ". " << + "Xbyak error code: " << Xbyak::ConvertErrorToString(Xbyak::GetError()); + } + func = (decltype(func))jit_ker(); + return code; + } + + void operator()(const CallArgs* args) const { + assert(func); + func(args); + } + + void operator()(const CallArgs& args) const { + this->operator()(&args); + } + +protected: + CompileParams jcp; + +private: + KernelFunc func; +}; + +} // namespace kernel } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.cpp new file mode 100644 index 00000000000000..cd64ee05706c0c --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.cpp @@ -0,0 +1,1915 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "reduce.hpp" +#include "utils/bfloat16.hpp" +#include + +using namespace ov::intel_cpu::kernel; +using namespace dnnl::impl::utils; +using namespace dnnl::impl::cpu; +using namespace Xbyak; + +#define GET_OFF(field) offsetof(JitReduceCallArgs, field) +#define GET_OFF_POST(field) offsetof(JitReducePostCallArgs, field) + + +static inline bool isFloatCompatible(const ov::element::Type& type) { + return ov::intel_cpu::one_of(type, ov::element::f32, 
ov::element::bf16, ov::element::f64); +} + +/////////////////////////////// +///// JitReduceKernelBase ///// +/////////////////////////////// + +template +JitReduceKernelBase::JitReduceKernelBase(const char* name, const JitReduceConfigParams& jcp, x64::cpu_isa_t isa) + : JitKernel(name, jcp, isa) { + exec_el_type = jcp.src_el_type; + if (exec_el_type.size() <= 4) { + exec_el_type = ov::element::f32; + } else if (exec_el_type == ov::element::u64) { + exec_el_type = ov::element::i64; + } + + planar_layout = one_of(jcp.layout, ReduceLayoutType::reduce_ncsp, ReduceLayoutType::reduce_nspc); + + if (one_of(ov::element::bf16, exec_el_type, jcp.src_el_type, jcp.dst_el_type)) { + this->vcvtneps2bf16 = std::make_shared(this, isa); + } + if (jcp.reduce_mode == Algorithm::ReduceMax) { + max_emitter = std::make_shared(this, isa, InferenceEngine::details::convertPrecision(exec_el_type)); + } + if (jcp.reduce_mode == Algorithm::ReduceMin) { + min_emitter = std::make_shared(this, isa, InferenceEngine::details::convertPrecision(exec_el_type)); + } + if (one_of(jcp.reduce_mode, Algorithm::ReduceL2, Algorithm::ReduceSumSquare, Algorithm::ReduceProd)) { + mul_emitter = std::make_shared(this, isa, InferenceEngine::details::convertPrecision(exec_el_type)); + } +} + +////////// FLOAT 32 ////////// +template +void JitReduceKernelBase::horiz_ps(const Xmm& vmm_dst, const Operand& op) { + switch (this->jcp.reduce_mode) { + case Algorithm::ReduceAnd: + this->uni_vandps(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceL1: + case Algorithm::ReduceL2: + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + case Algorithm::ReduceSumSquare: + case Algorithm::ReduceLogSumExp: + this->uni_vaddps(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceMax: + this->uni_vmaxps(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceMin: + this->uni_vminps(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceOr: + this->uni_vorps(vmm_dst, vmm_dst, op); + 
break; + case Algorithm::ReduceProd: + this->uni_vmulps(vmm_dst, vmm_dst, op); + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(this->jcp.reduce_mode) << "'"; + } +} + +template +template +void JitReduceKernelBase::horiz_reduce_store_ps(const Xmm& vmm_dst, const ov::element::Type& dst_el_type, bool load_embedded) { + auto xmm_aux_1 = RegistersPool::Reg(this->registersPool); + auto xmm_aux_2 = RegistersPool::Reg(this->registersPool); + + if (isa == x64::avx512_core) { + auto zmm_dst = Zmm(vmm_dst.getIdx()); + auto ymm_dst = Ymm(vmm_dst.getIdx()); + auto ymm_aux_1 = Ymm(xmm_aux_1.getIdx()); + + this->vextractf64x4(ymm_aux_1, zmm_dst, 1); + this->horiz_ps(ymm_aux_1, ymm_dst); + this->vextractf128(xmm_aux_2, ymm_aux_1, 1); + this->horiz_ps(xmm_aux_1, xmm_aux_2); + } else if (isa == x64::avx2) { + auto ymm_dst = Ymm(vmm_dst.getIdx()); + auto xmm_dst = Xmm(vmm_dst.getIdx()); + + this->vextractf128(xmm_aux_1, ymm_dst, 1); + this->horiz_ps(xmm_aux_1, xmm_dst); + } else if (isa == x64::sse41) { + auto xmm_dst = Xmm(vmm_dst.getIdx()); + + if (one_of(this->jcp.reduce_mode, Algorithm::ReduceL1, Algorithm::ReduceL2, Algorithm::ReduceLogSum, Algorithm::ReduceMean, + Algorithm::ReduceSum, Algorithm::ReduceSumSquare, Algorithm::ReduceLogSumExp)) { + this->uni_vhaddps(xmm_aux_1, xmm_dst, xmm_dst); + this->uni_vhaddps(xmm_aux_1, xmm_aux_1, xmm_aux_1); + } else { + this->uni_vshufps(xmm_aux_1, xmm_dst, xmm_dst, 0b00001110); + this->horiz_ps(xmm_aux_1, xmm_dst); + this->uni_vshufps(xmm_aux_2, xmm_aux_1, xmm_aux_1, 0b00000001); + this->horiz_ps(xmm_aux_1, xmm_aux_2); + } + } + + if (isa != x64::sse41) { + if (one_of(this->jcp.reduce_mode, Algorithm::ReduceL1, Algorithm::ReduceL2, Algorithm::ReduceLogSum, Algorithm::ReduceMean, + Algorithm::ReduceSum, Algorithm::ReduceSumSquare, Algorithm::ReduceLogSumExp)) { + this->uni_vhaddps(xmm_aux_1, xmm_aux_1, xmm_aux_1); + this->uni_vhaddps(xmm_aux_1, xmm_aux_1, xmm_aux_1); + } else { + this->uni_vshufps(xmm_aux_2, 
xmm_aux_1, xmm_aux_1, 0b00001110); + this->horiz_ps(xmm_aux_1, xmm_aux_2); + this->uni_vshufps(xmm_aux_2, xmm_aux_1, xmm_aux_1, 0b00000001); + this->horiz_ps(xmm_aux_1, xmm_aux_2); + } + } + + auto trg_el_type = dst_el_type; + Reg64 trg_ptr = reg_dst; + if (this->jcp.fuse_low_precision && (post_reduce || post_ops_fusing)) { + trg_el_type = ov::element::f32; // TODO i64 fusing + trg_ptr = reg_src; + } + if (load_embedded) { + if (isa == x64::avx512_core && exec_el_type == trg_el_type) { + this->horiz_ps(xmm_aux_1, this->ptr_b[trg_ptr]); + } else { + this->load_scalar(xmm_aux_2, this->ptr[trg_ptr], exec_el_type, trg_el_type); + this->horiz_ps(xmm_aux_1, xmm_aux_2); + } + } + this->store_scalar(this->ptr[trg_ptr], xmm_aux_1, trg_el_type, exec_el_type); +} + +////////// INTEGER 64 ////////// +template +template +void JitReduceKernelBase::horiz_qq(const Xmm& vmm_dst, const Operand& op) { + using Vmm = typename conditional3::type; + + switch (this->jcp.reduce_mode) { + case Algorithm::ReduceAnd: + this->uni_vandpd(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceL1: + case Algorithm::ReduceL2: + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + case Algorithm::ReduceSumSquare: + case Algorithm::ReduceLogSumExp: + this->uni_vpaddq(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceMax: + if (isa == x64::avx512_core) { + this->vpmaxsq(vmm_dst, vmm_dst, op); + } else { + auto vmm_aux_0 = getVmm(); + if (op.isMEM()) { + max_emitter->emit_code({vmm_dst.getIdx()}, {vmm_dst.getIdx()}, {vmm_aux_0.getIdx()}, {op.getIdx()}); + } else { + max_emitter->emit_code({vmm_dst.getIdx(), op.getIdx()}, {vmm_dst.getIdx()}, {vmm_aux_0.getIdx()}); + } + } + break; + case Algorithm::ReduceMin: + if (isa == x64::avx512_core) { + this->vpminsq(vmm_dst, vmm_dst, op); + } else { + auto vmm_aux_0 = getVmm(); + if (op.isMEM()) { + min_emitter->emit_code({vmm_dst.getIdx()}, {vmm_dst.getIdx()}, {vmm_aux_0.getIdx()}, {op.getIdx()}); + } else { + 
min_emitter->emit_code({vmm_dst.getIdx(), op.getIdx()}, {vmm_dst.getIdx()}, {vmm_aux_0.getIdx()}); + } + } + break; + case Algorithm::ReduceOr: + this->uni_vorpd(vmm_dst, vmm_dst, op); + break; + case Algorithm::ReduceProd: + if (isa == x64::avx512_core) { + this->vpmullq(vmm_dst, vmm_dst, op); + } else { + auto vmm_aux_0 = getVmm(); + auto vmm_aux_1 = getVmm(); + if (op.isMEM()) { + mul_emitter->emit_code({vmm_dst.getIdx()}, {vmm_dst.getIdx()}, {vmm_aux_0.getIdx(), vmm_aux_1.getIdx()}, {op.getIdx()}); + } else { + mul_emitter->emit_code({vmm_dst.getIdx(), op.getIdx()}, {vmm_dst.getIdx()}, {vmm_aux_0.getIdx(), vmm_aux_1.getIdx()}); + } + } + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(this->jcp.reduce_mode) << "'"; + } +} + +template +template +void JitReduceKernelBase::horiz_reduce_store_qq(const Xmm& vmm_dst, const ov::element::Type& dst_el_type, bool load_embedded) { + auto xmm_aux_1 = RegistersPool::Reg(this->registersPool); + auto xmm_aux_2 = RegistersPool::Reg(this->registersPool); + + if (isa == x64::avx512_core) { + auto zmm_dst = Zmm(vmm_dst.getIdx()); + auto ymm_dst = Ymm(vmm_dst.getIdx()); + auto ymm_aux_1 = Ymm(xmm_aux_1.getIdx()); + + this->vextractf64x4(ymm_aux_1, zmm_dst, 1); + this->horiz_qq(ymm_aux_1, ymm_dst); + this->vextractf128(xmm_aux_2, ymm_aux_1, 1); + this->horiz_qq(xmm_aux_1, xmm_aux_2); + this->vshufpd(xmm_aux_2, xmm_aux_1, xmm_aux_1, 0b00000001); + this->horiz_qq(xmm_aux_1, xmm_aux_2); + } else if (isa == x64::avx2) { + auto ymm_dst = Ymm(vmm_dst.getIdx()); + auto xmm_dst = Xmm(vmm_dst.getIdx()); + + this->vextractf128(xmm_aux_1, ymm_dst, 1); + this->horiz_qq(xmm_aux_1, xmm_dst); + this->vshufpd(xmm_aux_2, xmm_aux_1, xmm_aux_1, 0b00000001); + this->horiz_qq(xmm_aux_1, xmm_aux_2); + } else if (isa == x64::sse41) { + auto xmm_dst = Xmm(vmm_dst.getIdx()); + + this->vshufpd(xmm_aux_1, xmm_dst, xmm_dst, 0b00000001); + this->horiz_qq(xmm_aux_1, xmm_dst); + } + + auto trg_el_type = dst_el_type; + Reg64 trg_ptr = 
reg_dst; + if (this->jcp.fuse_low_precision && (post_reduce || post_ops_fusing)) { + trg_el_type = ov::element::f32; // TODO i64 fusing + trg_ptr = reg_src; + } + if (load_embedded) { + if (isa == x64::avx512_core && exec_el_type == trg_el_type) { + this->horiz_qq(xmm_aux_1, this->ptr_b[trg_ptr]); + } else { + this->load_scalar(xmm_aux_2, this->ptr[trg_ptr], exec_el_type, trg_el_type); + this->horiz_qq(xmm_aux_1, xmm_aux_2); + } + } + this->store_scalar(this->ptr[trg_ptr], xmm_aux_1, trg_el_type, exec_el_type); +} + +/////////////////////////////// +/////// JitReduceKernel /////// +/////////////////////////////// + +template +JitReduceKernel::JitReduceKernel(const JitReduceConfigParams &jcp) : JitReduceKernelBase(jit_name(), jcp, isa) { + loop_step = vlen / exec_el_type.size(); + if (isa == x64::sse41) { + loop_step *= 2; + } + + if (jcp.reduce_mode == Algorithm::ReduceLogSumExp) { + exp_injector = std::make_shared>(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.f); + } +} + +template +void JitReduceKernel::generate() { + this->preamble(); + + registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); + + reg_src = getReg64(); + reg_dst = getReg64(); + reg_work_amount = getReg64(); + reg_work_batch = getReg64(); + mov(reg_src, ptr[reg_params + GET_OFF(src)]); + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + mov(reg_work_batch, ptr[reg_params + GET_OFF(work_batch)]); + + reg_reduce_stride = getReg64(); + v_src = getVmm(); + v_dst = getVmm(); + + if (jcp.reduce_mode == Algorithm::ReduceL1 && !(isa == x64::avx512_core && exec_el_type == ov::element::i64)) { + v_abs_mask = getVmm(); + } + if (isa == x64::sse41) { + v_dst_aux = getVmm(); + } + + if (planar_layout) { + reg_reduce_w = getReg64(); + mov(reg_reduce_w, ptr[reg_params + GET_OFF(reduce_w)]); + } + if (one_of(jcp.reduce_mode, Algorithm::ReduceAnd, Algorithm::ReduceL1, Algorithm::ReduceMax, + Algorithm::ReduceMin, 
Algorithm::ReduceProd, Algorithm::ReduceOr)) { // TODO ReduceProd ? + reg_table = getReg64(); + mov(reg_table, l_table); + } + if (one_of(jcp.reduce_mode, Algorithm::ReduceAnd, Algorithm::ReduceOr)) { + v_zero = getVmm(); + uni_vpxor(v_zero, v_zero, v_zero); + } + if (jcp.reduce_mode == Algorithm::ReduceOr) { + v_ones = getVmm(); + uni_vmovups(v_ones, table_val(0)); + } + + reduce_main(); + reduce_tail(); + + registersPool.reset(); + + this->postamble(); + + if (vcvtneps2bf16) { + vcvtneps2bf16->emit_data(); + } + if (max_emitter) { + max_emitter->emit_data(); + } + if (min_emitter) { + min_emitter->emit_data(); + } + if (mul_emitter) { + mul_emitter->emit_data(); + } + if (one_of(jcp.reduce_mode, Algorithm::ReduceAnd, Algorithm::ReduceL1, Algorithm::ReduceMax, + Algorithm::ReduceMin, Algorithm::ReduceProd, Algorithm::ReduceOr)) { + prepare_aux_table(); + } else if (jcp.reduce_mode == Algorithm::ReduceLogSumExp) { + exp_injector->prepare_table(); + } +} + +template +inline void JitReduceKernel::reduce_main() { + // ================================================================ + // ***isa: AVX512*** + // ReduceAnd (Logical And) + // step 1: init dst 0x3f800000 (1.0f) + // aux 0x3f800000 (1.0f) + // zero 0x00000000 (0.0f) + // step 2: if src equals 0, set mask bit 0, else set mask bit 1 + // step 3: src = mask bit == 0 ? zero : aux + // step 4: dst = dst & src + // src mask_bit new_src dst new_dst + // case 1 ~0 1 1.0f 1.0f 1.0f + // case 2 0 0 0.0f 1.0f 0.0f + // case 3 ~0 1 1.0f 0.0f 0.0f + // case 4 0 0 0.0f 0.0f 0.0f + // step 5: loop: offset src, and do step 2 and step 3 + // + // ReduceOr (Logical Or) + // step 1: init dst 0x00000000 (0.0f) + // aux 0x3f800000 (1.0f) + // zero 0x00000000 (0.0f) + // step 2: if src equals 0, set mask bit 0, else set mask bit 1 + // step 3: src = mask bit == 0 ? 
zero : aux + // step 4: dst = dst | src + // src mask_bit new_src dst new_dst + // case 1 0 0 0.0f 0.0f 0.0f + // case 2 ~0 1 1.0f 0.0f 1.0f + // case 3 0 0 0.0f 1.0f 1.0f + // case 4 ~0 1 1.0f 1.0f 1.0f + // step 5: loop: offset src, and do step 2 and step 3 + // ================================================================ + // ***isa: OTHER*** + // ReduceAnd (Logical And) + // step 1: init dst 0x3f800000 (1.0f) + // step 2: if src equals 0, set it 0x00000000, else set 0xffffffff + // step 3: dst = dst & src + // 0x3f800000 = 0x3f800000 & 0xffffffff (result: 1.0f) + // 0x00000000 = 0x3f800000 & 0x00000000 (result: 0.0f) + // 0x00000000 = 0x00000000 & 0xffffffff (result: 0.0f) + // 0x00000000 = 0x00000000 & 0x00000000 (result: 0.0f) + // step 4: loop: offset src, and do step 2 and step 3 + // + // ReduceOr (Logical Or) + // step 1: init dst 0x00000000 (0.0f) + // aux 0x3f800000 (1.0f) + // step 2: dst = dst | src + // 0x00000000 = 0x00000000 | 0x00000000 + // A = 0x00000000 | A + // A = A | 0x00000000 + // C = A | B + // (A, B stand for number other than 0x00000000) + // step 3: loop: offset src, and do step 2 + // step 4: if dst equals 0, set it 0x00000000, else set 0xffffffff + // step 5: dst = dst & aux + // 0x00000000 = 0x00000000 & 0x3f800000 (result: 0.0f) + // 0x3f800000 = 0xffffffff & 0x3f800000 (result: 1.0f) + // ================================================================ + Label reduce_to_scalar_label; + Label reduce_to_gather_label; + Label reduce_main_end_label; + if (planar_layout) { + cmp(reg_work_batch, 0); + je(reduce_to_gather_label, T_NEAR); + + cmp(reg_reduce_w, 1); // planar layout reducing W + je(reduce_to_scalar_label, T_NEAR); + } + + // store v_dst directly into memory after reducing + // cases: [planar layout reducing other dimensions but W] [blocked layout] + { + cmp(reg_work_amount, loop_step); + jl(reduce_main_end_label, T_NEAR); // avoid illegal loading and storing + + if (jcp.reduce_mode == Algorithm::ReduceL1 && !(isa == 
x64::avx512_core && exec_el_type == ov::element::i64)) { + uni_vmovups(v_abs_mask, table_val(1)); + } + + load_dst_vector(); + + reduce_kernel(); + + if (jcp.reduce_mode == Algorithm::ReduceMean) { + auto reg_can_divide = getReg64(); + auto reg_divider = getReg64(); + auto vmm_divider = getVmm(); + Label reduce_divide_end_label; + + mov(reg_can_divide, ptr[reg_params + GET_OFF(can_divide)]); + cmp(reg_can_divide, 0); + je(reduce_divide_end_label, T_NEAR); + { + mov(reg_divider, ptr[reg_params + GET_OFF(divisor)]); + if (exec_el_type.size() == 4) { + uni_vbroadcastss(vmm_divider, ptr[reg_divider]); + } else if (exec_el_type.size() == 8) { + uni_vbroadcastsd(vmm_divider, ptr[reg_divider]); + } + if (exec_el_type == ov::element::f32) { + uni_vdivps(v_dst, v_dst, vmm_divider); + if (isa == x64::sse41) { + uni_vdivps(v_dst_aux, v_dst_aux, vmm_divider); + } + } else if (exec_el_type == ov::element::f64) { + uni_vdivpd(v_dst, v_dst, vmm_divider); + if (isa == x64::sse41) { + uni_vdivpd(v_dst_aux, v_dst_aux, vmm_divider); + } + } else if (exec_el_type == ov::element::i64) { + if (isa == x64::avx512_core) { + vcvtqq2pd(v_dst, v_dst); + } else { + // TODO + } + uni_vdivpd(v_dst, v_dst, vmm_divider); + uni_vroundpd(v_dst, v_dst, 0x3); // Truncation + if (isa == x64::avx512_core) { + vcvtpd2qq(v_dst, v_dst); + } else { + // TODO + } + if (isa == x64::sse41) { + // cvt + uni_vdivpd(v_dst_aux, v_dst_aux, vmm_divider); + // cvt + } + } + } + L(reduce_divide_end_label); + } + + store_dst_vector(); + + jmp(reduce_main_end_label, T_NEAR); + } + + // reduce vector in v_dst to be a scalar before store into memory + // cases: [planar layout reducing W] + L(reduce_to_scalar_label); + { + // init dst, dst loading is embedded in horiz_reduce_store + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: + case Algorithm::ReduceProd: + uni_vmovups(v_dst, table_val(0)); + break; + case Algorithm::ReduceL1: + if (!(isa == x64::avx512_core && exec_el_type == ov::element::i64)) { + 
uni_vmovups(v_abs_mask, table_val(1)); + } + uni_vpxor(v_dst, v_dst, v_dst); + break; + case Algorithm::ReduceL2: + case Algorithm::ReduceLogSum: + case Algorithm::ReduceLogSumExp: + case Algorithm::ReduceMean: + case Algorithm::ReduceOr: + case Algorithm::ReduceSum: + case Algorithm::ReduceSumSquare: + uni_vpxor(v_dst, v_dst, v_dst); + break; + case Algorithm::ReduceMax: + if (isFloatCompatible(jcp.dst_el_type)) { + uni_vmovups(v_dst, table_val(2)); + } else { + uni_vmovups(v_dst, table_val(4)); + } + break; + case Algorithm::ReduceMin: + if (isFloatCompatible(jcp.dst_el_type)) { + uni_vmovups(v_dst, table_val(3)); + } else { + uni_vmovups(v_dst, table_val(5)); + } + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + // reduce + reduce_main_loop(); + if (jcp.reduce_mode == Algorithm::ReduceOr && isa != x64::avx512_core) { + uni_cmpneqps(v_dst, v_dst, v_zero); + uni_vandps(v_dst, v_dst, v_ones); + } + // store + // store after horizontal calculation and calculation with loaded original ptr[reg_dst] + if (exec_el_type == ov::element::f32) { + horiz_reduce_store_ps(v_dst, jcp.dst_el_type, true); + } else if (exec_el_type == ov::element::i64) { + horiz_reduce_store_qq(v_dst, jcp.dst_el_type, true); + } + + jmp(reduce_main_end_label, T_NEAR); + } + + // load v_src with gather, then store v_dst directly into memory after reducing + // cases: [planar layout reducing small W] + L(reduce_to_gather_label); + { + int step = 1; + cmp(reg_work_amount, step); + jl(reduce_main_end_label, T_NEAR); // Avoid illegal loading and storing. 
+ + auto reg_idx = getReg64(); + v_idx = getVmm(); + mov(reg_idx, ptr[reg_params + GET_OFF(idx)]); + uni_vmovdqu(v_idx, ptr[reg_idx]); + + if (jcp.reduce_mode == Algorithm::ReduceL1 && !(isa == x64::avx512_core && exec_el_type == ov::element::i64)) { + uni_vmovups(v_abs_mask, table_val(1)); + } + + // load + load_dst_vector(); + + // reduce + Label reduce_loop_label; + Label reduce_loop_end_label; + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + reduce_gather(v_dst, 0); + if (isa == x64::sse41) { + reduce_gather(v_dst_aux, 4 * jcp.src_el_type.size()); + } + + add(reg_src, step * jcp.src_el_type.size()); + sub(reg_work_amount, step); + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + + // store + store_dst_vector(); + + jmp(reduce_main_end_label, T_NEAR); + } + + L(reduce_main_end_label); +} + +template +void JitReduceKernel::reduce_tail() { + if (jcp.reduce_mode == Algorithm::ReduceL1 && !(isa == x64::avx512_core && exec_el_type == ov::element::i64)) { + auto xmm_abs_mask = Xmm(v_abs_mask.getIdx()); + uni_vmovups(xmm_abs_mask, table_val(1)); + } + + Label tail_dst_shifted_label; + Label tail_dst_fixed_label; + Label reduce_tail_end_label; + if (planar_layout) { + cmp(reg_reduce_w, 1); // planar layout reducing W + je(tail_dst_fixed_label, T_NEAR); + } + + // each src scalar reduce to each dst scalar (X1, X2, X3, ...) -> (Y1, Y2, Y3, ...) + // cases: [planar layout reducing other dimensions but W] [blocked layout concern padding] + L(tail_dst_shifted_label); + { + reduce_kernel_tail(); + + jmp(reduce_tail_end_label, T_NEAR); + } + + // each src scalar reduce to the same dst scalar (X1, X2, X3, ...) 
-> (Y1) + // cases: [planar layout reducing W] + L(tail_dst_fixed_label); + { + auto xmm_dst = Xmm(v_dst.getIdx()); + auto xmm_src = Xmm(v_src.getIdx()); + + // load + load_scalar(xmm_dst, ptr[reg_dst], exec_el_type, jcp.dst_el_type); + + Label reduce_loop_label; + Label reduce_loop_end_label; + + // reduce + int step = 1; + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + load_scalar(xmm_src, ptr[reg_src], exec_el_type, jcp.src_el_type); + + reduce_kernel_scalar(xmm_src, xmm_dst); + if (jcp.reduce_mode == Algorithm::ReduceOr) { + auto xmm_ones = Xmm(v_ones.getIdx()); + auto xmm_zero = Xmm(v_zero.getIdx()); + + if (exec_el_type == ov::element::f32) { + uni_vcmpps(xmm_dst, xmm_dst, xmm_zero, _cmp_neq_uq); + uni_vandps(xmm_dst, xmm_dst, xmm_ones); + } else if (exec_el_type == ov::element::f64 || exec_el_type == ov::element::i64) { + uni_vcmppd(xmm_dst, xmm_dst, xmm_zero, _cmp_neq_uq); + uni_vandpd(xmm_dst, xmm_dst, xmm_ones); + } + } + + add(reg_src, step * jcp.src_el_type.size()); + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + + // store + store_scalar(ptr[reg_dst], xmm_dst, jcp.dst_el_type, exec_el_type); + } + + L(reduce_tail_end_label); +} + +template +void JitReduceKernel::init_reg_reduce_stride() { + auto reg_tmp_64 = getReg64(); + mov(reg_reduce_stride, ptr[reg_params + GET_OFF(reduce_stride)]); + mul_by_const(reg_reduce_stride, reg_tmp_64, jcp.src_el_type.size()); +} + +template +void JitReduceKernel::reduce_kernel() { + Label reduce_label; + Label reduce_end_label; + Label reduce_batch_label; + + cmp(reg_work_batch, 1); + je(reduce_label, T_NEAR); + + init_reg_reduce_stride(); + + L(reduce_batch_label); + { + cmp(reg_work_amount, loop_step); + jl(reduce_end_label, T_NEAR); + + reduce_batch(); + + add(reg_src, loop_step * jcp.src_el_type.size()); + sub(reg_work_amount, loop_step); + jmp(reduce_batch_label, T_NEAR); + } + + L(reduce_label); + { + 
cmp(reg_work_amount, loop_step); + jl(reduce_end_label, T_NEAR); + + reduce_once(); + + add(reg_src, loop_step * jcp.src_el_type.size()); + sub(reg_work_amount, loop_step); + jmp(reduce_label, T_NEAR); + } + L(reduce_end_label); +} + +template +void JitReduceKernel::reduce_once() { + load_vector(v_src, ptr[reg_src], exec_el_type, jcp.src_el_type); + reduce_kernel(v_src, v_dst); + + if (isa == x64::sse41) { + load_vector(v_src, ptr[reg_src + 4 * jcp.src_el_type.size()], exec_el_type, jcp.src_el_type); + reduce_kernel(v_src, v_dst_aux); + } +} + +template +void JitReduceKernel::reduce_batch() { + auto reg_src_aux = getReg64(); + auto reg_work_batch_aux = getReg64(); + + mov(reg_src_aux, reg_src); + mov(reg_work_batch_aux, reg_work_batch); + + Label reduce_batch_loop_label; + Label reduce_batch_loop_end_label; + L(reduce_batch_loop_label); + { + cmp(reg_work_batch_aux, 1); + jl(reduce_batch_loop_end_label, T_NEAR); + + load_vector(v_src, ptr[reg_src_aux], exec_el_type, jcp.src_el_type); + reduce_kernel(v_src, v_dst); + if (isa == x64::sse41) { + load_vector(v_src, ptr[reg_src_aux + 4 * jcp.src_el_type.size()], exec_el_type, jcp.src_el_type); + reduce_kernel(v_src, v_dst_aux); + } + + add(reg_src_aux, reg_reduce_stride); + sub(reg_work_batch_aux, 1); + jmp(reduce_batch_loop_label, T_NEAR); + } + L(reduce_batch_loop_end_label); +} + +template <> +void JitReduceKernel::reduce_gather(const Zmm& vmm_dst, int64_t offset) { + switch (jcp.src_el_type.size()) { + case 8: { + auto ymm_idx = Ymm(v_idx.getIdx()); + + kxnorq(k_mask, k_mask, k_mask); + vgatherdpd((Zmm)v_src | k_mask, ptr[reg_src + offset + ymm_idx]); + if (jcp.src_el_type == ov::element::f64 && exec_el_type == ov::element::i64) { + vcvtpd2qq(v_src, v_src); + } else if (jcp.src_el_type == ov::element::i64 && exec_el_type == ov::element::f64) { + vcvtqq2pd(v_src, v_src); + } + } + break; + case 4: { + kxnord(k_mask, k_mask, k_mask); + vgatherdps((Zmm)v_src | k_mask, ptr[reg_src + offset + v_idx]); + if 
(jcp.src_el_type == ov::element::i32) { + uni_vcvtdq2ps(v_src, v_src); + } + } + break; + case 2: + case 1: + pack_gathered_vector(v_src, v_idx, offset, jcp.src_el_type); + break; + default: + IE_THROW() << "Unkown source element type '" << jcp.src_el_type << "'"; + } + reduce_kernel(v_src, vmm_dst); +} + +template <> +void JitReduceKernel::reduce_gather(const Ymm& vmm_dst, int64_t offset) { + switch (jcp.src_el_type.size()) { + case 8: { + auto v_mask = getVmm(); + auto xmm_idx = Xmm(v_idx.getIdx()); + + uni_vpcmpeqq(v_mask, v_mask, v_mask); + vgatherdpd(v_src, ptr[reg_src + offset + xmm_idx], v_mask); + if (exec_el_type == ov::element::i64) { + // TODO Convert pd tp qq (v_src, v_src); + } + } + break; + case 4: { + auto v_mask = getVmm(); + + uni_vpcmpeqd(v_mask, v_mask, v_mask); + vgatherdps(v_src, ptr[reg_src + offset + v_idx], v_mask); + if (jcp.src_el_type == ov::element::i32) { + uni_vcvtdq2ps(v_src, v_src); + } + } + break; + case 2: + case 1: + pack_gathered_vector(v_src, v_idx, offset, jcp.src_el_type); + break; + default: + IE_THROW() << "Unkown source element type '" << jcp.src_el_type << "'"; + } + reduce_kernel(v_src, vmm_dst); +} + +template <> +void JitReduceKernel::reduce_gather(const Xmm& vmm_dst, int64_t offset) { + pack_gathered_vector(v_src, v_idx, offset, jcp.src_el_type); + reduce_kernel(v_src, vmm_dst); +} + +template +void JitReduceKernel::pack_gathered_vector(const Vmm& vmm_val, const Vmm& vmm_index, int64_t offset, const ov::element::Type& src_el_type) { + sub(rsp, vlen); + uni_vmovdqu(ptr[rsp], vmm_index); + const size_t repeats = vlen / exec_el_type.size(); + auto reg_tmp_64 = getReg64(); + auto reg_tmp_32 = Reg32(reg_tmp_64.getIdx()); + auto reg_tmp_16 = Reg16(reg_tmp_64.getIdx()); + auto reg_tmp_8 = Reg8(reg_tmp_64.getIdx()); + for (size_t i = 0; i < repeats; i++) { + mov(reg_tmp_32, ptr[rsp + i * sizeof(int)]); + Address table_idx = ptr[reg_src + offset + reg_tmp_64]; + + switch (src_el_type.size()) { + case 8: + mov(reg_tmp_64, 
table_idx); + mov(ptr[rsp + i * sizeof(int64_t)], reg_tmp_64); + break; + case 4: + mov(reg_tmp_32, table_idx); + mov(ptr[rsp + i * sizeof(int32_t)], reg_tmp_32); + break; + case 2: + mov(reg_tmp_16, table_idx); + mov(ptr[rsp + i * sizeof(ov::intel_cpu::bfloat16_t)], reg_tmp_16); + break; + case 1: + mov(reg_tmp_8, table_idx); + mov(ptr[rsp + i * sizeof(char)], reg_tmp_8); + break; + default: + IE_THROW() << "Unkown source element type '" << src_el_type << "'"; + } + } + + switch (src_el_type) { + case ov::element::f64: + case ov::element::f32: + case ov::element::i64: + case ov::element::i32: + uni_vmovups(vmm_val, ptr[rsp]); + break; + case ov::element::bf16: + uni_vpmovzxwd(vmm_val, ptr[rsp]); + uni_vpslld(vmm_val, vmm_val, 16); + break; + case ov::element::i8: + uni_vpmovsxbd(vmm_val, ptr[rsp]); + break; + case ov::element::u8: + uni_vpmovzxbd(vmm_val, ptr[rsp]); + break; + default: + IE_THROW() << "Unkown source element type '" << src_el_type << "'"; + } + + if (!isFloatCompatible(src_el_type)) { + uni_vcvtdq2ps(vmm_val, vmm_val); // TODO i64? 
+ } + add(rsp, vlen); +} + +template +void JitReduceKernel::reduce_kernel_tail() { + Label reduce_label; + Label reduce_end_label; + Label reduce_batch_label; + auto xmm_dst = Xmm(v_dst.getIdx()); + + int step = 1; + cmp(reg_work_batch, 1); + je(reduce_label, T_NEAR); + + init_reg_reduce_stride(); + + L(reduce_batch_label); + { + cmp(reg_work_amount, step); + jl(reduce_end_label, T_NEAR); + + load_scalar(xmm_dst, ptr[reg_dst], exec_el_type, jcp.dst_el_type); + + reduce_batch_tail(); + + store_scalar(ptr[reg_dst], xmm_dst, jcp.dst_el_type, exec_el_type); + + add(reg_dst, step * jcp.dst_el_type.size()); + add(reg_src, step * jcp.src_el_type.size()); + sub(reg_work_amount, step); + + jmp(reduce_batch_label, T_NEAR); + } + + L(reduce_label); + { + cmp(reg_work_amount, step); + jl(reduce_end_label, T_NEAR); + + load_scalar(xmm_dst, ptr[reg_dst], exec_el_type, jcp.dst_el_type); + + reduce_batch_tail(); + + store_scalar(ptr[reg_dst], xmm_dst, jcp.dst_el_type, exec_el_type); + + add(reg_dst, step * jcp.dst_el_type.size()); + add(reg_src, step * jcp.src_el_type.size()); + sub(reg_work_amount, step); + + jmp(reduce_label, T_NEAR); + } + L(reduce_end_label); +} + +template +void JitReduceKernel::reduce_once_tail() { + auto xmm_dst = Xmm(v_dst.getIdx()); + auto xmm_src = Xmm(v_src.getIdx()); + + load_scalar(xmm_src, ptr[reg_src], exec_el_type, jcp.src_el_type); + reduce_kernel_scalar(xmm_src, xmm_dst); + if (jcp.reduce_mode == Algorithm::ReduceOr) { + auto xmm_zero = Xmm(v_zero.getIdx()); + auto xmm_ones = Xmm(v_ones.getIdx()); + + if (exec_el_type == ov::element::f32) { + uni_cmpneqps(xmm_dst, xmm_dst, xmm_zero); + uni_vandps(xmm_dst, xmm_dst, xmm_ones); + } else if (exec_el_type == ov::element::f64 || exec_el_type == ov::element::i64) { + uni_vcmppd(xmm_dst, xmm_dst, xmm_zero, _cmp_neq_uq); + uni_vandpd(xmm_dst, xmm_dst, xmm_ones); + } + } +} + +template +void JitReduceKernel::reduce_batch_tail() { + auto reg_src_aux = getReg64(); + auto reg_work_batch_aux = getReg64(); + 
auto xmm_src = Xmm(v_src.getIdx()); + auto xmm_dst = Xmm(v_dst.getIdx()); + + mov(reg_src_aux, reg_src); + mov(reg_work_batch_aux, reg_work_batch); + + Label reduce_batch_loop_label; + Label reduce_batch_loop_end_label; + L(reduce_batch_loop_label); + { + cmp(reg_work_batch_aux, 1); + jl(reduce_batch_loop_end_label, T_NEAR); + + load_scalar(xmm_src, ptr[reg_src_aux], exec_el_type, jcp.src_el_type); + reduce_kernel_scalar(xmm_src, xmm_dst); + if (jcp.reduce_mode == Algorithm::ReduceOr) { + auto xmm_zero = Xmm(v_zero.getIdx()); + auto xmm_ones = Xmm(v_ones.getIdx()); + + if (exec_el_type == ov::element::f32) { + uni_cmpneqps(xmm_dst, xmm_dst, xmm_zero); + uni_vandps(xmm_dst, xmm_dst, xmm_ones); + } else if (exec_el_type == ov::element::f64 || exec_el_type == ov::element::i64) { + uni_vcmppd(xmm_dst, xmm_dst, xmm_zero, _cmp_neq_uq); + uni_vandpd(xmm_dst, xmm_dst, xmm_ones); + } + } + + add(reg_src_aux, reg_reduce_stride); + sub(reg_work_batch_aux, 1); + jmp(reduce_batch_loop_label, T_NEAR); + } + L(reduce_batch_loop_end_label); +} + +template +void JitReduceKernel::reduce_main_loop() { + Label reduce_loop_label; + Label reduce_loop_end_label; + + L(reduce_loop_label); + { + cmp(reg_work_amount, loop_step); + jl(reduce_loop_end_label, T_NEAR); + + load_vector(v_src, ptr[reg_src], exec_el_type, jcp.src_el_type); + reduce_kernel(v_src, v_dst); + + if (isa == x64::sse41) { + load_vector(v_src, ptr[reg_src + 4 * jcp.src_el_type.size()], exec_el_type, jcp.src_el_type); + reduce_kernel(v_src, v_dst); + } + + add(reg_src, loop_step * jcp.src_el_type.size()); + sub(reg_work_amount, loop_step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); +} + +template +void JitReduceKernel::reduce_kernel(const Vmm& vmm_src, const Vmm& vmm_dst) { + const size_t src_idx = static_cast(vmm_src.getIdx()); + const size_t dst_idx = static_cast(vmm_dst.getIdx()); + + if (exec_el_type == ov::element::f32) { + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: + if (isa == 
x64::avx512_core) { + vcmpps(k_mask, vmm_src, v_zero, _cmp_neq_uq); + vmovups(vmm_dst | k_mask | T_z, vmm_dst); + } else { + uni_cmpneqps(vmm_src, vmm_src, v_zero); + uni_vandps(vmm_dst, vmm_dst, vmm_src); + } + break; + case Algorithm::ReduceL1: + uni_vandps(vmm_src, vmm_src, v_abs_mask); + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceMax: + uni_vmaxps(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceMin: + uni_vminps(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceL2: + case Algorithm::ReduceSumSquare: + uni_vmulps(vmm_src, vmm_src, vmm_src); + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceLogSumExp: + exp_injector->compute_vector_range(src_idx, src_idx + 1); + uni_vaddps(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceOr: + if (isa == x64::avx512_core) { + vcmpps(k_mask, vmm_src, v_zero, _cmp_neq_uq); + vorps(vmm_dst | k_mask, vmm_dst, v_ones); + } else { + uni_vorps(vmm_dst, vmm_dst, vmm_src); + } + break; + case Algorithm::ReduceProd: + uni_vmulps(vmm_dst, vmm_dst, vmm_src); + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + } else if (exec_el_type == ov::element::f64) { + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: + if (isa == x64::avx512_core) { + vcmppd(k_mask, vmm_src, v_zero, _cmp_neq_uq); + vmovupd(vmm_dst | k_mask | T_z, vmm_dst); // keep dst where src != 0, zero it elsewhere (same masking as the f32 branch) + } else { + uni_vcmppd(vmm_src, vmm_src, v_zero, _cmp_neq_uq); + uni_vandpd(vmm_dst, vmm_dst, vmm_src); + } + break; + case Algorithm::ReduceL1: + uni_vandpd(vmm_src, vmm_src, v_abs_mask); + uni_vaddpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + uni_vaddpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceMax: + 
uni_vmaxpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceMin: + uni_vminpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceL2: + case Algorithm::ReduceSumSquare: + uni_vmulpd(vmm_src, vmm_src, vmm_src); + uni_vaddpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceLogSumExp: + exp_injector->compute_vector_range(src_idx, src_idx + 1); + uni_vaddpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceOr: + if (isa == x64::avx512_core) { + vcmppd(k_mask, vmm_src, v_zero, _cmp_neq_uq); + vblendmpd(vmm_src | k_mask, v_zero, v_ones); // 64-bit lane blend, matching the vcmppd mask width + } + uni_vorpd(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceProd: + uni_vmulpd(vmm_dst, vmm_dst, vmm_src); + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + } else if (exec_el_type == ov::element::i64) { + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: + if (isa == x64::avx512_core) { + vcmppd(k_mask, vmm_src, v_zero, _cmp_neq_uq); + vmovupd(vmm_dst | k_mask | T_z, vmm_dst); // 64-bit lane masked move, matching the vcmppd mask width + } else { + uni_vcmppd(vmm_src, vmm_src, v_zero, _cmp_neq_uq); + uni_vandpd(vmm_dst, vmm_dst, vmm_src); + } + break; + case Algorithm::ReduceL1: + if (isa == x64::avx512_core) { + vpabsq(vmm_src, vmm_src); + } else { + uni_vandpd(vmm_src, vmm_src, v_abs_mask); + } + uni_vpaddq(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + uni_vpaddq(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceMax: + if (isa == x64::avx512_core) { + max_emitter->emit_code({dst_idx, src_idx}, {dst_idx}); + } else { + auto vmm_aux_0 = getVmm(); + max_emitter->emit_code({dst_idx, src_idx}, {dst_idx}, {vmm_aux_0.getIdx()}); + } + break; + case Algorithm::ReduceMin: + if (isa == x64::avx512_core) { + min_emitter->emit_code({dst_idx, src_idx}, {dst_idx}); + } else { + auto vmm_aux_0 = getVmm(); + min_emitter->emit_code({dst_idx, src_idx}, {dst_idx}, {vmm_aux_0.getIdx()}); + } + 
break; + case Algorithm::ReduceL2: + case Algorithm::ReduceSumSquare: + if (isa == x64::avx512_core) { + mul_emitter->emit_code({src_idx, src_idx}, {src_idx}); + } else { + auto vmm_aux_0 = getVmm(); + auto vmm_aux_1 = getVmm(); + mul_emitter->emit_code({src_idx, src_idx}, {src_idx}, {vmm_aux_0.getIdx(), vmm_aux_1.getIdx()}); + } + uni_vpaddq(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceLogSumExp: + exp_injector->compute_vector_range(src_idx, src_idx + 1); + uni_vpaddq(vmm_dst, vmm_dst, vmm_src); + break; + case Algorithm::ReduceOr: + if (isa == x64::avx512_core) { + // vcmppd(k_mask, vmm_src, v_zero, _cmp_neq_uq); + // vblendmps(vmm_src | k_mask, v_zero, v_ones); + vcmppd(k_mask, vmm_src, v_zero, _cmp_neq_uq); + vorpd(vmm_dst | k_mask, vmm_dst, v_ones); + } else { + uni_vorpd(vmm_dst, vmm_dst, vmm_src); + } + break; + case Algorithm::ReduceProd: + if (isa == x64::avx512_core) { + mul_emitter->emit_code({dst_idx, src_idx}, {dst_idx}); + } else { + auto vmm_aux_0 = getVmm(); + auto vmm_aux_1 = getVmm(); + mul_emitter->emit_code({dst_idx, src_idx}, {dst_idx}, {vmm_aux_0.getIdx(), vmm_aux_1.getIdx()}); + } + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + } +} + +template +void JitReduceKernel::reduce_kernel_scalar(const Xmm& xmm_src, const Xmm& xmm_dst) { + if (exec_el_type == ov::element::f32) { + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: { + auto xmm_zero = Xmm(v_zero.getIdx()); + uni_cmpneqps(xmm_src, xmm_src, xmm_zero); + uni_vandps(xmm_dst, xmm_dst, xmm_src); + } break; + case Algorithm::ReduceL1: { + auto xmm_abs_mask = Xmm(v_abs_mask.getIdx()); + uni_vandps(xmm_src, xmm_src, xmm_abs_mask); + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + } break; + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceMax: + uni_vmaxps(xmm_dst, xmm_dst, xmm_src); + break; + case 
Algorithm::ReduceMin: + uni_vminps(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceL2: + case Algorithm::ReduceSumSquare: + uni_vmulps(xmm_src, xmm_src, xmm_src); + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceLogSumExp: + exp_injector->compute_vector_range(xmm_src.getIdx(), xmm_src.getIdx() + 1); + uni_vaddps(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceOr: + uni_vorps(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceProd: + uni_vmulps(xmm_dst, xmm_dst, xmm_src); + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + } else if (exec_el_type == ov::element::f64) { + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: { + auto xmm_zero = Xmm(v_zero.getIdx()); + uni_vcmppd(xmm_src, xmm_src, xmm_zero, _cmp_neq_uq); + uni_vandpd(xmm_dst, xmm_dst, xmm_src); + } break; + case Algorithm::ReduceL1: { + auto xmm_abs_mask = Xmm(v_abs_mask.getIdx()); + uni_vandpd(xmm_src, xmm_src, xmm_abs_mask); + uni_vaddpd(xmm_dst, xmm_dst, xmm_src); // accumulate |src|, not the abs-mask constant + } break; + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + uni_vaddpd(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceMax: + uni_vmaxpd(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceMin: + uni_vminpd(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceL2: + case Algorithm::ReduceSumSquare: + uni_vmulpd(xmm_src, xmm_src, xmm_src); + uni_vaddpd(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceLogSumExp: + exp_injector->compute_vector_range(xmm_src.getIdx(), xmm_src.getIdx() + 1); + uni_vaddpd(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceOr: + uni_vorpd(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceProd: + uni_vmulpd(xmm_dst, xmm_dst, xmm_src); + break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + } else if (exec_el_type == 
ov::element::i64) { + switch (jcp.reduce_mode) { + case Algorithm::ReduceAnd: { + auto xmm_zero = Xmm(v_zero.getIdx()); + uni_vcmppd(xmm_src, xmm_src, xmm_zero, _cmp_neq_uq); + uni_vandpd(xmm_dst, xmm_dst, xmm_src); + } break; + case Algorithm::ReduceL1: + if (isa == x64::avx512_core) { + vpabsq(xmm_src, xmm_src); + } else { + auto xmm_abs_mask = Xmm(v_abs_mask.getIdx()); + uni_vandpd(xmm_src, xmm_src, xmm_abs_mask); + } + uni_vpaddq(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceLogSum: + case Algorithm::ReduceMean: + case Algorithm::ReduceSum: + uni_vpaddq(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceMax: { + const size_t src_idx = static_cast(xmm_src.getIdx()), dst_idx = static_cast(xmm_dst.getIdx()); + if (isa == x64::avx512_core) { + max_emitter->emit_code({dst_idx, src_idx}, {dst_idx}); + } else { + auto vmm_aux_0 = getVmm(); + max_emitter->emit_code({dst_idx, src_idx}, {dst_idx}, {vmm_aux_0.getIdx()}); + } + } break; + case Algorithm::ReduceMin: { + const size_t src_idx = static_cast(xmm_src.getIdx()), dst_idx = static_cast(xmm_dst.getIdx()); + if (isa == x64::avx512_core) { + min_emitter->emit_code({dst_idx, src_idx}, {dst_idx}); + } else { + auto vmm_aux_0 = getVmm(); + min_emitter->emit_code({dst_idx, src_idx}, {dst_idx}, {vmm_aux_0.getIdx()}); + } + } break; + case Algorithm::ReduceL2: + case Algorithm::ReduceSumSquare: { + const size_t src_idx = static_cast(xmm_src.getIdx()); + if (isa == x64::avx512_core) { + mul_emitter->emit_code({src_idx, src_idx}, {src_idx}); + } else { + auto vmm_aux_0 = getVmm(); + auto vmm_aux_1 = getVmm(); + mul_emitter->emit_code({src_idx, src_idx}, {src_idx}, {vmm_aux_0.getIdx(), vmm_aux_1.getIdx()}); + } + uni_vpaddq(xmm_dst, xmm_dst, xmm_src); + } break; + case Algorithm::ReduceLogSumExp: + exp_injector->compute_vector_range(xmm_src.getIdx(), xmm_src.getIdx() + 1); + uni_vpaddq(xmm_dst, xmm_dst, xmm_src); + break; + case Algorithm::ReduceOr: + uni_vorpd(xmm_dst, xmm_dst, xmm_src); + break; + 
case Algorithm::ReduceProd: { + const size_t src_idx = static_cast(xmm_src.getIdx()), dst_idx = static_cast(xmm_dst.getIdx()); + if (isa == x64::avx512_core) { + mul_emitter->emit_code({dst_idx, src_idx}, {dst_idx}); + } else { + auto vmm_aux_0 = getVmm(); + auto vmm_aux_1 = getVmm(); + mul_emitter->emit_code({dst_idx, src_idx}, {dst_idx}, {vmm_aux_0.getIdx(), vmm_aux_1.getIdx()}); + } + } break; + default: + IE_THROW() << "Unsupported reduce mode '" << algToString(jcp.reduce_mode) << "'"; + } + } +} + +template +void JitReduceKernel::load_dst_vector() { + load_vector(v_dst, ptr[reg_dst], exec_el_type, jcp.dst_el_type); + if (isa == x64::sse41) { + load_vector(v_dst_aux, ptr[reg_dst + 4 * jcp.dst_el_type.size()], exec_el_type, jcp.dst_el_type); + } +} + +template +void JitReduceKernel::store_dst_vector() { + if (jcp.reduce_mode == Algorithm::ReduceOr && isa != x64::avx512_core) { + if (exec_el_type == ov::element::f32) { + uni_cmpneqps(v_dst, v_dst, v_zero); + uni_vandps(v_dst, v_dst, v_ones); + } else if (exec_el_type == ov::element::f64 || exec_el_type == ov::element::i64) { + uni_vcmppd(v_dst, v_dst, v_zero, _cmp_neq_uq); + uni_vandpd(v_dst, v_dst, v_ones); + } + + if (isa == x64::sse41) { + uni_cmpneqps(v_dst_aux, v_dst_aux, v_zero); + uni_vandps(v_dst_aux, v_dst_aux, v_ones); + } + } + store_vector(ptr[reg_dst], v_dst, jcp.dst_el_type, exec_el_type); + if (isa == x64::sse41) { + store_vector(ptr[reg_dst + 4 * jcp.dst_el_type.size()], v_dst_aux, jcp.dst_el_type, exec_el_type); + } +} + +template +void JitReduceKernel::prepare_aux_table() { + auto broadcast_int32 = [&](uint32_t val) { + for (size_t d = 0; d < vlen / exec_el_type.size(); ++d) { + dd(val); + } + }; + auto broadcast_int64 = [&](uint64_t val) { + for (size_t d = 0; d < vlen / exec_el_type.size(); ++d) { + dq(val); + } + }; + + align(64); + L(l_table); + + if (exec_el_type == ov::element::f32) { + broadcast_int32(aux_vals.float_one); + broadcast_int32(aux_vals.float_abs); + 
broadcast_int32(aux_vals.float_min); + broadcast_int32(aux_vals.float_max); + broadcast_int32(aux_vals.float_int32_min); + broadcast_int32(aux_vals.float_int32_max); + } else if (exec_el_type == ov::element::f64) { + broadcast_int64(aux_vals.double_one); + broadcast_int64(aux_vals.double_abs); + broadcast_int64(aux_vals.double_min); + broadcast_int64(aux_vals.double_max); + broadcast_int64(aux_vals.double_int64_min); + broadcast_int64(aux_vals.double_int64_max); + } else if (exec_el_type == ov::element::i64) { + broadcast_int64(aux_vals.int64_one); + broadcast_int64(aux_vals.int64_abs); + broadcast_int64(aux_vals.int64_min); + broadcast_int64(aux_vals.int64_max); + broadcast_int64(aux_vals.int64_min); + broadcast_int64(aux_vals.int64_max); + } +} + +/////////////////////////////// +///// JitReducePostKernel ///// +/////////////////////////////// + +template +JitReducePostKernel::JitReducePostKernel(const JitReduceConfigParams& jcp, const dnnl_primitive_attr& attr) + : JitReduceKernelBase(jit_name(), jcp, isa), attr(attr) { + post_reduce = one_of(jcp.reduce_mode, Algorithm::ReduceL2, Algorithm::ReduceMean, Algorithm::ReduceLogSum, Algorithm::ReduceLogSumExp); + post_ops_fusing = attr.post_ops_.len() != 0; + + loop_step = vlen / exec_el_type.size(); + if (isa == x64::sse41) { + loop_step *= 2; + } + + if (jcp.reduce_mode == Algorithm::ReduceLogSum || jcp.reduce_mode == Algorithm::ReduceLogSumExp) { + log_injector = std::make_shared>(this, dnnl::impl::alg_kind::eltwise_log, 0.f, 0.f, 1.f); + } + + if (jcp.reduce_mode == Algorithm::ReduceMean) { + division_emitter = std::make_shared(this, isa, InferenceEngine::details::convertPrecision(exec_el_type)); + division_emitter->second_is_float = true; + } + if (jcp.reduce_mode == Algorithm::ReduceL2) { + sqrt_emitter = std::make_shared(this, isa, InferenceEngine::details::convertPrecision(exec_el_type)); + sqrt_emitter->rounding_type = jit_emitter::RoundType::truncation; + } +} + +template +void 
JitReducePostKernel::generate() { + registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); + + const auto &p = attr.post_ops_; + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(std::make_shared>( + this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale)); + } else if (post_op.is_depthwise()) { + if (!reg_d_weights.isInitialized()) { + reg_d_weights = getReg64(); + } + depthwise_injectors.push_back(std::make_shared>( + this, post_op)); + } else if (post_op.is_quantization()) { + if (!reg_d_weights.isInitialized()) { + reg_d_weights = getReg64(); + } + if (!reg_d_bias.isInitialized()) { + reg_d_bias = getReg64(); + } + if (!v_d_weights.isInitialized()) { + v_d_weights = getVmm(); + } + if (!v_d_bias.isInitialized()) { + v_d_bias = getVmm(); + } + quantization_injectors.push_back(std::make_shared>( + this, post_op, v_d_weights, v_d_bias, reg_d_weights, reg_d_bias)); + } + } + + this->preamble(); + + reg_dst = getReg64(); + reg_work_amount = getReg64(); + mov(reg_dst, ptr[reg_params + GET_OFF_POST(dst)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF_POST(work_amount)]); + + v_dst = getVmm(); + + if (!planar_layout) { + reg_reduce_c = getReg64(); + mov(reg_reduce_c, ptr[reg_params + GET_OFF_POST(reduce_c)]); + } + if (post_ops_fusing) { + reg_oc_off = getReg64(); + reg_post_ops_data = getReg64(); + mov(reg_post_ops_data, ptr[reg_params + GET_OFF_POST(post_op_data)]); + mov(reg_oc_off, ptr[reg_params + GET_OFF_POST(oc_off)]); + } + if (jcp.reduce_mode == Algorithm::ReduceMean) { + v_divisor = getVmm(); + reg_divisor = getReg64(); + mov(reg_divisor, ptr[reg_params + GET_OFF_POST(divisor)]); + } + if (jcp.fuse_low_precision) { + reg_src = getReg64(); + mov(reg_src, ptr[reg_params + GET_OFF_POST(src)]); + } + + if (jcp.layout == ReduceLayoutType::reduce_blocked) { + reduce_post_main(); + } else if (jcp.layout == 
ReduceLayoutType::reduce_nspc && post_ops_fusing) { + auto reg_channel_size = getReg64(); + auto reg_total_work_amount = getReg64(); + // the tail of channel dimension should always be concerned during post ops fusing for nspc layout + Label reduce_nspc_loop_label; + Label reduce_nspc_loop_end_label; + mov(reg_channel_size, ptr[reg_params + GET_OFF_POST(channel_size)]); + mov(reg_total_work_amount, reg_work_amount); + L(reduce_nspc_loop_label); + { + cmp(reg_total_work_amount, 0); + jle(reduce_nspc_loop_end_label, T_NEAR); + + mov(reg_oc_off, 0); + mov(reg_work_amount, reg_channel_size); + reduce_post_main(); + reduce_post_tail(); + + sub(reg_total_work_amount, reg_channel_size); + jmp(reduce_nspc_loop_label, T_NEAR); + } + L(reduce_nspc_loop_end_label); + } else { + reduce_post_main(); + reduce_post_tail(); + } + + registersPool.reset(); + + this->postamble(); + + if (vcvtneps2bf16) { + vcvtneps2bf16->emit_data(); + } + if (max_emitter) { + max_emitter->emit_data(); + } + if (min_emitter) { + min_emitter->emit_data(); + } + if (mul_emitter) { + mul_emitter->emit_data(); + } + if (division_emitter) { + division_emitter->emit_data(); + } + if (sqrt_emitter) { + sqrt_emitter->emit_data(); + } + if (one_of(jcp.reduce_mode, Algorithm::ReduceLogSum, Algorithm::ReduceLogSumExp)) { + log_injector->prepare_table(); + } + for (auto& inj : eltwise_injectors) { + inj->prepare_table(); + } +} + +template +void JitReducePostKernel::reduce_post_main() { + Label reduce_map_label; + if (planar_layout) { + jmp(reduce_map_label, T_NEAR); + } else { + cmp(reg_reduce_c, 1); + jne(reduce_map_label, T_NEAR); + } + + // further reduce channel block since reduce channel batch has already been reduced + // (X1, X2, X3, X4, X5, X6, X7, X8) -> (Y1, N/A, N/A, N/A, N/A, N/A, N/A, N/A) + // cases: [blocked layout reducing channel dimensions] + { + Label reduce_loop_label; + Label reduce_loop_end_label; + RegistersPool::Reg v_dst_aux; + if (isa == x64::sse41) { + v_dst_aux = getVmm(); + } + + // 
int step = vlen / exec_el_type.size() < 8 ? 8 : vlen / exec_el_type.size(); + L(reduce_loop_label); + { + cmp(reg_work_amount, loop_step); + jl(reduce_loop_end_label, T_NEAR); + + // load + wrap_load_vector(v_dst, exec_el_type, jcp.dst_el_type, 0); + if (isa == x64::sse41) { + wrap_load_vector(v_dst_aux, exec_el_type, jcp.dst_el_type, 4); + } + + // reduce and store + if (exec_el_type == ov::element::f32) { + horiz_reduce_store_ps(v_dst, jcp.dst_el_type); + } else if (exec_el_type == ov::element::i64) { + horiz_reduce_store_qq(v_dst, jcp.dst_el_type); + } + if (isa == x64::sse41) { + if (exec_el_type == ov::element::f32) { + horiz_reduce_store_ps(v_dst_aux, jcp.dst_el_type, true); + } else if (exec_el_type == ov::element::i64) { + horiz_reduce_store_qq(v_dst_aux, jcp.dst_el_type, true); + } + } + + add(reg_dst, loop_step * jcp.dst_el_type.size()); + if (jcp.fuse_low_precision) { + add(reg_src, loop_step * sizeof(float)); // TODO i64 fusing + } + sub(reg_work_amount, loop_step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + + if (post_reduce || post_ops_fusing) { + mov(reg_dst, ptr[reg_params + GET_OFF_POST(dst)]); + if (jcp.fuse_low_precision) + mov(reg_src, ptr[reg_params + GET_OFF_POST(src)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF_POST(work_amount)]); + } + } + + // reduce map for value in dst memory + // cases: [ReduceL2] [ReduceLogSum] [ReduceLogSumExp] [ReduceMean] + L(reduce_map_label); + { + if (post_reduce) { + if (jcp.reduce_mode == Algorithm::ReduceMean) { + if (exec_el_type.size() == 4) { + uni_vbroadcastss(v_divisor, ptr[reg_divisor]); + } else if (exec_el_type.size() == 8) { + uni_vbroadcastsd(v_divisor, ptr[reg_divisor]); + } + } + + Label reduce_loop_label; + Label reduce_loop_end_label; + + L(reduce_loop_label); + { + cmp(reg_work_amount, loop_step); + jl(reduce_loop_end_label, T_NEAR); + + wrap_load_vector(v_dst, exec_el_type, jcp.dst_el_type, 0); + reduce_map_kernel(v_dst); + if (post_ops_fusing) { + 
apply_post_ops(jcp.dst_el_type, jcp.layout == ReduceLayoutType::reduce_ncsp); + } + store_vector(ptr[reg_dst], v_dst, jcp.dst_el_type, exec_el_type); + + if (isa == x64::sse41) { + wrap_load_vector(v_dst, exec_el_type, jcp.dst_el_type, 4); + reduce_map_kernel(v_dst); + if (post_ops_fusing) { + if (jcp.layout != ReduceLayoutType::reduce_ncsp) { + add(reg_oc_off, 4 * exec_el_type.size()); + } + apply_post_ops(jcp.dst_el_type, jcp.layout == ReduceLayoutType::reduce_ncsp); + if (jcp.layout != ReduceLayoutType::reduce_ncsp) { + sub(reg_oc_off, 4 * exec_el_type.size()); + } + } + store_vector(ptr[reg_dst + 4 * jcp.dst_el_type.size()], v_dst, jcp.dst_el_type, exec_el_type); + } + + add(reg_dst, loop_step * jcp.dst_el_type.size()); + if (jcp.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) { + add(reg_oc_off, loop_step * exec_el_type.size()); + } + sub(reg_work_amount, loop_step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + } else { + if (post_ops_fusing) { + Label reduce_loop_label; + Label reduce_loop_end_label; + + L(reduce_loop_label); + { + cmp(reg_work_amount, loop_step); + jl(reduce_loop_end_label, T_NEAR); + + load_vector(v_dst, ptr[reg_dst], exec_el_type, jcp.dst_el_type); + apply_post_ops(jcp.dst_el_type, jcp.layout == ReduceLayoutType::reduce_ncsp); + store_vector(ptr[reg_dst], v_dst, jcp.dst_el_type, exec_el_type); + + if (isa == x64::sse41) { + load_vector(v_dst, ptr[reg_dst + 4 * jcp.dst_el_type.size()], exec_el_type, jcp.dst_el_type); + if (jcp.layout != ReduceLayoutType::reduce_ncsp) { + add(reg_oc_off, 4 * exec_el_type.size()); + } + apply_post_ops(jcp.dst_el_type, jcp.layout == ReduceLayoutType::reduce_ncsp); + if (jcp.layout != ReduceLayoutType::reduce_ncsp) { + sub(reg_oc_off, 4 * exec_el_type.size()); + } + store_vector(ptr[reg_dst + 4 * jcp.dst_el_type.size()], v_dst, jcp.dst_el_type, exec_el_type); + } + + add(reg_dst, loop_step * jcp.dst_el_type.size()); + if (jcp.fuse_low_precision) { + add(reg_src, loop_step * 
sizeof(float)); //TODO i64 + } + if (jcp.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) { + add(reg_oc_off, loop_step * exec_el_type.size()); + } + sub(reg_work_amount, loop_step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + } + } + } +} + +template +void JitReducePostKernel::reduce_post_tail() { + // reduce map for tail in dst memory + // cases: [ReduceL2] [ReduceLogSum] [ReduceLogSumExp] [ReduceMean] in planar layout + auto xmm_dst = Xmm(v_dst.getIdx()); + if (one_of(jcp.reduce_mode, Algorithm::ReduceL2, Algorithm::ReduceMean, Algorithm::ReduceLogSum, Algorithm::ReduceLogSumExp)) { + if (jcp.reduce_mode == Algorithm::ReduceMean) { + auto xmm_divisor = Xmm(v_divisor.getIdx()); + if (exec_el_type.size() == 4) { + uni_vbroadcastss(xmm_divisor, ptr[reg_divisor]); + } else if (exec_el_type.size() == 8) { + auto ymm_aux = Ymm(xmm_divisor.getIdx()); + vbroadcastsd(ymm_aux, ptr[reg_divisor]); + } + } + + Label reduce_loop_label; + Label reduce_loop_end_label; + + int step = 1; + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + wrap_load_scalar(xmm_dst, exec_el_type, jcp.dst_el_type, 0); + + reduce_map_kernel_scalar(xmm_dst); + + if (post_ops_fusing) { + apply_post_ops(jcp.dst_el_type, jcp.layout == ReduceLayoutType::reduce_ncsp); + } + store_scalar(ptr[reg_dst], xmm_dst, jcp.dst_el_type, exec_el_type); + + add(reg_dst, step * jcp.dst_el_type.size()); + if (jcp.fuse_low_precision) { + add(reg_src, step * sizeof(float)); // TODO i64 + } + if (jcp.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) { + add(reg_oc_off, step * exec_el_type.size()); + } + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + } else { + if (post_ops_fusing) { + Label reduce_loop_label; + Label reduce_loop_end_label; + + int step = 1; + L(reduce_loop_label); + { + cmp(reg_work_amount, step); + jl(reduce_loop_end_label, T_NEAR); + + wrap_load_scalar(xmm_dst, 
exec_el_type, jcp.dst_el_type, 0); + + apply_post_ops(jcp.dst_el_type, jcp.layout == ReduceLayoutType::reduce_ncsp); + store_scalar(ptr[reg_dst], xmm_dst, jcp.dst_el_type, exec_el_type); + + add(reg_dst, step * jcp.dst_el_type.size()); + if (jcp.fuse_low_precision) { + add(reg_src, step * sizeof(float)); // TODO i64 + } + if (jcp.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) { + add(reg_oc_off, step * exec_el_type.size()); + } + sub(reg_work_amount, step); + + jmp(reduce_loop_label, T_NEAR); + } + L(reduce_loop_end_label); + } + } +} + +template +void JitReducePostKernel::apply_post_ops(const ov::element::Type& dst_el_type, bool is_broadcast) { + const auto &p = attr.post_ops_; + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + int quantization_inj_idx = 0; + int post_ops_data_offset = 0; + for (int i = 0; i < p.len(); i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(v_dst.getIdx(), v_dst.getIdx() + 1); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + mov(reg_d_weights, ptr[reg_post_ops_data + post_ops_data_offset]); + add(reg_d_weights, reg_oc_off); + + depthwise_injectors[depthwise_inj_idx]->compute_vector_range( + v_dst.getIdx(), v_dst.getIdx() + 1, reg_d_weights, reg_d_weights, is_broadcast); + + post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); + depthwise_inj_idx++; + } else if (post_op.is_quantization()) { + bool do_dequantization = post_op.quantization.alg == dnnl::impl::alg_kind::quantization_quantize_dequantize; + bool do_rounding = do_dequantization || isFloatCompatible(dst_el_type) || i != p.len() - 1; + + int s_idx = v_dst.getIdx(); + + quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, 0, is_broadcast); + + 
quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); + + if (do_dequantization) { + quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, 0, is_broadcast); + } + + post_ops_data_offset += quantization_injectors[quantization_inj_idx]->memoryStep(); + quantization_inj_idx++; + } + } +} + +template +void JitReducePostKernel::reduce_map_kernel(const Vmm& vmm_dst) { + if (jcp.reduce_mode == Algorithm::ReduceMean) { + division_emitter->emit_code({ vmm_dst.getIdx(), v_divisor.getIdx() }, { vmm_dst.getIdx() }); + } else if (jcp.reduce_mode == Algorithm::ReduceL2) { + sqrt_emitter->emit_code({ vmm_dst.getIdx() }, { vmm_dst.getIdx() }); + } else if (one_of(jcp.reduce_mode, Algorithm::ReduceLogSum, Algorithm::ReduceLogSumExp)) { + log_injector->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1); + } +} + +template +void JitReducePostKernel::reduce_map_kernel_scalar(const Xmm& xmm_dst) { + if (jcp.reduce_mode == Algorithm::ReduceMean) { + division_emitter->emit_code({ xmm_dst.getIdx(), v_divisor.getIdx() }, { xmm_dst.getIdx() }); + } else if (jcp.reduce_mode == Algorithm::ReduceL2) { + sqrt_emitter->emit_code({ xmm_dst.getIdx() }, { xmm_dst.getIdx() }); + } else if (one_of(jcp.reduce_mode, Algorithm::ReduceLogSum, Algorithm::ReduceLogSumExp)) { + log_injector->compute_vector_range(xmm_dst.getIdx(), xmm_dst.getIdx() + 1); + } +} + +template +void JitReducePostKernel::wrap_load_vector(const Vmm& vmm_val, const element::Type& dst_dt, const element::Type& src_dt, size_t offset) { + if (jcp.fuse_low_precision) { + load_vector(vmm_val, ptr[reg_src + offset * sizeof(float)], dst_dt, src_dt); // TODO i64 
fusing + } else { + load_vector(vmm_val, ptr[reg_dst + offset * dst_dt.size()], dst_dt, src_dt); + } +} + +template +void JitReducePostKernel::wrap_load_scalar(const Xmm& xmm_val, const element::Type& dst_dt, const element::Type& src_dt, size_t offset) { + if (jcp.fuse_low_precision) { + load_scalar(xmm_val, ptr[reg_src + offset * sizeof(float)], dst_dt, src_dt); // TODO i64 fusing + } else { + load_scalar(xmm_val, ptr[reg_dst + offset * dst_dt.size()], dst_dt, src_dt); + } +} + + +template class JitReduceKernel; +template class JitReduceKernel; +template class JitReduceKernel; + +template class JitReducePostKernel; +template class JitReducePostKernel; +template class JitReducePostKernel; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.hpp new file mode 100644 index 00000000000000..0dc30f24cd1bbc --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/reduce.hpp @@ -0,0 +1,246 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_kernel_base.hpp" +#include +#include +#include +#include + +namespace ov { +namespace intel_cpu { +namespace kernel { + +enum ReduceLayoutType { + reduce_ncsp, + reduce_nspc, + reduce_blocked +}; + +struct JitReduceConfigParams { + ReduceLayoutType layout; + Algorithm reduce_mode; + bool fuse_low_precision; + element::Type src_el_type; + element::Type dst_el_type; +}; + +struct JitReduceCallArgs { + const void* src; + const void* idx; + void* dst; + size_t work_amount; + size_t work_batch; + size_t reduce_w = 2; // only used in planar layout [1: reduce width dimension] [0: reduce other dimension] [other value: N/A] + size_t reduce_stride; // only used in planar layout while reducing dimensions except for width + size_t can_divide; // if apply division in reduce_kernel [1: Yes] [0: No] + const void* divisor; // mean = sum / divisor +}; + +struct JitReducePostCallArgs { + const void *src; + 
void *dst; + size_t work_amount; + size_t reduce_c = 2; // only used in blocked layout [1: reduce channel dimension] [0: reduce other dimension] [other value: N/A] + size_t oc_off; // offset in byte along channel on output tensor + size_t channel_size; // only for post ops fusion of nspc layout + const void *divisor; // mean = sum / divisor + const void** post_op_data; +}; + + +template +class JitReduceKernelBase : public JitKernel { +public: + explicit JitReduceKernelBase(const char* name, const JitReduceConfigParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t isa); + + virtual ~JitReduceKernelBase() = default; + + const element::Type &get_exec_prc() const { + return exec_el_type; + } + +protected: + void horiz_ps(const Xbyak::Xmm& xmm, const Xbyak::Operand& op); + + template + void horiz_qq(const Xbyak::Xmm& xmm, const Xbyak::Operand& op); + + template + void horiz_reduce_store_ps(const Xbyak::Xmm& vmm_dst, const element::Type& dst_dt, bool load_embedded = false); + + template + void horiz_reduce_store_qq(const Xbyak::Xmm& vmm_dst, const element::Type& dst_dt, bool load_embedded = false); + + + RegistersPool::Reg reg_src; + RegistersPool::Reg reg_dst; + RegistersPool::Reg reg_work_amount; + + element::Type exec_el_type; + bool post_reduce = false; + bool post_ops_fusing = false; + bool planar_layout = false; + int loop_step = 1; + + std::shared_ptr max_emitter; + std::shared_ptr min_emitter; + std::shared_ptr mul_emitter; +}; + + +template +struct JitReduceKernel : public JitReduceKernelBase { + DECLARE_CPU_JIT_AUX_FUNCTIONS(JitReduceKernel) + + explicit JitReduceKernel(const JitReduceConfigParams &jcp); + + void generate() override; + +private: + using Vmm = typename dnnl::impl::utils::conditional3::type; + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + + Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; } + + const Xbyak::Reg64 reg_params = Xbyak::Reg64(dnnl::impl::cpu::x64::abi_param_regs[0]); + + RegistersPool::Reg 
reg_reduce_w; + RegistersPool::Reg reg_reduce_stride; + RegistersPool::Reg reg_work_batch; + RegistersPool::Reg reg_table; + + RegistersPool::Reg v_src; + RegistersPool::Reg v_dst; + RegistersPool::Reg v_zero; + RegistersPool::Reg v_dst_aux; + RegistersPool::Reg v_idx; + RegistersPool::Reg v_ones; + RegistersPool::Reg v_abs_mask; + + const Xbyak::Opmask &k_mask = k1; + + Xbyak::Label l_table; + + std::shared_ptr> exp_injector; + + void reduce_main(); + + void reduce_tail(); + + void init_reg_reduce_stride(); + + void reduce_kernel(); + + void reduce_once(); + + void reduce_batch(); + + void reduce_gather(const Vmm& vmm_dst, int64_t offset); + + void pack_gathered_vector(const Vmm& vmm_val, const Vmm& vmm_index, int64_t offset, const element::Type& src_dt); + + void reduce_kernel_tail(); + + void reduce_once_tail(); + + void reduce_batch_tail(); + + void reduce_main_loop(); + + void reduce_kernel(const Vmm& vmm_src, const Vmm& vmm_dst); + + void reduce_kernel_scalar(const Xbyak::Xmm& xmm_src, const Xbyak::Xmm& xmm_dst); + + void load_dst_vector(); + + void store_dst_vector(); + + void prepare_aux_table(); + + const struct aux_vals_type { + uint32_t float_one = 0x3f800000; // 1.0f + uint32_t float_abs = 0x7fffffff; // mask to make positive + uint32_t float_min = 0xff7fffff; // float lowest + uint32_t float_max = 0x7f7fffff; // float maximum + uint32_t float_int32_min = 0xcf000000; // -2^31 presented in float + uint32_t float_int32_max = 0x4effffff; // 2^31-1 presented in float + + uint64_t double_one = 0x3ff0000000000000; // 1.0 + uint64_t double_abs = 0x7fffffffffffffff; // mask to make positive + uint64_t double_min = 0xffefffffffffffff; // double lowest + uint64_t double_max = 0x7fefffffffffffff; // double maximum + uint64_t double_int64_min = 0xc3e0000000000000; // lowest int64 presented in double + uint64_t double_int64_max = 0x43dfffffffffffff; // max int64 presented in double + + uint64_t int64_one = 0x0000000000000001; // 1 + uint64_t int64_abs = 
0x7fffffffffffffff; // mask to make positive + // uint64_t int64_min = 0x0000000000000000; // lowest int64 + uint64_t int64_min = 0x8000000000000000; // lowest int64 + uint64_t int64_max = 0x7fffffffffffffff; // max int64 + } aux_vals; +}; + +template +struct JitReducePostKernel : public JitReduceKernelBase { + DECLARE_CPU_JIT_AUX_FUNCTIONS(JitReducePostKernel) + + explicit JitReducePostKernel(const JitReduceConfigParams& jcp, const dnnl_primitive_attr& attr); + + void generate() override; + +private: + const dnnl_primitive_attr &attr; + + using Vmm = typename dnnl::impl::utils::conditional3::type; + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + + const Xbyak::Reg64 reg_params = Xbyak::Reg64(dnnl::impl::cpu::x64::abi_param_regs[0]); + + RegistersPool::Reg reg_divisor; + RegistersPool::Reg reg_reduce_c; + RegistersPool::Reg reg_oc_off; + RegistersPool::Reg reg_d_weights; + RegistersPool::Reg reg_d_bias; + RegistersPool::Reg reg_post_ops_data; + + RegistersPool::Reg v_dst; + RegistersPool::Reg v_d_weights; + RegistersPool::Reg v_d_bias; + RegistersPool::Reg v_divisor; + + std::shared_ptr division_emitter; + std::shared_ptr sqrt_emitter; + std::shared_ptr> log_injector; + + std::vector>> eltwise_injectors; + std::vector>> depthwise_injectors; + std::vector>> quantization_injectors; + + void reduce_post_main(); + + void reduce_post_tail(); + + void apply_post_ops(const element::Type& dst_dt, bool is_broadcast); + + void reduce_map_kernel(const Vmm& vmm_dst); + + void reduce_map_kernel_scalar(const Xbyak::Xmm& xmm_dst); + + void wrap_load_vector(const Vmm& vmm_val, const element::Type& dst_dt, const element::Type& src_dt, size_t offset); + + void wrap_load_scalar(const Xbyak::Xmm& xmm_val, const element::Type& dst_dt, const element::Type& src_dt, size_t offset); + + void horiz_store(const Xbyak::Xmm& xmm_dst, const element::Type& dst_dt, bool load_embedded); +}; // JitReducePostKernel + +} // namespace kernel +} // namespace intel_cpu +} // namespace 
ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/registers_pool.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/registers_pool.hpp index 1c1b6218b87bd1..743ead12eb52d6 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/registers_pool.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/registers_pool.hpp @@ -4,11 +4,8 @@ #pragma once -#include "cpu/x64/jit_generator.hpp" -#include -#include "ie_common.h" +#include "cpu/x64/cpu_isa_traits.hpp" #include "utils/cpu_utils.hpp" -#include namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/mathematics.cpp b/src/plugins/intel_cpu/src/nodes/mathematics.cpp index 926e09fd9770d6..5f0002e734258e 100644 --- a/src/plugins/intel_cpu/src/nodes/mathematics.cpp +++ b/src/plugins/intel_cpu/src/nodes/mathematics.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include "ie_parallel.hpp" #include "mathematics.h" #include "utils/general_utils.h" @@ -18,16 +18,16 @@ namespace ov { namespace intel_cpu { namespace node { -bool Math::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Math::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (initializers.find(op->get_type_info()) == initializers.end()) { errorMessage = "Unsupported Math layer type."; return false; } - if (one_of(op->get_type_info(), ngraph::op::v0::HardSigmoid::get_type_info_static(), ngraph::op::v0::Selu::get_type_info_static())) { - auto firstConst = ngraph::as_type_ptr(op->get_input_node_shared_ptr(1)); - auto secondConst = ngraph::as_type_ptr(op->get_input_node_shared_ptr(2)); + if (one_of(op->get_type_info(), op::v0::HardSigmoid::get_type_info_static(), op::v0::Selu::get_type_info_static())) { + auto firstConst = ov::as_type_ptr(op->get_input_node_shared_ptr(1)); + auto secondConst = ov::as_type_ptr(op->get_input_node_shared_ptr(2)); if (!firstConst || !secondConst) { errorMessage = "Constant expected as the second and third 
inputs."; return false; @@ -39,7 +39,7 @@ bool Math::isSupportedOperation(const std::shared_ptr& op, s return true; } -Math::Math(const std::shared_ptr& op, const GraphContext::CPtr context) +Math::Math(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, PassThroughShapeInferFactory()), alpha(0.f), beta(0.f), @@ -201,66 +201,66 @@ bool Math::created() const { return getType() == Type::Math; } -std::map&, Math& node)>> Math::initializers { - {ngraph::op::v0::Abs::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { +std::map&, Math& node)>> Math::initializers { + {op::v0::Abs::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAbs; }}, - {ngraph::op::v0::Acos::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Acos::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAcos; }}, - {ngraph::op::v3::Acosh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v3::Acosh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAcosh; }}, - {ngraph::op::v0::Asin::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Asin::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAsin; }}, - {ngraph::op::v3::Asinh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v3::Asinh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAsinh; }}, - {ngraph::op::v0::Atan::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Atan::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAtan; }}, - {ngraph::op::v0::Ceiling::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Ceiling::get_type_info_static(), [](const 
std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathCeiling; }}, - {ngraph::op::v0::Cos::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Cos::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathCos; }}, - {ngraph::op::v0::Cosh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Cosh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathCosh; }}, - {ngraph::op::v0::Floor::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Floor::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathFloor; }}, - {ngraph::op::v0::HardSigmoid::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::HardSigmoid::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathHardSigmoid; - node.alpha = ngraph::as_type_ptr(op->get_input_node_shared_ptr(1))->cast_vector()[0]; - node.beta = ngraph::as_type_ptr(op->get_input_node_shared_ptr(2))->cast_vector()[0]; + node.alpha = ov::as_type_ptr(op->get_input_node_shared_ptr(1))->cast_vector()[0]; + node.beta = ov::as_type_ptr(op->get_input_node_shared_ptr(2))->cast_vector()[0]; }}, - {ngraph::op::v0::Negative::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Negative::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathNegative; }}, - {ngraph::op::v0::Selu::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Selu::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathSelu; - node.alpha = ngraph::as_type_ptr(op->get_input_node_shared_ptr(1))->cast_vector()[0]; - node.gamma = ngraph::as_type_ptr(op->get_input_node_shared_ptr(2))->cast_vector()[0]; + node.alpha = 
ov::as_type_ptr(op->get_input_node_shared_ptr(1))->cast_vector()[0]; + node.gamma = ov::as_type_ptr(op->get_input_node_shared_ptr(2))->cast_vector()[0]; }}, - {ngraph::op::v0::Sign::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Sign::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathSign; }}, - {ngraph::op::v0::Sin::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Sin::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathSin; }}, - {ngraph::op::v0::Sinh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Sinh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathSinh; }}, - {ngraph::op::v4::SoftPlus::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v4::SoftPlus::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathSoftPlus; }}, - {ngraph::op::v0::Tan::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v0::Tan::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathTan; }}, - {ngraph::op::v3::Atanh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { + {op::v3::Atanh::get_type_info_static(), [](const std::shared_ptr& op, Math& node) { node.algorithm = Algorithm::MathAtanh; }} }; diff --git a/src/plugins/intel_cpu/src/nodes/mathematics.h b/src/plugins/intel_cpu/src/nodes/mathematics.h index 88235fb54e8b78..e8289cfb1f2117 100644 --- a/src/plugins/intel_cpu/src/nodes/mathematics.h +++ b/src/plugins/intel_cpu/src/nodes/mathematics.h @@ -13,7 +13,7 @@ namespace node { class Math : public Node { public: - Math(const std::shared_ptr& op, const GraphContext::CPtr context); + Math(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override {}; 
void initSupportedPrimitiveDescriptors() override; @@ -23,10 +23,10 @@ class Math : public Node { bool needPrepareParams() const override { return false; }; void executeDynamicImpl(dnnl::stream strm) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - static std::map&, Math& node)>> initializers; + static std::map&, Math& node)>> initializers; float alpha = 0.0f; float beta = 0.0f; diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.cpp b/src/plugins/intel_cpu/src/nodes/non_zero.cpp index cbb0b134211359..1ff6a5ae02c012 100644 --- a/src/plugins/intel_cpu/src/nodes/non_zero.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_zero.cpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -20,9 +20,9 @@ namespace node { static constexpr int blockSize = dnnl::impl::cpu::platform::get_cache_line_size() * 2; static constexpr int elementsStride = blockSize / sizeof(int); -bool NonZero::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool NonZero::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (op->get_type_info() != ngraph::op::v3::NonZero::get_type_info_static()) { + if (op->get_type_info() != op::v3::NonZero::get_type_info_static()) { errorMessage = "Node is not an instance of NonZero from the operation set v3."; return false; } @@ -32,38 +32,37 @@ bool NonZero::isSupportedOperation(const std::shared_ptr& op return true; } -NonZero::NonZero(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, InternalDynShapeInferFactory()) { +NonZero::NonZero(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, InternalDynShapeInferFactory()) { std::string errorMessage; - if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "NonZero layer with name 
'" + getName() + "' "; - } else { + if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - if (op->get_output_element_type(0) != ngraph::element::i32) { - IE_THROW() << errorPrefix << "doesn't support demanded output precision"; - } } void NonZero::getSupportedDescriptors() { if (getParentEdges().size() != 1) - IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); + THROW_CPU_NODE_ERR << "has incorrect number of input edges: " << getParentEdges().size(); if (!getChildEdges().size()) - IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getChildEdges().size(); + THROW_CPU_NODE_ERR << "has incorrect number of output edges: " << getChildEdges().size(); } void NonZero::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const auto &inPrc = getOriginalInputPrecisionAtPort(0); - if (!one_of(inPrc, Precision::FP32, Precision::BF16, Precision::I32, Precision::U32, Precision::I8, Precision::U8)) { + const auto inPrc = getOriginalInputPrecisionAtPort(0); + if (!one_of(inPrc, Precision::FP32, Precision::BF16, Precision::I64, Precision::I32, Precision::U32, Precision::I8, Precision::U8)) { IE_THROW() << "Can't create primitive descriptor for NonZero layer with name: " << getName() << " doesn't support " << inPrc.name() << " precision on 0 port"; } + auto outPrc = getOriginalOutputPrecisionAtPort(0); + if (!one_of(outPrc, /*Precision::I64,*/ Precision::I32)) { + outPrc = Precision::I32; + } addSupportedPrimDesc({{LayoutType::ncsp}}, - {{LayoutType::ncsp, Precision::I32}}, + {{LayoutType::ncsp, outPrc}}, impl_desc_type::ref); } @@ -123,7 +122,8 @@ void NonZero::execute(dnnl::stream strm) { OV_SWITCH(intel_cpu, NonZeroExecute, ctx, inputPrec, OV_CASE(Precision::FP32, float), OV_CASE(Precision::BF16, bfloat16_t), - OV_CASE(Precision::I32, int), + OV_CASE(Precision::I64, int64_t), + OV_CASE(Precision::I32, int32_t), OV_CASE(Precision::U32, 
uint32_t), OV_CASE(Precision::I8, int8_t), OV_CASE(Precision::U8, uint8_t)) diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.h b/src/plugins/intel_cpu/src/nodes/non_zero.h index 57f2683cd56eff..0e9d1fb3255703 100644 --- a/src/plugins/intel_cpu/src/nodes/non_zero.h +++ b/src/plugins/intel_cpu/src/nodes/non_zero.h @@ -18,7 +18,7 @@ namespace node { class NonZero : public Node { public: - NonZero(const std::shared_ptr& op, const GraphContext::CPtr context); + NonZero(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -27,13 +27,12 @@ class NonZero : public Node { bool needShapeInfer() const override {return false;}; bool needPrepareParams() const override {return false;}; void executeDynamicImpl(dnnl::stream strm) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; bool isExecutable() const override { return true; } private: int threadsCount = 1; - std::string errorPrefix; template void executeSpecified(); template diff --git a/src/plugins/intel_cpu/src/nodes/one_hot.cpp b/src/plugins/intel_cpu/src/nodes/one_hot.cpp index 5eefbf0131324c..2cfd84a6134cb7 100644 --- a/src/plugins/intel_cpu/src/nodes/one_hot.cpp +++ b/src/plugins/intel_cpu/src/nodes/one_hot.cpp @@ -2,18 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include +#include "one_hot.h" + #include "ie_parallel.hpp" #include -#include "one_hot.h" -#include -#include -#include -#include -#include -#include "common/cpu_memcpy.h" +#include + +#include +#include using namespace InferenceEngine; @@ -51,9 +47,9 @@ class OneHotShapeInfer : public ShapeInferEmptyPads { class OneHotShapeInferFactory : public ShapeInferFactory { public: - OneHotShapeInferFactory(std::shared_ptr op) : m_op(op) {} + OneHotShapeInferFactory(const 
std::shared_ptr &op) : m_op(op) {} ShapeInferPtr makeShapeInfer() const override { - auto oneHot = ov::as_type_ptr(m_op); + auto oneHot = ov::as_type_ptr(m_op); if (!oneHot) { IE_THROW() << "Unexpected op type in OneHot shape inference factory: " << m_op->get_type_name(); } @@ -73,18 +69,17 @@ class OneHotShapeInferFactory : public ShapeInferFactory { } // namespace -bool OneHot::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool OneHot::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto oneHot = std::dynamic_pointer_cast(op); - if (!oneHot) { + if (op->get_type_info() != ov::opset1::OneHot::get_type_info_static()) { errorMessage = "Only opset1 OneHot operation is supported"; return false; } - if (std::dynamic_pointer_cast(oneHot->get_input_node_shared_ptr(ON_VALUE_ID)) == nullptr) { + if (std::dynamic_pointer_cast(op->get_input_node_shared_ptr(ON_VALUE_ID)) == nullptr) { errorMessage = "Only const 'on_value' input is supported"; return false; } - if (std::dynamic_pointer_cast(oneHot->get_input_node_shared_ptr(OFF_VALUEAXES_ID)) == nullptr) { + if (std::dynamic_pointer_cast(op->get_input_node_shared_ptr(OFF_VALUEAXES_ID)) == nullptr) { errorMessage = "Only const 'off_value' input is supported"; return false; } @@ -94,27 +89,26 @@ bool OneHot::isSupportedOperation(const std::shared_ptr& op, return true; } -OneHot::OneHot(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, OneHotShapeInferFactory(op)) { +OneHot::OneHot(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, OneHotShapeInferFactory(op)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - errorPrefix = "OneHot layer with name '" + op->get_friendly_name() + "'"; - const auto oneHot = std::dynamic_pointer_cast(op); - const auto depthNode = 
std::dynamic_pointer_cast(oneHot->get_input_node_shared_ptr(DEPTH_ID)); + const auto oneHot = std::dynamic_pointer_cast(op); + const auto depthNode = std::dynamic_pointer_cast(oneHot->get_input_node_shared_ptr(DEPTH_ID)); if (depthNode) { depth = depthNode->cast_vector()[0]; } axis = oneHot->get_axis(); VectorDims srcDims = getInputShapeAtPort(INDICES_ID).getDims(); - if (ngraph::is_scalar(srcDims)) { + if (ov::is_scalar(srcDims)) { srcDims = SizeVector{1}; } VectorDims dstDims = getOutputShapeAtPort(0).getDims(); - if (ngraph::is_scalar(dstDims)) { + if (ov::is_scalar(dstDims)) { dstDims = SizeVector{1}; } @@ -123,12 +117,12 @@ OneHot::OneHot(const std::shared_ptr& op, const GraphContext::CPtr axis += output_dims_size; } if (axis < 0 || axis >= output_dims_size) { - IE_THROW() << errorPrefix << " has unsupported 'axis' attribute: " << oneHot->get_axis(); + THROW_CPU_NODE_ERR << " has unsupported 'axis' attribute: " << oneHot->get_axis(); } if (!(((1 + srcDims.size()) == dstDims.size()) || (depthNode && (srcDims.size() == 1 && dstDims.size() == 1 && dstDims[0] == depth && srcDims[0] == 1)))) - IE_THROW() << errorPrefix << " has incorrect number of input/output dimensions!"; + THROW_CPU_NODE_ERR << " has incorrect number of input/output dimensions!"; } bool OneHot::needShapeInfer() const { @@ -146,23 +140,22 @@ void OneHot::initSupportedPrimitiveDescriptors() { return; // check a precision of the input tensor - auto input_precision = getOriginalInputPrecisionAtPort(INDICES_ID); - if (input_precision != Precision::I32) { - IE_THROW() << errorPrefix << " has incorrect input precision for the input. Only I32 is supported!"; + inputPrecision = getOriginalInputPrecisionAtPort(INDICES_ID); + if (!one_of(inputPrecision, Precision::I32, Precision::I64)) { + THROW_CPU_NODE_ERR << " has incorrect input precision for the input. 
Only I32 and I64 are supported!"; } - output_precision = getOriginalOutputPrecisionAtPort(0); + outputPrecision = getOriginalOutputPrecisionAtPort(0); - addSupportedPrimDesc({{LayoutType::ncsp, input_precision}, - {LayoutType::ncsp, input_precision}, - {LayoutType::ncsp, output_precision}, - {LayoutType::ncsp, output_precision}}, - {{LayoutType::ncsp, output_precision}}, + addSupportedPrimDesc({{LayoutType::ncsp, inputPrecision}, + {LayoutType::ncsp, inputPrecision}, + {LayoutType::ncsp, outputPrecision}, + {LayoutType::ncsp, outputPrecision}}, + {{LayoutType::ncsp, outputPrecision}}, impl_desc_type::ref_any); } template void OneHot::one_hot(size_t prefix_size, size_t suffix_size) { - const auto *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->getData()); auto *dst_data = reinterpret_cast(getChildEdgeAt(0)->getMemoryPtr()->getData()); const out_type on_value = reinterpret_cast(getParentEdgeAt(2)->getMemoryPtr()->getData())[0]; @@ -174,16 +167,31 @@ void OneHot::one_hot(size_t prefix_size, size_t suffix_size) { // set on_value at needed locations auto on_val = on_value; - parallel_for(prefix_size, [&](std::size_t prefix_idx) { - const in_type* src_dataPtr = &src_data[prefix_idx * suffix_size]; - out_type* dst_dataPtr = &dst_data[prefix_idx * depth * suffix_size]; - for (std::size_t suffix_idx = 0; suffix_idx < suffix_size; ++suffix_idx, ++src_dataPtr, ++dst_dataPtr) { - auto v = static_cast(*src_dataPtr); - if (v < depth) { - dst_dataPtr[v * suffix_size] = on_val; + if (inputPrecision == Precision::I64) { + const auto *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->getData()); + parallel_for(prefix_size, [&](std::size_t prefix_idx) { + auto src_dataPtr = &src_data[prefix_idx * suffix_size]; + out_type *dst_dataPtr = &dst_data[prefix_idx * depth * suffix_size]; + for (std::size_t suffix_idx = 0; suffix_idx < suffix_size; ++suffix_idx, ++src_dataPtr, ++dst_dataPtr) { + auto v = static_cast(*src_dataPtr); + if (v < depth) { + 
dst_dataPtr[v * suffix_size] = on_val; + } } - } - }); + }); + } else { + const auto *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->getData()); + parallel_for(prefix_size, [&](std::size_t prefix_idx) { + auto src_dataPtr = &src_data[prefix_idx * suffix_size]; + out_type *dst_dataPtr = &dst_data[prefix_idx * depth * suffix_size]; + for (std::size_t suffix_idx = 0; suffix_idx < suffix_size; ++suffix_idx, ++src_dataPtr, ++dst_dataPtr) { + auto v = static_cast(*src_dataPtr); + if (v < depth) { + dst_dataPtr[v * suffix_size] = on_val; + } + } + }); + } } void OneHot::executeDynamicImpl(dnnl::stream strm) { @@ -201,7 +209,8 @@ void OneHot::execute(dnnl::stream strm) { std::size_t suffix_size = getParentEdgeAt(0)->getMemory().getShape().getElementsCount() / prefix_size; OneHotContext ctx = {this, prefix_size, suffix_size}; - OV_SWITCH(intel_cpu, OneHotExecute, ctx, output_precision.size(), + OV_SWITCH(intel_cpu, OneHotExecute, ctx, outputPrecision.size(), + OV_CASE(sizeof(uint64_t), uint64_t), OV_CASE(sizeof(uint32_t), uint32_t), OV_CASE(sizeof(uint16_t), uint16_t), OV_CASE(sizeof(uint8_t), uint8_t)) diff --git a/src/plugins/intel_cpu/src/nodes/one_hot.h b/src/plugins/intel_cpu/src/nodes/one_hot.h index 9db0a066c76f8c..731b9c7da0ac28 100644 --- a/src/plugins/intel_cpu/src/nodes/one_hot.h +++ b/src/plugins/intel_cpu/src/nodes/one_hot.h @@ -4,12 +4,11 @@ #pragma once -#include #include -#include + #include +#include #include -#include namespace ov { namespace intel_cpu { @@ -17,7 +16,7 @@ namespace node { class OneHot : public Node { public: - OneHot(const std::shared_ptr& op, const GraphContext::CPtr context); + OneHot(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; @@ -29,11 +28,9 @@ class OneHot : public Node { bool needPrepareParams() const override { return false; }; void executeDynamicImpl(dnnl::stream strm) override; - static bool 
isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - typedef InferenceEngine::PrecisionTrait::value_type in_type; - struct OneHotContext { OneHot* nodePtr; size_t prefix_size; @@ -50,9 +47,8 @@ class OneHot : public Node { mutable Dim depth = Shape::UNDEFINED_DIM; int32_t axis = -1; - InferenceEngine::Precision output_precision; - - std::string errorPrefix; + InferenceEngine::Precision inputPrecision; + InferenceEngine::Precision outputPrecision; static const size_t INDICES_ID = 0; static const size_t DEPTH_ID = 1; diff --git a/src/plugins/intel_cpu/src/nodes/pooling.cpp b/src/plugins/intel_cpu/src/nodes/pooling.cpp index 1bd288697bef34..e98f1e2741454e 100644 --- a/src/plugins/intel_cpu/src/nodes/pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/pooling.cpp @@ -11,13 +11,13 @@ #include #include #include -#include #include -#include #include #include "memory_desc/dnnl_blocked_memory_desc.h" #include "nodes/node_config.h" #include +#include +#include // to access and change C pooling primitive desc internal padding field #include diff --git a/src/plugins/intel_cpu/src/nodes/range.cpp b/src/plugins/intel_cpu/src/nodes/range.cpp index c7b47e55449a21..35ab3867cd17dd 100644 --- a/src/plugins/intel_cpu/src/nodes/range.cpp +++ b/src/plugins/intel_cpu/src/nodes/range.cpp @@ -3,7 +3,7 @@ // #include -#include +#include #include "ie_parallel.hpp" #include "range.h" #include @@ -15,9 +15,9 @@ namespace ov { namespace intel_cpu { namespace node { -bool Range::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Range::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), ngraph::op::v0::Range::get_type_info_static(), ngraph::op::v4::Range::get_type_info_static())) { + if (!one_of(op->get_type_info(), 
ov::op::v0::Range::get_type_info_static(), ov::op::v4::Range::get_type_info_static())) { errorMessage = "Only opset1 and opset4 Range operation is supported"; return false; } @@ -27,7 +27,7 @@ bool Range::isSupportedOperation(const std::shared_ptr& op, return true; } -Range::Range(const std::shared_ptr& op, const GraphContext::CPtr context) +Range::Range(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, InternalDynShapeInferFactory()) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { @@ -40,15 +40,15 @@ Range::Range(const std::shared_ptr& op, const GraphContext::CPtr c IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; SizeVector start_dims = op->get_input_shape(RANGE_START); - if (ngraph::shape_size(start_dims) != 1) + if (ov::shape_size(start_dims) != 1) IE_THROW() << errorPrefix << " has start scalar with more than 1 value"; SizeVector limit_dims = op->get_input_shape(RANGE_LIMIT); - if (ngraph::shape_size(limit_dims) != 1) + if (ov::shape_size(limit_dims) != 1) IE_THROW() << errorPrefix << " has limit scalar with more than 1 value"; SizeVector delta_dims = op->get_input_shape(RANGE_DELTA); - if (ngraph::shape_size(delta_dims) != 1) + if (ov::shape_size(delta_dims) != 1) IE_THROW() << errorPrefix << " has delta scalar with more than 1 value"; size_t dstRank = op->get_output_partial_shape(0).size(); diff --git a/src/plugins/intel_cpu/src/nodes/range.h b/src/plugins/intel_cpu/src/nodes/range.h index e0b424e0e06ae9..4cefbe04811e22 100644 --- a/src/plugins/intel_cpu/src/nodes/range.h +++ b/src/plugins/intel_cpu/src/nodes/range.h @@ -13,7 +13,7 @@ namespace node { class Range : public Node { public: - Range(const std::shared_ptr& op, const GraphContext::CPtr context); + Range(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp 
b/src/plugins/intel_cpu/src/nodes/reduce.cpp index bb3992f98cb38e..2026b074cd0883 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -4,33 +4,18 @@ #include "reduce.h" -#include "fake_quantize.h" #include "eltwise.h" -#include -#include -#include -#include -#include -#include "utils/bfloat16.hpp" -#include "emitters/x64/jit_bf16_emitters.hpp" +#include "fake_quantize.h" #include "ie_parallel.hpp" -#include - -#include -#include -#include -#include -#include -#include -#include +#include "utils/bfloat16.hpp" +#include + #include -using namespace dnnl; +using namespace ov::intel_cpu::node; +using namespace ov::intel_cpu::kernel; using namespace InferenceEngine; -using namespace dnnl::impl; -using namespace dnnl::impl::cpu::x64; -using namespace dnnl::impl::utils; -using namespace Xbyak; +using namespace dnnl::impl::cpu; #define SET_SRC_DIM_VALUE(batch, channel, depth, height, width) IB = batch; \ IC = channel; \ @@ -43,8 +28,8 @@ using namespace Xbyak; OH = height; \ OW = width; -#define GET_OFF(field) offsetof(jit_reduce_call_args, field) -#define GET_OFF_POST(field) offsetof(jit_reduce_post_call_args, field) +#define GET_OFF(field) offsetof(JitReduceCallArgs, field) +#define GET_OFF_POST(field) offsetof(JitReducePostCallArgs, field) #define GET_PTR_N_PLN const uint8_t *in_ptr_n = in_ptr + src_data_size * ib * IC * ID * IH * IW; \ uint8_t *out_ptr_n = out_ptr + dst_data_size * ob * OC * OD * OH * OW; @@ -69,13 +54,10 @@ using namespace Xbyak; #define GET_PTR_NCD_BASE_PTR_N_BLK const uint8_t *in_ptr_ncd = in_ptr_n + src_data_size * (icb * ID + id) * IH * IW * blk_size; \ uint8_t *out_ptr_ncd = out_ptr_n + dst_data_size * (ocb * OD + od) * OH * OW * blk_size; -namespace ov { -namespace intel_cpu { -namespace node { namespace { struct ReduceKey { - jit_reduce_config_params jcp; + JitReduceConfigParams jcp; dnnl::post_ops postOps; size_t hash() const; @@ -90,8 +72,8 @@ size_t ReduceKey::hash() const { seed = 
hash_combine(seed, jcp.layout); seed = hash_combine(seed, jcp.reduce_mode); seed = hash_combine(seed, jcp.fuse_low_precision); - seed = hash_combine(seed, jcp.src_dt); - seed = hash_combine(seed, jcp.dst_dt); + seed = hash_combine(seed, (ov::element::Type_t)jcp.src_el_type); + seed = hash_combine(seed, (ov::element::Type_t)jcp.dst_el_type); seed = get_post_op_hash(seed, *postOps.get()); return seed; @@ -100,1768 +82,129 @@ size_t ReduceKey::hash() const { bool ReduceKey::operator==(const ReduceKey &rhs) const { return jcp.layout == rhs.jcp.layout && jcp.reduce_mode == rhs.jcp.reduce_mode && jcp.fuse_low_precision == rhs.jcp.fuse_low_precision && - jcp.src_dt == rhs.jcp.src_dt && jcp.dst_dt == rhs.jcp.dst_dt && *postOps.get() == *rhs.postOps.get(); -} -} // namespace - -#if defined(OPENVINO_ARCH_X86_64) - -// some utility functions -static inline bool isFloatCompatible(memory::data_type type) { - return memory::data_type::f32 == type || memory::data_type::bf16 == type; + jcp.src_el_type == rhs.jcp.src_el_type && jcp.dst_el_type == rhs.jcp.dst_el_type && *postOps.get() == *rhs.postOps.get(); } -template -struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32) - - explicit jit_uni_reduce_kernel_f32(jit_reduce_config_params jcp) - : jit_uni_reduce_kernel(jcp), jit_generator(jit_name()) {} - - void create_ker() override { - jit_generator::create_kernel(); - ker_ = (decltype(ker_))jit_ker(); - } - - void generate() override { - if (jcp_.reduce_mode == Algorithm::ReduceLogSumExp) { - exp_injector = std::make_shared>(this, alg_kind::eltwise_exp, 0.f, 0.f, 1.f); - } - - if (mayiuse(avx512_core)) - uni_vcvtneps2bf16 = std::make_shared(this, isa); - - this->preamble(); - - planar_layout = jcp_.layout == ReduceLayoutType::reduce_ncsp || jcp_.layout == ReduceLayoutType::reduce_nspc; - - mov(reg_src, ptr[reg_params + GET_OFF(src)]); - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - 
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - mov(reg_work_batch, ptr[reg_params + GET_OFF(work_batch)]); - if (planar_layout) - mov(reg_reduce_w, ptr[reg_params + GET_OFF(reduce_w)]); - - if (jcp_.reduce_mode == Algorithm::ReduceAnd || jcp_.reduce_mode == Algorithm::ReduceL1 || jcp_.reduce_mode == Algorithm::ReduceMax || - jcp_.reduce_mode == Algorithm::ReduceMin || jcp_.reduce_mode == Algorithm::ReduceProd || jcp_.reduce_mode == Algorithm::ReduceOr) { - mov(reg_table, l_table); - } - - if (isa == cpu::x64::avx512_core || jcp_.reduce_mode == Algorithm::ReduceAnd || jcp_.reduce_mode == Algorithm::ReduceOr) - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - - if ((isa == cpu::x64::avx512_core && jcp_.reduce_mode == Algorithm::ReduceAnd) || jcp_.reduce_mode == Algorithm::ReduceOr) { - uni_vmovups(vmm_aux, table_val(0)); - } - - reduce_main(); - reduce_tail(); - - this->postamble(); - - if (mayiuse(avx512_core)) - uni_vcvtneps2bf16->emit_data(); - - if (jcp_.reduce_mode == Algorithm::ReduceAnd || jcp_.reduce_mode == Algorithm::ReduceL1 || jcp_.reduce_mode == Algorithm::ReduceMax || - jcp_.reduce_mode == Algorithm::ReduceMin || jcp_.reduce_mode == Algorithm::ReduceProd || jcp_.reduce_mode == Algorithm::ReduceOr) { - prepare_aux_table(); - } else if (jcp_.reduce_mode == Algorithm::ReduceLogSumExp) { - exp_injector->prepare_table(); - } - } - -private: - using Vmm = typename conditional3::type; - size_t vlen = cpu_isa_traits::vlen; - bool planar_layout = false; - - Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; } - - Xbyak::Reg64 reg_src = r8; - Xbyak::Reg64 reg_dst = r9; - Xbyak::Reg64 reg_idx = rdx; - Xbyak::Reg64 reg_work_amount = r10; - Xbyak::Reg64 reg_reduce_w = r11; - Xbyak::Reg64 reg_reduce_stride = r12; - Xbyak::Reg64 reg_work_batch = r13; - Xbyak::Reg64 reg_table = r14; - Xbyak::Reg64 reg_params = abi_param1; - - Xbyak::Reg8 reg_tmp_8 = r15b; - Xbyak::Reg32 reg_tmp_32 = r15d; - Xbyak::Reg64 reg_tmp_64 = r15; - - 
Xbyak::Reg64 reg_src_aux = rax; - Xbyak::Reg64 reg_work_batch_aux = rbx; - Xbyak::Reg64 reg_can_divide = rbp; - Xbyak::Reg64 reg_divisor = reg_can_divide; - - Vmm vmm_aux = Vmm(0); - Xmm xmm_aux = Xmm(0); - Vmm vmm_src = Vmm(1); - Xmm xmm_src = Xmm(1); - Vmm vmm_dst = Vmm(2); - Xmm xmm_dst = Xmm(2); - Vmm vmm_zero = Vmm(3); - Xmm xmm_zero = Xmm(3); - Vmm vmm_dst_aux = Vmm(4); - Xmm xmm_aux1 = Xmm(5); - Xmm xmm_aux2 = Xmm(6); - Xmm xmm_aux3 = Xmm(7); - Vmm vmm_idx = Vmm(8); - Vmm vmm_mask = Vmm(9); - - const Xbyak::Opmask k_mask = Xbyak::Opmask(1); - - Xbyak::Label l_table; - - std::shared_ptr uni_vcvtneps2bf16; - std::shared_ptr> exp_injector; - - inline void reduce_main() { - // ================================================================ - // ***isa: AVX512*** - // ReduceAnd (Logical And) - // step 1: init dst 0x3f800000 (1.0f) - // aux 0x3f800000 (1.0f) - // zero 0x00000000 (0.0f) - // step 2: if src equals 0, set mask bit 0, else set mask bit 1 - // step 3: src = mask bit == 0 ? zero : aux - // step 4: dst = dst & src - // src mask_bit new_src dst new_dst - // case 1 ~0 1 1.0f 1.0f 1.0f - // case 2 0 0 0.0f 1.0f 0.0f - // case 3 ~0 1 1.0f 0.0f 0.0f - // case 4 0 0 0.0f 0.0f 0.0f - // step 5: loop: offset src, and do step 2 and step 3 - // - // ReduceOr (Logical Or) - // step 1: init dst 0x00000000 (0.0f) - // aux 0x3f800000 (1.0f) - // zero 0x00000000 (0.0f) - // step 2: if src equals 0, set mask bit 0, else set mask bit 1 - // step 3: src = mask bit == 0 ? 
zero : aux - // step 4: dst = dst | src - // src mask_bit new_src dst new_dst - // case 1 0 0 0.0f 0.0f 0.0f - // case 2 ~0 1 1.0f 0.0f 1.0f - // case 3 0 0 0.0f 1.0f 1.0f - // case 4 ~0 1 1.0f 1.0f 1.0f - // step 5: loop: offset src, and do step 2 and step 3 - // ================================================================ - // ***isa: OTHER*** - // ReduceAnd (Logical And) - // step 1: init dst 0x3f800000 (1.0f) - // step 2: if src equals 0, set it 0x00000000, else set 0xffffffff - // step 3: dst = dst & src - // 0x3f800000 = 0x3f800000 & 0xffffffff (result: 1.0f) - // 0x00000000 = 0x3f800000 & 0x00000000 (result: 0.0f) - // 0x00000000 = 0x00000000 & 0xffffffff (result: 0.0f) - // 0x00000000 = 0x00000000 & 0x00000000 (result: 0.0f) - // step 4: loop: offset src, and do step 2 and step 3 - // - // ReduceOr (Logical Or) - // step 1: init dst 0x00000000 (0.0f) - // aux 0x3f800000 (1.0f) - // step 2: dst = dst | src - // 0x00000000 = 0x00000000 | 0x00000000 - // A = 0x00000000 | A - // A = A | 0x00000000 - // C = A | B - // (A, B stand for number other than 0x00000000) - // step 3: loop: offset src, and do step 2 - // step 4: if dst equals 0, set it 0x00000000, else set 0xffffffff - // step 5: dst = dst & aux - // 0x00000000 = 0x00000000 & 0x3f800000 (result: 0.0f) - // 0x3f800000 = 0xffffffff & 0x3f800000 (result: 1.0f) - // ================================================================ - Xbyak::Label reduce_to_vector_label; - Xbyak::Label reduce_to_scalar_label; - Xbyak::Label reduce_to_gather_label; - Xbyak::Label reduce_main_end_label; - if (planar_layout) { - cmp(reg_work_batch, 0); - je(reduce_to_gather_label, T_NEAR); - - cmp(reg_reduce_w, 1); // planar layout reducing W - je(reduce_to_scalar_label, T_NEAR); - } - - // store vmm_dst directly into memory after reducing - // cases: [planar layout reducing other dimensions but W] [blocked layout] - L(reduce_to_vector_label); - { - int step = vlen / sizeof(float) < 8 ? 
8 : vlen / sizeof(float); - cmp(reg_work_amount, step); - jl(reduce_main_end_label, T_NEAR); //avoid illegal loading and storing - - if (jcp_.reduce_mode == Algorithm::ReduceL1) { - uni_vmovups(vmm_aux, table_val(1)); - } - - // load - load_dst_vector(); - - // reduce - reduce_kernel(); - - if (jcp_.reduce_mode == Algorithm::ReduceMean) { - Xbyak::Label reduce_divide_end_label; - mov(reg_can_divide, ptr[reg_params + GET_OFF(can_divide)]); - cmp(reg_can_divide, 0); - je(reduce_divide_end_label, T_NEAR); - { - mov(reg_divisor, ptr[reg_params + GET_OFF(divisor)]); - uni_vbroadcastss(vmm_aux, ptr[reg_divisor]); - uni_vdivps(vmm_dst, vmm_dst, vmm_aux); - if (isa == cpu::x64::sse41) { - uni_vdivps(vmm_dst_aux, vmm_dst_aux, vmm_aux); - } - } - L(reduce_divide_end_label); - } - - // store - store_dst_vector(); - - jmp(reduce_main_end_label, T_NEAR); - } - - // reduce vector in vmm_dst to be a scalar before store into memory - // cases: [planar layout reducing W] - L(reduce_to_scalar_label); - { - // init dst, dst loading is embedded in horiz_reduce_store - switch (jcp_.reduce_mode) { - case Algorithm::ReduceAnd: - case Algorithm::ReduceProd: - uni_vmovups(vmm_dst, table_val(0)); - break; - case Algorithm::ReduceL1: - uni_vmovups(vmm_aux, table_val(1)); - uni_vpxor(vmm_dst, vmm_dst, vmm_dst); - break; - case Algorithm::ReduceL2: - case Algorithm::ReduceLogSum: - case Algorithm::ReduceLogSumExp: - case Algorithm::ReduceMean: - case Algorithm::ReduceOr: - case Algorithm::ReduceSum: - case Algorithm::ReduceSumSquare: - uni_vpxor(vmm_dst, vmm_dst, vmm_dst); - break; - case Algorithm::ReduceMax: - if (isFloatCompatible(jcp_.dst_dt)) - uni_vmovups(vmm_dst, table_val(2)); - else - uni_vmovups(vmm_dst, table_val(4)); - break; - case Algorithm::ReduceMin: - if (isFloatCompatible(jcp_.dst_dt)) - uni_vmovups(vmm_dst, table_val(3)); - else - uni_vmovups(vmm_dst, table_val(5)); - break; - default: - assert(!"unsupported reduce mode"); - } - // reduce - reduce_main_loop(); - if 
(jcp_.reduce_mode == Algorithm::ReduceOr && isa != cpu::x64::avx512_core) { - uni_cmpneqps(vmm_dst, vmm_dst, vmm_zero); - uni_vandps(vmm_dst, vmm_dst, vmm_aux); - } - // store - // store after horizontal calculation and calculation with loaded original ptr[reg_dst] - horiz_reduce_store(vmm_dst, jcp_.dst_dt, true); - - jmp(reduce_main_end_label, T_NEAR); - } - - // load vmm_src with gather, then store vmm_dst directly into memory after reducing - // cases: [planar layout reducing small W] - L(reduce_to_gather_label); - { - int step = 1; - cmp(reg_work_amount, step); - jl(reduce_main_end_label, T_NEAR); //avoid illegal loading and storing - - mov(reg_idx, ptr[reg_params + GET_OFF(idx)]); - uni_vmovdqu(vmm_idx, ptr[reg_idx]); - - if (jcp_.reduce_mode == Algorithm::ReduceL1) { - uni_vmovups(vmm_aux, table_val(1)); - } - - // load - load_dst_vector(); - - // reduce - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - reduce_gather(vmm_dst, 0); - if (isa == cpu::x64::sse41) { - reduce_gather(vmm_dst_aux, 4 * jcp_.src_data_size); - } - - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - - // store - store_dst_vector(); - - jmp(reduce_main_end_label, T_NEAR); - } - - L(reduce_main_end_label); - } - - inline void reduce_tail() { - if (jcp_.reduce_mode == Algorithm::ReduceL1) { - uni_vmovups(xmm_aux, table_val(1)); - } - - Xbyak::Label tail_dst_shifted_label; - Xbyak::Label tail_dst_fixed_label; - Xbyak::Label reduce_tail_end_label; - if (planar_layout) { - cmp(reg_reduce_w, 1); // planar layout reducing W - je(tail_dst_fixed_label, T_NEAR); - } - - // each src scalar reduce to each dst scalar (X1, X2, X3, ...) -> (Y1, Y2, Y3, ...) 
- // cases: [planar layout reducing other dimensions but W] [blocked layout concern padding] - L(tail_dst_shifted_label); - { - reduce_kernel_tail(); - - jmp(reduce_tail_end_label, T_NEAR); - } - - // each src scalar reduce to the same dst scalar (X1, X2, X3, ...) -> (Y1) - // cases: [planar layout reducing W] - L(tail_dst_fixed_label); - { - // load - load_scalar(xmm_dst, ptr[reg_dst], jcp_.dst_dt); - - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - // reduce - int step = 1; - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - load_scalar(xmm_src, ptr[reg_src], jcp_.src_dt); - - reduce_kernel_scalar(xmm_src, xmm_dst); - if (jcp_.reduce_mode == Algorithm::ReduceOr) { - uni_cmpneqps(xmm_dst, xmm_dst, xmm_zero); - uni_vandps(xmm_dst, xmm_dst, xmm_aux); - } - - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - - // store - store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); - } - - L(reduce_tail_end_label); - } - - inline void init_reg_reduce_stride() { - mov(reg_reduce_stride, ptr[reg_params + GET_OFF(reduce_stride)]); - mul_by_const(reg_reduce_stride, reg_tmp_64, jcp_.src_data_size); - } - - inline void reduce_kernel() { - Xbyak::Label reduce_label; - Xbyak::Label reduce_end_label; - Xbyak::Label reduce_batch_label; - Xbyak::Label reduce_batch_end_label; - - int step = vlen / sizeof(float) < 8 ? 
8 : vlen / sizeof(float); - cmp(reg_work_batch, 1); - je(reduce_label, T_NEAR); - - init_reg_reduce_stride(); - - L(reduce_batch_label); - { - cmp(reg_work_amount, step); - jl(reduce_end_label, T_NEAR); - - reduce_batch(); - - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - jmp(reduce_batch_label, T_NEAR); - } - L(reduce_batch_end_label); - - L(reduce_label); - { - cmp(reg_work_amount, step); - jl(reduce_end_label, T_NEAR); - - reduce_once(); - - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - jmp(reduce_label, T_NEAR); - } - L(reduce_end_label); - } - - inline void reduce_once() { - load_vector(vmm_src, ptr[reg_src], jcp_.src_dt); - reduce_kernel(vmm_src, vmm_dst); - - if (isa == cpu::x64::sse41) { - load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt); - reduce_kernel(vmm_src, vmm_dst_aux); - } - } - - inline void reduce_batch() { - mov(reg_src_aux, reg_src); - mov(reg_work_batch_aux, reg_work_batch); - - Xbyak::Label reduce_batch_loop_label; - Xbyak::Label reduce_batch_loop_end_label; - L(reduce_batch_loop_label); - { - cmp(reg_work_batch_aux, 1); - jl(reduce_batch_loop_end_label, T_NEAR); - - load_vector(vmm_src, ptr[reg_src_aux], jcp_.src_dt); - reduce_kernel(vmm_src, vmm_dst); - if (isa == cpu::x64::sse41) { - load_vector(vmm_src, ptr[reg_src_aux + 4 * jcp_.src_data_size], jcp_.src_dt); - reduce_kernel(vmm_src, vmm_dst_aux); - } - - add(reg_src_aux, reg_reduce_stride); - sub(reg_work_batch_aux, 1); - jmp(reduce_batch_loop_label, T_NEAR); - } - L(reduce_batch_loop_end_label); - } - - inline void reduce_gather(Vmm vmm_dst, int offset) { - switch (jcp_.src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - if (isa == cpu::x64::avx512_core) { - kxnord(k_mask, k_mask, k_mask); - if (jcp_.src_dt == memory::data_type::f32) { - vgatherdps(vmm_src | k_mask, ptr[reg_src + offset + vmm_idx]); - } else { - vpgatherdd(vmm_src | k_mask, ptr[reg_src + offset + vmm_idx]); - 
uni_vcvtdq2ps(vmm_src, vmm_src); - } - } else if (isa == cpu::x64::avx2) { - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - if (jcp_.src_dt == memory::data_type::f32) { - vgatherdps(vmm_src, ptr[reg_src + offset + vmm_idx], vmm_mask); - } else { - vpgatherdd(vmm_src, ptr[reg_src + offset + vmm_idx], vmm_mask); - uni_vcvtdq2ps(vmm_src, vmm_src); - } - } else { - pack_gathered_vector(vmm_src, vmm_idx, offset, jcp_.src_dt); - } - break; - case memory::data_type::bf16: - case memory::data_type::s8: - case memory::data_type::u8: - pack_gathered_vector(vmm_src, vmm_idx, offset, jcp_.src_dt); - break; - default: - assert(!"unknown src_dt"); - } - reduce_kernel(vmm_src, vmm_dst); - } - - inline void pack_gathered_vector(Vmm vmm_val, Vmm vmm_index, int offset, memory::data_type src_dt) { - sub(rsp, vlen); - uni_vmovdqu(ptr[rsp], vmm_index); - size_t repeats = vlen / sizeof(float); - for (size_t i = 0; i < repeats; i++) { - mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); - Xbyak::Address table_idx = ptr[reg_src + offset + reg_tmp_64]; - switch (src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - mov(reg_tmp_64.cvt32(), table_idx); - mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32()); - break; - case memory::data_type::bf16: - mov(reg_tmp_64.cvt16(), table_idx); - mov(ptr[rsp + i * sizeof(ov::intel_cpu::bfloat16_t)], reg_tmp_64.cvt16()); - break; - case memory::data_type::s8: - case memory::data_type::u8: - mov(reg_tmp_64.cvt8(), table_idx); - mov(ptr[rsp + i * sizeof(char)], reg_tmp_64.cvt8()); - break; - default: - assert(!"unknown src_dt"); - } - } - - switch (src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovups(vmm_val, ptr[rsp]); - break; - case memory::data_type::bf16: - uni_vpmovzxwd(vmm_val, ptr[rsp]); - uni_vpslld(vmm_val, vmm_val, 16); - break; - case memory::data_type::s8: - uni_vpmovsxbd(vmm_val, ptr[rsp]); - break; - case memory::data_type::u8: - uni_vpmovzxbd(vmm_val, ptr[rsp]); - break; - default: - 
assert(!"unknown src_dt"); - } - - if (!isFloatCompatible(src_dt)) - uni_vcvtdq2ps(vmm_val, vmm_val); - add(rsp, vlen); - } - - inline void reduce_kernel_tail() { - Xbyak::Label reduce_label; - Xbyak::Label reduce_end_label; - Xbyak::Label reduce_batch_label; - Xbyak::Label reduce_batch_end_label; - - int step = 1; - cmp(reg_work_batch, 1); - je(reduce_label, T_NEAR); - - init_reg_reduce_stride(); - - L(reduce_batch_label); - { - cmp(reg_work_amount, step); - jl(reduce_end_label, T_NEAR); - - // load - load_scalar(xmm_dst, ptr[reg_dst], jcp_.dst_dt); - - // reduce - reduce_batch_tail(); - - // store - store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); - - add(reg_dst, step * jcp_.dst_data_size); - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - - jmp(reduce_batch_label, T_NEAR); - } - L(reduce_batch_end_label); - - L(reduce_label); - { - cmp(reg_work_amount, step); - jl(reduce_end_label, T_NEAR); - - // load - load_scalar(xmm_dst, ptr[reg_dst], jcp_.dst_dt); - - // reduce - reduce_batch_tail(); - - // store - store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); - - add(reg_dst, step * jcp_.dst_data_size); - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - - jmp(reduce_label, T_NEAR); - } - L(reduce_end_label); - } - - inline void reduce_once_tail() { - load_scalar(xmm_src, ptr[reg_src], jcp_.src_dt); - reduce_kernel_scalar(xmm_src, xmm_dst); - if (jcp_.reduce_mode == Algorithm::ReduceOr) { - uni_cmpneqps(xmm_dst, xmm_dst, xmm_zero); - uni_vandps(xmm_dst, xmm_dst, xmm_aux); - } - } - - inline void reduce_batch_tail() { - mov(reg_src_aux, reg_src); - mov(reg_work_batch_aux, reg_work_batch); - - Xbyak::Label reduce_batch_loop_label; - Xbyak::Label reduce_batch_loop_end_label; - L(reduce_batch_loop_label); - { - cmp(reg_work_batch_aux, 1); - jl(reduce_batch_loop_end_label, T_NEAR); - - load_scalar(xmm_src, ptr[reg_src_aux], jcp_.src_dt); - reduce_kernel_scalar(xmm_src, xmm_dst); - if (jcp_.reduce_mode == Algorithm::ReduceOr) 
{ - uni_cmpneqps(xmm_dst, xmm_dst, xmm_zero); - uni_vandps(xmm_dst, xmm_dst, xmm_aux); - } - - add(reg_src_aux, reg_reduce_stride); - sub(reg_work_batch_aux, 1); - jmp(reduce_batch_loop_label, T_NEAR); - } - L(reduce_batch_loop_end_label); - } - - inline void reduce_main_loop() { - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - int step = vlen / sizeof(float) < 8 ? 8 : vlen / sizeof(float); - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - load_vector(vmm_src, ptr[reg_src], jcp_.src_dt); - reduce_kernel(vmm_src, vmm_dst); - - if (isa == cpu::x64::sse41) { - load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt); - reduce_kernel(vmm_src, vmm_dst); - } - - add(reg_src, step * jcp_.src_data_size); - sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - } - - inline void reduce_kernel(Vmm vmm_src, Vmm vmm_dst) { - switch (jcp_.reduce_mode) { - case Algorithm::ReduceAnd: - if (isa == cpu::x64::avx512_core) { - vcmpps(k_mask, vmm_src, vmm_zero, _cmp_neq_uq); - vblendmps(vmm_src | k_mask, vmm_zero, vmm_aux); - } else { - uni_cmpneqps(vmm_src, vmm_src, vmm_zero); - } - uni_vandps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceL1: - uni_vandps(vmm_src, vmm_src, vmm_aux); - uni_vaddps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceLogSum: - case Algorithm::ReduceMean: - case Algorithm::ReduceSum: - uni_vaddps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceMax: - uni_vmaxps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceMin: - uni_vminps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceL2: - case Algorithm::ReduceSumSquare: - uni_vmulps(vmm_src, vmm_src, vmm_src); - uni_vaddps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceLogSumExp: - exp_injector->compute_vector_range(vmm_src.getIdx(), vmm_src.getIdx() + 1); - uni_vaddps(vmm_dst, vmm_dst, vmm_src); - break; 
- case Algorithm::ReduceOr: - if (isa == cpu::x64::avx512_core) { - vcmpps(k_mask, vmm_src, vmm_zero, _cmp_neq_uq); - vblendmps(vmm_src | k_mask, vmm_zero, vmm_aux); - } - uni_vorps(vmm_dst, vmm_dst, vmm_src); - break; - case Algorithm::ReduceProd: - uni_vmulps(vmm_dst, vmm_dst, vmm_src); - break; - default: - assert(!"unsupported reduce mode"); - } - } - - inline void reduce_kernel_scalar(Xmm xmm_src, Xmm xmm_dst) { - switch (jcp_.reduce_mode) { - case Algorithm::ReduceAnd: - uni_cmpneqps(xmm_src, xmm_src, xmm_zero); - uni_vandps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceL1: - uni_vandps(xmm_src, xmm_src, xmm_aux); - uni_vaddps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceLogSum: - case Algorithm::ReduceMean: - case Algorithm::ReduceSum: - uni_vaddps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceMax: - uni_vmaxps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceMin: - uni_vminps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceL2: - case Algorithm::ReduceSumSquare: - uni_vmulps(xmm_src, xmm_src, xmm_src); - uni_vaddps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceLogSumExp: - exp_injector->compute_vector_range(xmm_src.getIdx(), xmm_src.getIdx() + 1); - uni_vaddps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceOr: - uni_vorps(xmm_dst, xmm_dst, xmm_src); - break; - case Algorithm::ReduceProd: - uni_vmulps(xmm_dst, xmm_dst, xmm_src); - break; - default: - assert(!"unsupported reduce mode"); - } - } - - inline void load_dst_vector() { - load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt); - if (isa == cpu::x64::sse41) - load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt); - } - - inline void store_dst_vector() { - if (jcp_.reduce_mode == Algorithm::ReduceOr && isa != cpu::x64::avx512_core) { - uni_cmpneqps(vmm_dst, vmm_dst, vmm_zero); - uni_vandps(vmm_dst, vmm_dst, vmm_aux); - - if (isa == cpu::x64::sse41) { - uni_cmpneqps(vmm_dst_aux, 
vmm_dst_aux, vmm_zero); - uni_vandps(vmm_dst_aux, vmm_dst_aux, vmm_aux); - } - } - store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt); - if (isa == cpu::x64::sse41) - store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt); - } - - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) { - switch (src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovups(vmm_src, op); - break; - case memory::data_type::bf16: - uni_vpmovzxwd(vmm_src, op); - uni_vpslld(vmm_src, vmm_src, 16); - break; - case memory::data_type::s8: - uni_vpmovsxbd(vmm_src, op); - break; - case memory::data_type::u8: - uni_vpmovzxbd(vmm_src, op); - break; - default: - assert(!"unknown src_dt"); - } - - if (!isFloatCompatible(src_dt)) - uni_vcvtdq2ps(vmm_src, vmm_src); - } - - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) { - switch (src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovss(xmm_src, op); - break; - case memory::data_type::bf16: - uni_vpinsrw(xmm_src, xmm_src, op, 0x0); - uni_vpslld(xmm_src, xmm_src, 16); - break; - case memory::data_type::s8: - movsx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - case memory::data_type::u8: - movzx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - default: - assert(!"unknown src_dt"); - } - - if (!isFloatCompatible(src_dt)) { - uni_vcvtdq2ps(xmm_src, xmm_src); - } - } - - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) { - Xmm xmm_dst = Xmm(vmm_dst.getIdx()); - Ymm ymm_dst = Ymm(vmm_dst.getIdx()); - - if (!isFloatCompatible(dst_dt)) { - uni_vcvtps2dq(vmm_dst, vmm_dst); - } - - switch (dst_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovups(op, vmm_dst); - break; - case memory::data_type::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, 
ymm_dst); - break; - case memory::data_type::s8: - if (isa == cpu::x64::avx512_core) { - vpmovsdb(op, vmm_dst); - } else { - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vmovq(op, xmm_dst); - else - uni_vmovd(op, xmm_dst); - } - break; - case memory::data_type::u8: - if (isa == cpu::x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdb(op, vmm_dst); - } else { - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vmovq(op, xmm_dst); - else - uni_vmovd(op, xmm_dst); - } - break; - default: - assert(!"unknown dst_dt"); - } - } - - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) { - if (!isFloatCompatible(dst_dt)) { - uni_vcvtps2dq(xmm_dst, xmm_dst); - } - - switch (dst_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovss(op, xmm_dst); - break; - case memory::data_type::bf16: - uni_vpsrld(xmm_dst, xmm_dst, 16); - uni_vpextrw(op, xmm_dst, 0x0); - break; - case memory::data_type::s8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - case memory::data_type::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - assert(!"unknown dst_dt"); - } - } - - inline void horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt, bool load_embedded = false) { - if (isa == cpu::x64::sse41) { - horiz_store(vmm_dst, dst_dt, load_embedded); - } else if (isa == cpu::x64::avx2) { - Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx()); - vextractf128(xmm_aux1, ymm_dst, 0); - vextractf128(xmm_aux2, ymm_dst, 1); - 
horiz_ps(xmm_aux1, xmm_aux2); - horiz_store(xmm_aux1, dst_dt, load_embedded); - } else { - Xbyak::Zmm zmm_dst = Xbyak::Zmm(vmm_dst.getIdx()); - vextractf32x4(xmm_aux1, zmm_dst, 0); - vextractf32x4(xmm_aux2, zmm_dst, 1); - horiz_ps(xmm_aux1, xmm_aux2); - vextractf32x4(xmm_aux2, zmm_dst, 2); - vextractf32x4(xmm_aux3, zmm_dst, 3); - horiz_ps(xmm_aux2, xmm_aux3); - horiz_ps(xmm_aux1, xmm_aux2); - horiz_store(xmm_aux1, dst_dt, load_embedded); - } - } - - inline void horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt, bool load_embedded) { - uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 - horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - uni_vmovhlps(xmm_aux3, xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 - horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... - if (load_embedded) { - load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); - horiz_ps(xmm_dst, xmm_aux3); - } - store_scalar(ptr[reg_dst], xmm_dst, dst_dt); - } - - inline void horiz_ps(const Xmm& xmm, const Operand& op) { - switch (jcp_.reduce_mode) { - case Algorithm::ReduceAnd: - uni_vandps(xmm, xmm, op); - break; - case Algorithm::ReduceL1: - case Algorithm::ReduceL2: - case Algorithm::ReduceLogSum: - case Algorithm::ReduceMean: - case Algorithm::ReduceSum: - case Algorithm::ReduceSumSquare: - case Algorithm::ReduceLogSumExp: - uni_vaddps(xmm, xmm, op); - break; - case Algorithm::ReduceMax: - uni_vmaxps(xmm, xmm, op); - break; - case Algorithm::ReduceMin: - uni_vminps(xmm, xmm, op); - break; - case Algorithm::ReduceOr: - uni_vorps(xmm, xmm, op); - break; - case Algorithm::ReduceProd: - uni_vmulps(xmm, xmm, op); - break; - default: - assert(!"unsupported reduce mode"); - } - } - - void prepare_aux_table() { - auto broadcast_int = [&](int val) { - for (size_t d = 0; d < vlen / sizeof(float); ++d) { - dd(val); - } - }; - - align(64); - L(l_table); - - broadcast_int(aux_vals.float_one); - broadcast_int(aux_vals.float_abs); - broadcast_int(aux_vals.float_min); - 
broadcast_int(aux_vals.float_max); - broadcast_int(aux_vals.int32_min); - broadcast_int(aux_vals.int32_max); - } - - const struct aux_vals_type { - int float_one = 0x3f800000; // 1.0f - int float_abs = 0x7fffffff; // mask to make positive - int float_min = 0xff7fffff; // float minimum - int float_max = 0x7f7fffff; // float maximum - int int32_min = 0xcf000000; // -2^31 presented in float - int int32_max = 0x4effffff; // 2^31-1 presented in float - } aux_vals; -}; - -template -struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_post_kernel_f32) - - explicit jit_uni_reduce_post_kernel_f32(jit_reduce_config_params jcp, const dnnl_primitive_attr &attr) - : jit_uni_reduce_post_kernel(jcp, attr), jit_generator(jit_name()) {} - - void create_ker() override { - jit_generator::create_kernel(); - ker_ = (decltype(ker_))jit_ker(); - } - - void generate() override { - const auto &p = attr_.post_ops_; - for (int i = 0; i < p.len(); i++) { - auto &post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale)); - } else if (post_op.is_depthwise()) { - depthwise_injectors.push_back(std::make_shared>( - this, post_op)); - } else if (post_op.is_quantization()) { - quantization_injectors.push_back(std::make_shared>( - this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); - } - } - - if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) { - log_injector = std::make_shared>(this, alg_kind::eltwise_log, 0.f, 0.f, 1.f); - } - - if (mayiuse(avx512_core)) - uni_vcvtneps2bf16 = std::make_shared(this, isa); - - this->preamble(); - - planar_layout = jcp_.layout == ReduceLayoutType::reduce_ncsp || jcp_.layout == ReduceLayoutType::reduce_nspc; - post_reduce = jcp_.reduce_mode == Algorithm::ReduceL2 || 
jcp_.reduce_mode == Algorithm::ReduceMean || - jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp; - post_ops_fusing = attr_.post_ops_.len() != 0; - - mov(reg_dst, ptr[reg_params + GET_OFF_POST(dst)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF_POST(work_amount)]); - mov(reg_channel_size, ptr[reg_params + GET_OFF_POST(channel_size)]); - mov(reg_divisor, ptr[reg_params + GET_OFF_POST(divisor)]); - if (jcp_.fuse_low_precision) - mov(reg_src, ptr[reg_params + GET_OFF_POST(src)]); - if (!planar_layout) - mov(reg_reduce_c, ptr[reg_params + GET_OFF_POST(reduce_c)]); - if (post_ops_fusing) { - mov(reg_post_ops_data, ptr[reg_params + GET_OFF_POST(post_op_data)]); - mov(reg_oc_off, ptr[reg_params + GET_OFF_POST(oc_off)]); - } - - if (isa == cpu::x64::avx512_core) - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - - if (jcp_.layout == ReduceLayoutType::reduce_blocked) { - reduce_post_main(); - } else if (jcp_.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) { - // the tail of channel dimension should always be concerned during post ops fusing for nspc layout - Xbyak::Label reduce_nspc_loop_label; - Xbyak::Label reduce_nspc_loop_end_label; - mov(reg_total_work_amount, reg_work_amount); - L(reduce_nspc_loop_label); - { - cmp(reg_total_work_amount, 0); - jle(reduce_nspc_loop_end_label, T_NEAR); - - mov(reg_oc_off, 0); - mov(reg_work_amount, reg_channel_size); - reduce_post_main(); - reduce_post_tail(); - - sub(reg_total_work_amount, reg_channel_size); - jmp(reduce_nspc_loop_label, T_NEAR); - } - L(reduce_nspc_loop_end_label); - } else { - reduce_post_main(); - reduce_post_tail(); - } - - this->postamble(); - - if (mayiuse(avx512_core)) - uni_vcvtneps2bf16->emit_data(); - - if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) { - log_injector->prepare_table(); - } - - for (auto& inj : eltwise_injectors) - inj->prepare_table(); - } - -private: - using Vmm = typename 
conditional3::type; - size_t vlen = cpu_isa_traits::vlen; - bool planar_layout = false; - bool post_reduce = true; - bool post_ops_fusing = false; - - Xbyak::Reg64 reg_src = rbp; - Xbyak::Reg64 reg_dst = r8; - Xbyak::Reg64 reg_work_amount = r9; - Xbyak::Reg64 reg_total_work_amount = r10; - Xbyak::Reg64 reg_channel_size = r11; - Xbyak::Reg64 reg_divisor = r12; - Xbyak::Reg64 reg_reduce_c = r13; - Xbyak::Reg64 reg_params = abi_param1; - - Xbyak::Reg8 reg_tmp_8 = r14b; - Xbyak::Reg32 reg_tmp_32 = r14d; - Xbyak::Reg64 reg_tmp_64 = r14; - - Xbyak::Reg64 reg_oc_off = rax; - Xbyak::Reg64 reg_d_weights = rbx; - Xbyak::Reg64 reg_d_bias = rdx; - Xbyak::Reg64 reg_post_ops_data = r15; - - Vmm vmm_aux = Vmm(0); - Xmm xmm_aux = Xmm(0); - Vmm vmm_dst = Vmm(1); - Xmm xmm_dst = Xmm(1); - Vmm vmm_zero = Vmm(2); - Vmm vmm_dst_aux = Vmm(3); - Xbyak::Xmm xmm_aux1 = Xbyak::Xmm(4); - Xbyak::Xmm xmm_aux2 = Xbyak::Xmm(5); - Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(6); - - Vmm vmm_d_weights = Vmm(7); - Vmm vmm_d_bias = Vmm(8); - - std::shared_ptr uni_vcvtneps2bf16; - std::shared_ptr> log_injector; - - std::vector>> eltwise_injectors; - std::vector>> depthwise_injectors; - std::vector>> quantization_injectors; - - inline void reduce_post_main() { - Xbyak::Label reduce_channel_label; - Xbyak::Label reduce_map_label; - if (planar_layout) { - jmp(reduce_map_label, T_NEAR); - } else { - cmp(reg_reduce_c, 1); - jne(reduce_map_label, T_NEAR); - } - - // further reduce channel block since reduce channel batch has already been reduced - // (X1, X2, X3, X4, X5, X6, X7, X8) -> (Y1, N/A, N/A, N/A, N/A, N/A, N/A, N/A) - // cases: [blocked layout reducing channel dimensions] - L(reduce_channel_label); - { - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - int step = vlen / sizeof(float) < 8 ? 
8 : vlen / sizeof(float); - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - // load - wrap_load_vector(vmm_dst, 0); - if (isa == cpu::x64::sse41) - wrap_load_vector(vmm_dst_aux, 4); - - // reduce and store - horiz_reduce_store(vmm_dst, jcp_.dst_dt); - if (isa == cpu::x64::sse41) - horiz_reduce_store(vmm_dst_aux, jcp_.dst_dt, true); - - add(reg_dst, step * jcp_.dst_data_size); - if (jcp_.fuse_low_precision) - add(reg_src, step * sizeof(float)); - sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - - if (post_reduce || post_ops_fusing) { - mov(reg_dst, ptr[reg_params + GET_OFF_POST(dst)]); - if (jcp_.fuse_low_precision) - mov(reg_src, ptr[reg_params + GET_OFF_POST(src)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF_POST(work_amount)]); - } - } - - // reduce map for value in dst memory - // cases: [ReduceL2] [ReduceLogSum] [ReduceLogSumExp] [ReduceMean] - L(reduce_map_label); - { - if (post_reduce) { - if (jcp_.reduce_mode == Algorithm::ReduceMean) - uni_vbroadcastss(vmm_aux, ptr[reg_divisor]); - - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - int step = vlen / sizeof(float) < 8 ? 
8 : vlen / sizeof(float); - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - wrap_load_vector(vmm_dst, 0); - reduce_map_kernel(vmm_dst); - if (post_ops_fusing) - apply_post_ops(jcp_.dst_dt, jcp_.layout == ReduceLayoutType::reduce_ncsp); - store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt); - - if (isa == cpu::x64::sse41) { - wrap_load_vector(vmm_dst, 4); - reduce_map_kernel(vmm_dst); - if (post_ops_fusing) { - if (jcp_.layout != ReduceLayoutType::reduce_ncsp) - add(reg_oc_off, 4 * sizeof(float)); - apply_post_ops(jcp_.dst_dt, jcp_.layout == ReduceLayoutType::reduce_ncsp); - if (jcp_.layout != ReduceLayoutType::reduce_ncsp) - sub(reg_oc_off, 4 * sizeof(float)); - } - store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst, jcp_.dst_dt); - } - - add(reg_dst, step * jcp_.dst_data_size); - if (jcp_.fuse_low_precision) - add(reg_src, step * sizeof(float)); - if (jcp_.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) - add(reg_oc_off, step * sizeof(float)); - sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - } else { - if (post_ops_fusing) { - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - int step = vlen / sizeof(float) < 8 ? 
8 : vlen / sizeof(float); - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - wrap_load_vector(vmm_dst, 0); - apply_post_ops(jcp_.dst_dt, jcp_.layout == ReduceLayoutType::reduce_ncsp); - store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt); - - if (isa == cpu::x64::sse41) { - wrap_load_vector(vmm_dst, 4); - if (jcp_.layout != ReduceLayoutType::reduce_ncsp) - add(reg_oc_off, 4 * sizeof(float)); - apply_post_ops(jcp_.dst_dt, jcp_.layout == ReduceLayoutType::reduce_ncsp); - if (jcp_.layout != ReduceLayoutType::reduce_ncsp) - sub(reg_oc_off, 4 * sizeof(float)); - store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst, jcp_.dst_dt); - } - - add(reg_dst, step * jcp_.dst_data_size); - if (jcp_.fuse_low_precision) - add(reg_src, step * sizeof(float)); - if (jcp_.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) - add(reg_oc_off, step * sizeof(float)); - sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - } - } - } - } - - inline void reduce_post_tail() { - // reduce map for tail in dst memory - // cases: [ReduceL2] [ReduceLogSum] [ReduceLogSumExp] [ReduceMean] in planar layout - if (post_reduce) { - if (jcp_.reduce_mode == Algorithm::ReduceMean) - uni_vbroadcastss(xmm_aux, ptr[reg_divisor]); - - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - int step = 1; - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - // load - wrap_load_scalar(xmm_dst, 0); - - // reduce - reduce_map_kernel_scalar(xmm_dst); - - // store - if (post_ops_fusing) - apply_post_ops(jcp_.dst_dt, jcp_.layout == ReduceLayoutType::reduce_ncsp); - store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); - - add(reg_dst, step * jcp_.dst_data_size); - if (jcp_.fuse_low_precision) - add(reg_src, step * sizeof(float)); - if (jcp_.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) - add(reg_oc_off, step * sizeof(float)); - 
sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - } else { - if (post_ops_fusing) { - Xbyak::Label reduce_loop_label; - Xbyak::Label reduce_loop_end_label; - - int step = 1; - L(reduce_loop_label); - { - cmp(reg_work_amount, step); - jl(reduce_loop_end_label, T_NEAR); - - // load - wrap_load_scalar(xmm_dst, 0); - - // store - apply_post_ops(jcp_.dst_dt, jcp_.layout == ReduceLayoutType::reduce_ncsp); - store_scalar(ptr[reg_dst], xmm_dst, jcp_.dst_dt); - - add(reg_dst, step * jcp_.dst_data_size); - if (jcp_.fuse_low_precision) - add(reg_src, step * sizeof(float)); - if (jcp_.layout == ReduceLayoutType::reduce_nspc && post_ops_fusing) - add(reg_oc_off, step * sizeof(float)); - sub(reg_work_amount, step); - - jmp(reduce_loop_label, T_NEAR); - } - L(reduce_loop_end_label); - } - } - } - - void apply_post_ops(memory::data_type dst_dt, bool is_broadcast) { - const auto &p = attr_.post_ops_; - int eltwise_inj_idx = 0; - int depthwise_inj_idx = 0; - int quantization_inj_idx = 0; - int post_ops_data_offset = 0; - for (int i = 0; i < p.len(); i++) { - auto& post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1); - eltwise_inj_idx++; - } else if (post_op.is_depthwise()) { - mov(reg_d_weights, ptr[reg_post_ops_data + post_ops_data_offset]); - add(reg_d_weights, reg_oc_off); - - depthwise_injectors[depthwise_inj_idx]->compute_vector_range( - vmm_dst.getIdx(), vmm_dst.getIdx() + 1, reg_d_weights, reg_d_weights, is_broadcast); - - post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); - depthwise_inj_idx++; - } else if (post_op.is_quantization()) { - bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len() - 1; - - int s_idx = vmm_dst.getIdx(); - - 
quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, 0, is_broadcast); - - quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); - - if (do_dequantization) { - quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, 0, is_broadcast); - } - - post_ops_data_offset += quantization_injectors[quantization_inj_idx]->memoryStep(); - quantization_inj_idx++; - } - } - } - - inline void reduce_map_kernel(Vmm vmm_dst) { - if (jcp_.reduce_mode == Algorithm::ReduceMean) - uni_vdivps(vmm_dst, vmm_dst, vmm_aux); - else if (jcp_.reduce_mode == Algorithm::ReduceL2) - uni_vsqrtps(vmm_dst, vmm_dst); - else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) - log_injector->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1); - } - - inline void reduce_map_kernel_scalar(Xmm xmm_dst) { - if (jcp_.reduce_mode == Algorithm::ReduceMean) - uni_vdivps(xmm_dst, xmm_dst, xmm_aux); - else if (jcp_.reduce_mode == Algorithm::ReduceL2) - uni_vsqrtps(xmm_dst, xmm_dst); - else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) - log_injector->compute_vector_range(xmm_dst.getIdx(), xmm_dst.getIdx() + 1); - } - - inline void wrap_load_vector(Vmm vmm_val, size_t offset) { - if (jcp_.fuse_low_precision) - load_vector(vmm_val, ptr[reg_src + offset * sizeof(float)], memory::data_type::f32); - else - load_vector(vmm_val, ptr[reg_dst + offset * jcp_.dst_data_size], jcp_.dst_dt); - } - - inline void 
wrap_load_scalar(Xmm xmm_val, size_t offset) { - if (jcp_.fuse_low_precision) - load_scalar(xmm_val, ptr[reg_src + offset * sizeof(float)], memory::data_type::f32); - else - load_scalar(xmm_val, ptr[reg_dst + offset * jcp_.dst_data_size], jcp_.dst_dt); - } - - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) { - switch (src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovups(vmm_src, op); - break; - case memory::data_type::bf16: - uni_vpmovzxwd(vmm_src, op); - uni_vpslld(vmm_src, vmm_src, 16); - break; - case memory::data_type::s8: - uni_vpmovsxbd(vmm_src, op); - break; - case memory::data_type::u8: - uni_vpmovzxbd(vmm_src, op); - break; - default: - assert(!"unknown src_dt"); - } - - if (!isFloatCompatible(src_dt)) - uni_vcvtdq2ps(vmm_src, vmm_src); - } - - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) { - switch (src_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovss(xmm_src, op); - break; - case memory::data_type::bf16: - uni_vpinsrw(xmm_src, xmm_src, op, 0x0); - uni_vpslld(xmm_src, xmm_src, 16); - break; - case memory::data_type::s8: - movsx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - case memory::data_type::u8: - movzx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - default: - assert(!"unknown src_dt"); - } - - if (!isFloatCompatible(src_dt)) { - uni_vcvtdq2ps(xmm_src, xmm_src); - } - } - - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) { - Xmm xmm_dst = Xmm(vmm_dst.getIdx()); - Ymm ymm_dst = Ymm(vmm_dst.getIdx()); - - if (!isFloatCompatible(dst_dt)) { - uni_vcvtps2dq(vmm_dst, vmm_dst); - } - - switch (dst_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovups(op, vmm_dst); - break; - case memory::data_type::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); - 
vmovdqu16(op, ymm_dst); - break; - case memory::data_type::s8: - if (isa == cpu::x64::avx512_core) { - vpmovsdb(op, vmm_dst); - } else { - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vmovq(op, xmm_dst); - else - uni_vmovd(op, xmm_dst); - } - break; - case memory::data_type::u8: - if (isa == cpu::x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdb(op, vmm_dst); - } else { - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != cpu::x64::sse41) - vmovq(op, xmm_dst); - else - uni_vmovd(op, xmm_dst); - } - break; - default: - assert(!"unknown dst_dt"); - } - } - - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) { - if (!isFloatCompatible(dst_dt)) { - uni_vcvtps2dq(xmm_dst, xmm_dst); - } - - switch (dst_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - uni_vmovss(op, xmm_dst); - break; - case memory::data_type::bf16: - uni_vpsrld(xmm_dst, xmm_dst, 16); - uni_vpextrw(op, xmm_dst, 0x0); - break; - case memory::data_type::s8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - case memory::data_type::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - assert(!"unknown dst_dt"); - } - } - - inline void horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt, bool load_embedded = false) { - if (isa == cpu::x64::sse41) { - horiz_store(vmm_dst, dst_dt, load_embedded); - } else if (isa == cpu::x64::avx2) { - Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx()); - vextractf128(xmm_aux1, ymm_dst, 0); - vextractf128(xmm_aux2, ymm_dst, 1); - 
horiz_ps(xmm_aux1, xmm_aux2); - horiz_store(xmm_aux1, dst_dt, load_embedded); - } else { - Xbyak::Zmm zmm_dst = Xbyak::Zmm(vmm_dst.getIdx()); - vextractf32x4(xmm_aux1, zmm_dst, 0); - vextractf32x4(xmm_aux2, zmm_dst, 1); - horiz_ps(xmm_aux1, xmm_aux2); - vextractf32x4(xmm_aux2, zmm_dst, 2); - vextractf32x4(xmm_aux3, zmm_dst, 3); - horiz_ps(xmm_aux2, xmm_aux3); - horiz_ps(xmm_aux1, xmm_aux2); - horiz_store(xmm_aux1, dst_dt, load_embedded); - } - } - - inline void horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt, bool load_embedded) { - uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 - horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - uni_vmovhlps(xmm_aux3, xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 - horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... - if (jcp_.fuse_low_precision && (post_reduce || post_ops_fusing)) { - if (load_embedded) { - load_scalar(xmm_aux3, ptr[reg_src], memory::data_type::f32); - horiz_ps(xmm_dst, xmm_aux3); - } - store_scalar(ptr[reg_src], xmm_dst, memory::data_type::f32); - } else { - if (load_embedded) { - load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); - horiz_ps(xmm_dst, xmm_aux3); - } - store_scalar(ptr[reg_dst], xmm_dst, dst_dt); - } - } - - inline void horiz_ps(const Xmm& xmm, const Operand& op) { - switch (jcp_.reduce_mode) { - case Algorithm::ReduceAnd: - uni_vandps(xmm, xmm, op); - break; - case Algorithm::ReduceL1: - case Algorithm::ReduceL2: - case Algorithm::ReduceLogSum: - case Algorithm::ReduceMean: - case Algorithm::ReduceSum: - case Algorithm::ReduceSumSquare: - case Algorithm::ReduceLogSumExp: - uni_vaddps(xmm, xmm, op); - break; - case Algorithm::ReduceMax: - uni_vmaxps(xmm, xmm, op); - break; - case Algorithm::ReduceMin: - uni_vminps(xmm, xmm, op); - break; - case Algorithm::ReduceOr: - uni_vorps(xmm, xmm, op); - break; - case Algorithm::ReduceProd: - uni_vmulps(xmm, xmm, op); - break; - default: - assert(!"unsupported reduce mode"); - } - } -}; - -#endif // OPENVINO_ARCH_X86_64 +} 
// namespace -const std::map&, Reduce&)>> Reduce::initializers = { - {ngraph::opset4::ReduceL1::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { +const std::map&, Reduce&)>> Reduce::initializers = { + {op::v4::ReduceL1::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceL1; }}, - {ngraph::opset4::ReduceL2::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v4::ReduceL2::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceL2; }}, - {ngraph::opset1::ReduceLogicalAnd::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceLogicalAnd::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceAnd; }}, - {ngraph::opset1::ReduceLogicalOr::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceLogicalOr::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceOr; }}, - {ngraph::opset1::ReduceMax::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceMax::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceMax; }}, - {ngraph::opset1::ReduceMean::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceMean::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceMean; }}, - {ngraph::opset1::ReduceMin::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceMin::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceMin; }}, - {ngraph::opset1::ReduceProd::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceProd::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = 
Algorithm::ReduceProd; }}, - {ngraph::opset1::ReduceSum::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { + {op::v1::ReduceSum::get_type_info_static(), [](const std::shared_ptr& op, Reduce& node) { node.algorithm = Algorithm::ReduceSum; }} }; -bool Reduce::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Reduce::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (std::dynamic_pointer_cast(op) == nullptr && - std::dynamic_pointer_cast(op) == nullptr) { + if (!op->get_type_info().is_castable(op::util::ArithmeticReductionKeepDims::get_type_info_static()) && + !op->get_type_info().is_castable(op::util::LogicalReductionKeepDims::get_type_info_static())) { errorMessage = "Reduce node with name " + op->get_friendly_name() + " is not derived from ArithmeticReductionKeepDims or LogicalReductionKeepDims"; return false; } - if (const auto reduce = std::dynamic_pointer_cast(op)) { - auto reduceConst = std::dynamic_pointer_cast(reduce->get_input_node_shared_ptr(REDUCE_INDEXES)); - if (!reduceConst) { - errorMessage = "Second tensor is not constant"; - return false; - } + const auto idxIn = op->get_input_node_shared_ptr(REDUCE_INDEXES); + if (idxIn->get_type_info() != op::v0::Constant::get_type_info_static()) { + errorMessage = "Only const 'reduce_indexes' input is supported"; + return false; } - if (const auto reduce = std::dynamic_pointer_cast(op)) { - auto reduceConst = std::dynamic_pointer_cast(reduce->get_input_node_shared_ptr(REDUCE_INDEXES)); - if (!reduceConst) { - errorMessage = "Second tensor is not constant"; - return false; - } + if (idxIn->get_element_type() != ov::element::i32 && idxIn->get_element_type() != ov::element::i64) { + errorMessage = "Only i32 and i64 'reduce_indexes' input is supported"; + return false; } if (initializers.find(op->get_type_info()) == initializers.end()) { errorMessage = "Doesn't support Reduce algorithm: " + 
std::string(op->get_type_info().name); return false; } - if (std::dynamic_pointer_cast(op->get_input_node_shared_ptr(REDUCE_INDEXES)) == nullptr) { - errorMessage = "Only const 'reduce_indexes' input is supported"; - return false; - } } catch (...) { return false; } return true; } -Reduce::Reduce(const std::shared_ptr& op, const GraphContext::CPtr context) +Reduce::Reduce(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, PortMask(REDUCE_INDEXES))) { std::string errorMessage; - if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "Reduce node with name '" + getName() + "'"; - initializers.at(op->get_type_info())(op, *this); - if (const auto reduce = std::dynamic_pointer_cast(op)) { - keep_dims = reduce->get_keep_dims(); - auto reduceConst = std::dynamic_pointer_cast(reduce->get_input_node_shared_ptr(REDUCE_INDEXES)); - if (!reduceConst) - IE_THROW() << errorPrefix << " second tensor is not constant!"; - raw_axes = reduceConst->cast_vector(); - } else if (const auto reduce = std::dynamic_pointer_cast(op)) { - keep_dims = reduce->get_keep_dims(); - auto reduceConst = std::dynamic_pointer_cast(reduce->get_input_node_shared_ptr(REDUCE_INDEXES)); - if (!reduceConst) - IE_THROW() << errorPrefix << " second tensor is not constant!"; - raw_axes = reduceConst->cast_vector(); - } - set_use_aux_kernel = false; - fuse_low_precision = false; - vec_reduceDH_prc.clear(); - vec_reduceCDW_prc.clear(); - setJITBeyond5D(); - } else { + if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } + + initializers.at(op->get_type_info())(op, *this); + + if (const auto reduction = std::dynamic_pointer_cast(op)) { + keep_dims = reduction->get_keep_dims(); + } + const auto idxIn = ov::as_type_ptr(op->get_input_node_shared_ptr(REDUCE_INDEXES)); + if (idxIn->get_element_type() == ov::element::i32) { + const auto tmpData = idxIn->get_vector(); + raw_axes.assign(tmpData.begin(), tmpData.end()); + 
} else if (idxIn->get_element_type() == ov::element::i64) { + raw_axes = idxIn->get_vector(); + } + + set_use_aux_kernel = false; + fuse_low_precision = false; + vec_reduceDH_prc.clear(); + vec_reduceCDW_prc.clear(); + setJITBeyond5D(); } void Reduce::getSupportedDescriptors() { - if (getParentEdges().size() != 2) - IE_THROW() << errorPrefix << " gets incorrect number of input edges!"; - if (getChildEdges().empty()) - IE_THROW() << errorPrefix << " gets incorrect number of output edges!"; + if (getParentEdges().size() != 2) { + THROW_CPU_NODE_ERR << " gets incorrect number of input edges!"; + } + if (getChildEdges().empty()) { + THROW_CPU_NODE_ERR << " gets incorrect number of output edges!"; + } if (getInputShapeAtPort(REDUCE_INDEXES).getRank() != 1) { - IE_THROW() << errorPrefix << " gets incorrect index vector dimension! Index vector should be 1 dimension."; + THROW_CPU_NODE_ERR << " gets incorrect index vector dimension! Index vector should be 1 dimension."; } if (keep_dims) { if (getInputShapeAtPort(REDUCE_DATA).getRank() != getOutputShapeAtPort(0).getRank()) - IE_THROW() << errorPrefix << " gets incorrect number of input/output dimensions!"; + THROW_CPU_NODE_ERR << " gets incorrect number of input/output dimensions!"; } else { // In fact, after the Reduce operation, the shape must be a scalar if the previous one was 1d. // But for now, 0d tensor (scalar) is emulated as 1d tensor. Skip checking in such cases. 
bool is_emulated_0d_as_1d = getInputShapeAtPort(REDUCE_DATA).getRank() == 1 && getOutputShapeAtPort(0).getRank() == 1; if (getInputShapeAtPort(REDUCE_DATA).getRank() <= getOutputShapeAtPort(0).getRank() && !is_emulated_0d_as_1d) - IE_THROW() << errorPrefix << "gets incorrect number of input/output dimensions!"; + THROW_CPU_NODE_ERR << "gets incorrect number of input/output dimensions!"; } } void Reduce::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) - return; - - input_prec = getOriginalInputPrecisionAtPort(REDUCE_DATA); + const auto& input_prec_0 = getOriginalInputPrecisionAtPort(REDUCE_DATA); + auto input_prec_1 = getOriginalInputPrecisionAtPort(REDUCE_INDEXES); output_prec = getOriginalOutputPrecisionAtPort(0); + if (input_prec_1 == Precision::U64) { + input_prec_1 = Precision::I64; + } else if (!one_of(input_prec_1, Precision::I32, Precision::I64)) { + input_prec_1 = Precision::I32; + } + if (!fusedWith.empty()) { // In jit mode we use the output memory as an intermediate accumulator for certain reduce modes. // If the post ops node has a lower precision for such modes, working buffer with original precision is needed, @@ -1876,27 +219,27 @@ void Reduce::initSupportedPrimitiveDescriptors() { output_prec = fused_prec; } - jit_mode = canApplyJIT(input_prec, output_prec); + jit_mode = canApplyJIT(input_prec_0, output_prec); if (jit_mode) { // Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to // the possible accuracy loss. Therefore, for such mods, we will change the output precision to FP32. 
if (Precision::BF16 == output_prec) { - if (!mayiuse(avx512_core)) { - output_prec = Precision::FP32; + if (!x64::mayiuse(x64::avx512_core)) { + output_prec = Precision::FP32; } else if (algorithm != Algorithm::ReduceAnd && algorithm != Algorithm::ReduceOr && algorithm != Algorithm::ReduceMin && algorithm != Algorithm::ReduceMax) { - output_prec = Precision::FP32; + output_prec = Precision::FP32; } } } intermediate_prec = fuse_low_precision ? Precision(Precision::FP32) : output_prec; - precision_change = input_prec != intermediate_prec; + precision_change = input_prec_0 != intermediate_prec; support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp && algorithm != Algorithm::ReduceSumSquare; - src_data_size = input_prec.size(); + src_data_size = input_prec_0.size(); dst_data_size = output_prec.size(); intermediate_data_size = intermediate_prec.size(); @@ -1912,11 +255,11 @@ void Reduce::initSupportedPrimitiveDescriptors() { auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto pushDesc = [&](LayoutType inFormat, LayoutType outFormat, InferenceEngine::Precision inPrecision, - InferenceEngine::Precision outPrecision, impl_desc_type impl_type, bool useAclExecutor = false) { - config.inConfs[REDUCE_DATA].setMemDesc(creatorsMap.at(inFormat)->createSharedDesc(inPrecision, getInputShapeAtPort(REDUCE_DATA))); - config.inConfs[REDUCE_INDEXES].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(InferenceEngine::Precision::I32, - getInputShapeAtPort(REDUCE_INDEXES))); + auto pushDesc = [&](const LayoutType &inFormat, const LayoutType &outFormat, const Precision& inPrecision0, const Precision& inPrecision1, + const Precision& outPrecision, const impl_desc_type &impl_type, bool useAclExecutor = false) { + config.inConfs[REDUCE_DATA].setMemDesc(creatorsMap.at(inFormat)->createSharedDesc(inPrecision0, getInputShapeAtPort(REDUCE_DATA))); + 
config.inConfs[REDUCE_INDEXES].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(inPrecision1, + getInputShapeAtPort(REDUCE_INDEXES))); config.outConfs[0].setMemDesc(creatorsMap.at(outFormat)->createSharedDesc(outPrecision, getOutputShapeAtPort(0))); if (useAclExecutor) { @@ -1957,35 +300,41 @@ void Reduce::initSupportedPrimitiveDescriptors() { if (jit_mode) { impl_desc_type impl_type = impl_desc_type::jit_sse42; - if (mayiuse(cpu::x64::avx512_core)) { + if (x64::mayiuse(x64::avx512_core)) { impl_type = impl_desc_type::jit_avx512; - } else if (mayiuse(cpu::x64::avx2)) { + } else if (x64::mayiuse(x64::avx2)) { impl_type = impl_desc_type::jit_avx2; } - pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_type); + pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec_0, input_prec_1, output_prec, impl_type); if ((getInputShapeAtPort(REDUCE_DATA).getRank() == 4 || getInputShapeAtPort(REDUCE_DATA).getRank() == 5) && getInputShapeAtPort(REDUCE_DATA).getMinDims()[1] > 1) { if (keep_dims) { - if (mayiuse(cpu::x64::avx512_core)) { - pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_type); - pushDesc(LayoutType::nCsp16c, LayoutType::nCsp16c, input_prec, output_prec, impl_type); - } else if (mayiuse(cpu::x64::avx2) || mayiuse(cpu::x64::sse41)) { - pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_type); - pushDesc(LayoutType::nCsp8c, LayoutType::nCsp8c, input_prec, output_prec, impl_type); + pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec_0, input_prec_1, output_prec, impl_type); + if (x64::mayiuse(x64::avx512_core)) { + if (src_data_size <= 4) { + pushDesc(LayoutType::nCsp16c, LayoutType::nCsp16c, input_prec_0, input_prec_1, output_prec, impl_type); + } else if (src_data_size == 8) { + pushDesc(LayoutType::nCsp8c, LayoutType::nCsp8c, input_prec_0, input_prec_1, output_prec, impl_type); + } + } else if (src_data_size <= 4) { + pushDesc(LayoutType::nCsp8c, LayoutType::nCsp8c, 
input_prec_0, input_prec_1, output_prec, impl_type); } } else { - if (mayiuse(cpu::x64::avx512_core)) { - pushDesc(LayoutType::nspc, LayoutType::ncsp, input_prec, output_prec, impl_type); - pushDesc(LayoutType::nCsp16c, LayoutType::ncsp, input_prec, output_prec, impl_type); - } else if (mayiuse(cpu::x64::avx2) || mayiuse(cpu::x64::sse41)) { - pushDesc(LayoutType::nspc, LayoutType::ncsp, input_prec, output_prec, impl_type); - pushDesc(LayoutType::nCsp8c, LayoutType::ncsp, input_prec, output_prec, impl_type); + pushDesc(LayoutType::nspc, LayoutType::ncsp, input_prec_0, input_prec_1, output_prec, impl_type); + if (x64::mayiuse(x64::avx512_core)) { + if (src_data_size <= 4) { + pushDesc(LayoutType::nCsp16c, LayoutType::ncsp, input_prec_0, input_prec_1, output_prec, impl_type); + } else if (src_data_size == 8) { + pushDesc(LayoutType::nCsp8c, LayoutType::ncsp, input_prec_0, input_prec_1, output_prec, impl_type); + } + } else if (src_data_size <= 4) { + pushDesc(LayoutType::nCsp8c, LayoutType::ncsp, input_prec_0, input_prec_1, output_prec, impl_type); } } } } else { - pushDesc(LayoutType::ncsp, LayoutType::ncsp, InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32, impl_desc_type::ref); + pushDesc(LayoutType::ncsp, LayoutType::ncsp, Precision::FP32, Precision::I32, Precision::FP32, impl_desc_type::ref); } } @@ -2010,7 +359,7 @@ void Reduce::prepareParams() { } src_dims = getParentEdgesAtPort(REDUCE_DATA)[0]->getMemory().getDesc().getShape().getDims(); - std::vector reduce_axes; + std::vector reduce_axes; if (jit_mode && jit_beyond_5D) { reduce_axes = update_src_dims(); } else { @@ -2028,21 +377,22 @@ void Reduce::prepareParams() { apply_post_kernel = true; apply_division = false; - auto builder = [&](const ReduceKey& key) -> std::shared_ptr { - std::shared_ptr post_kernel; + auto builder = [&](const ReduceKey& key) -> std::shared_ptr> { + std::shared_ptr> postKernel; #if defined(OPENVINO_ARCH_X86_64) - if (mayiuse(cpu::x64::avx512_core)) { - 
post_kernel.reset(new jit_uni_reduce_post_kernel_f32(key.jcp, *attr.get())); - } else if (mayiuse(cpu::x64::avx2)) { - post_kernel.reset(new jit_uni_reduce_post_kernel_f32(key.jcp, *attr.get())); - } else if (mayiuse(cpu::x64::sse41)) { - post_kernel.reset(new jit_uni_reduce_post_kernel_f32(key.jcp, *attr.get())); + if (x64::mayiuse(x64::avx512_core)) { + postKernel.reset(new JitReducePostKernel(key.jcp, *attr.get())); + } else if (x64::mayiuse(x64::avx2)) { + postKernel.reset(new JitReducePostKernel(key.jcp, *attr.get())); + } else if (x64::mayiuse(x64::sse41)) { + postKernel.reset(new JitReducePostKernel(key.jcp, *attr.get())); } #endif // OPENVINO_ARCH_X86_64 - if (post_kernel) - post_kernel->create_ker(); + if (postKernel) { + postKernel->create_kernel(); + } - return post_kernel; + return postKernel; }; if (compile_post_kernel) { @@ -2052,12 +402,26 @@ void Reduce::prepareParams() { auto cache = context->getParamsCache(); auto result = cache->getOrCreate(key, builder); if (!result.first) { - IE_THROW() << errorPrefix << " has not found jit_uni_reduce_post_kernel_f32."; + THROW_CPU_NODE_ERR << " has not found JitReducePostKernel."; } reduce_post_kernel = result.first; jit_mode = jit_mode && reduce_post_kernel; + if (jit_mode) { + size_t divisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW); + if (divisor == 0lu) { + divisor = 1lu; + } + if (reduce_post_kernel->get_exec_prc().size() == 4) { + in_out_divisor_f32 = static_cast(divisor); + in_out_divisor = &in_out_divisor_f32; + } else if (reduce_post_kernel->get_exec_prc().size() == 8) { + in_out_divisor_f64 = static_cast(divisor); + in_out_divisor = &in_out_divisor_f64; + } + } + if (!isDynamicNode()) { compile_post_kernel = false; } @@ -2071,11 +435,11 @@ void Reduce::createPrimitive() { auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); auto srcMemPtr = getParentEdgeAt(REDUCE_DATA)->getMemoryPtr(); if (!dstMemPtr || !dstMemPtr->isAllocated()) - IE_THROW() << errorPrefix << " has not allocated destination 
memory."; + THROW_CPU_NODE_ERR << " has not allocated destination memory."; if (!srcMemPtr || !srcMemPtr->isAllocated()) - IE_THROW() << errorPrefix << " has not allocate input memory."; + THROW_CPU_NODE_ERR << " has not allocate input memory."; if (getSelectedPrimitiveDescriptor() == nullptr) - IE_THROW() << errorPrefix << " has nullable preferable primitive descriptor"; + THROW_CPU_NODE_ERR << " has nullable preferable primitive descriptor"; if (srcMemPtr->getDesc().hasLayoutType(LayoutType::ncsp)) { layout = ReduceLayoutType::reduce_ncsp; @@ -2086,17 +450,15 @@ void Reduce::createPrimitive() { } // hybrid layout: nspc/blocked layout for input and ncsp for output - // !keep_dims is needed to avoid hybrid layout for cases eg. (A, B, C, D) reduce to (A, 1, 1, 1) + // !keepDims is needed to avoid hybrid layout for cases eg. (A, B, C, D) reduce to (A, 1, 1, 1) if (!keep_dims && (layout == ReduceLayoutType::reduce_nspc || layout == ReduceLayoutType::reduce_blocked)) { is_hybrid_layout = dstMemPtr->getDesc().hasLayoutType(LayoutType::ncsp); } auto selectedPD = getSelectedPrimitiveDescriptor(); - jcp = jit_reduce_config_params(); - jcp.src_dt = DnnlExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[REDUCE_DATA].getMemDesc()->getPrecision()); - jcp.dst_dt = DnnlExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().outConfs[0].getMemDesc()->getPrecision()); - jcp.src_data_size = DnnlExtensionUtils::sizeOfDataType(jcp.src_dt); - jcp.dst_data_size = DnnlExtensionUtils::sizeOfDataType(jcp.dst_dt); + jcp = JitReduceConfigParams(); + jcp.src_el_type = details::convertPrecision(selectedPD->getConfig().inConfs[REDUCE_DATA].getMemDesc()->getPrecision()); + jcp.dst_el_type = details::convertPrecision(selectedPD->getConfig().outConfs[0].getMemDesc()->getPrecision()); jcp.layout = layout; jcp.reduce_mode = getAlgorithm(); jcp.fuse_low_precision = fuse_low_precision; @@ -2107,10 +469,11 @@ void Reduce::createPrimitive() { compile_post_kernel = false; #endif 
// OPENVINO_ARCH_X86_64 - if (mayiuse(cpu::x64::avx512_core)) { - blk_size = 16; + size_t prcDiv = jcp.src_el_type.size() < 4 ? 4 : jcp.src_el_type.size(); + if (x64::mayiuse(x64::avx512_core)) { + blk_size = 64 / prcDiv; } else { - blk_size = 8; + blk_size = 32 / prcDiv; } if (inputShapesDefined()) { @@ -2120,8 +483,7 @@ void Reduce::createPrimitive() { } auto reduce_jcp = jcp; - reduce_jcp.dst_dt = fuse_low_precision ? DnnlExtensionUtils::IEPrecisionToDataType(intermediate_prec) : jcp.dst_dt; - jcp.dst_data_size = DnnlExtensionUtils::sizeOfDataType(reduce_jcp.dst_dt); + reduce_jcp.dst_el_type = fuse_low_precision ? details::convertPrecision(intermediate_prec) : jcp.dst_el_type; create_reduce_kernel(reduce_kernel, reduce_jcp); // set_use_aux_kernel being false means this is a dynamic case, and prepareParams() hasn't been invoked yet. @@ -2139,31 +501,27 @@ void Reduce::createPrimitive() { // stage to reduce the rest dimensions. if (use_aux_kernel) { aux_jcp = reduce_jcp; - aux_jcp.src_dt = reduce_jcp.dst_dt; - aux_jcp.src_data_size = reduce_jcp.dst_data_size; + aux_jcp.src_el_type = reduce_jcp.dst_el_type; create_reduce_kernel(reduce_aux_kernel, aux_jcp); } } -void Reduce::create_reduce_kernel(std::shared_ptr &kernel, const jit_reduce_config_params &jcp) { +void Reduce::create_reduce_kernel(std::shared_ptr> &kernel, const JitReduceConfigParams &jcp) { #if defined(OPENVINO_ARCH_X86_64) - if (mayiuse(cpu::x64::avx512_core)) { - kernel.reset(new jit_uni_reduce_kernel_f32(jcp)); - } else if (mayiuse(cpu::x64::avx2)) { - kernel.reset(new jit_uni_reduce_kernel_f32(jcp)); - } else if (mayiuse(cpu::x64::sse41)) { - kernel.reset(new jit_uni_reduce_kernel_f32(jcp)); + if (x64::mayiuse(x64::avx512_core)) { + kernel.reset(new JitReduceKernel(jcp)); + } else if (x64::mayiuse(x64::avx2)) { + kernel.reset(new JitReduceKernel(jcp)); + } else if (x64::mayiuse(x64::sse41)) { + kernel.reset(new JitReduceKernel(jcp)); } #endif // OPENVINO_ARCH_X86_64 - if (kernel) - 
kernel->create_ker(); + if (kernel) { + kernel->create_kernel(); + } jit_mode = jit_mode && kernel; } -void Reduce::executeDynamicImpl(dnnl::stream strm) { - execute(strm); -} - void Reduce::execute(dnnl::stream strm) { auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); auto srcMemPtr = getParentEdgeAt(REDUCE_DATA)->getMemoryPtr(); @@ -2191,11 +549,15 @@ void Reduce::execute(dnnl::stream strm) { auto out_ptr = reinterpret_cast(dst_data); reduce_ref(in_ptr, out_ptr); } else { - IE_THROW() << errorPrefix << " supports only plain layout on machine w/o sse42."; + THROW_CPU_NODE_ERR << " supports only plain layout on machine w/o sse42."; } } } +void Reduce::executeDynamicImpl(dnnl::stream strm) { + execute(strm); +} + void Reduce::reduce_type(const uint8_t *in_ptr, uint8_t *out_ptr) { reduce_stride = IW; @@ -2214,9 +576,19 @@ void Reduce::reduce_type(const uint8_t *in_ptr, uint8_t *out_ptr) { auto dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); out_ptr = reinterpret_cast(dstMemPtr->getData()); if (layout == ReduceLayoutType::reduce_nspc) { - nspc2ncsp(proc_ptr, out_ptr); + switch (dst_data_size) { + case 1: nspc2ncsp(proc_ptr, out_ptr); break; + case 2: nspc2ncsp(proc_ptr, out_ptr); break; + case 4: nspc2ncsp(proc_ptr, out_ptr); break; + case 8: nspc2ncsp(proc_ptr, out_ptr); break; + } } else { - blocked2ncsp(proc_ptr, out_ptr); + switch (dst_data_size) { + case 1: blocked2ncsp(proc_ptr, out_ptr); break; + case 2: blocked2ncsp(proc_ptr, out_ptr); break; + case 4: blocked2ncsp(proc_ptr, out_ptr); break; + case 8: blocked2ncsp(proc_ptr, out_ptr); break; + } } } } @@ -2242,20 +614,20 @@ void Reduce::reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr) { size_t ob = ReduceN ? 0 : ib; GET_PTR_N_PLN; if (!ReduceC && !ReduceD && ReduceW) { size_t work_amount = ReduceH ? IH * IW : IW; - if (work_amount < blk_size && mayiuse(cpu::x64::avx2)) { + if (work_amount < blk_size && x64::mayiuse(x64::avx2)) { size_t outer_size = ReduceH ? 
IC * ID : IC * ID * IH; size_t inner_size = ReduceH ? IH * IW : IW; size_t output_inner_size = ReduceH ? OH * OW : OW; size_t IK = outer_size / blk_size; - std::vector index_buf(blk_size); + std::vector indicesBuf(16, work_amount * src_data_size); for (size_t i = 0; i < blk_size; i++) { - index_buf[i] = i * work_amount * src_data_size; + indicesBuf[i] *= i; } parallel_for(IK, [&](size_t ik) { size_t ok = ik; reduce_kernel_process(in_ptr_n + ik * blk_size * inner_size * src_data_size, out_ptr_n + ok * blk_size * output_inner_size * dst_data_size, - work_amount, 1, 0, static_cast(&index_buf[0])); + work_amount, 1, 0, static_cast(&indicesBuf[0])); }); size_t tail_start = IK * blk_size; size_t IT = outer_size - tail_start; @@ -2334,15 +706,18 @@ void Reduce::reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr) { } } else if (!ReduceC && !ReduceD && ReduceH && !ReduceW) { parallel_for2d(IC, ID, [&](size_t ic, size_t id) { - size_t oc = ic, od = id; GET_PTR_NCD_BASE_PTR_N_PLN; - parallel_for(IW / blk_size, [&](size_t ibw){ + size_t oc = ic, od = id; + GET_PTR_NCD_BASE_PTR_N_PLN; + parallel_for(IW / blk_size, [&](size_t ibw) { size_t obw = ibw; - reduce_kernel_process(in_ptr_ncd + ibw * blk_size * src_data_size, out_ptr_ncd + obw * blk_size * dst_data_size, - blk_size, 0, IH); + reduce_kernel_process(in_ptr_ncd + ibw * blk_size * src_data_size, + out_ptr_ncd + obw * blk_size * dst_data_size, + blk_size, 0, IH); }); size_t tail_start = IW / blk_size * blk_size; - reduce_kernel_process(in_ptr_ncd + tail_start * src_data_size, out_ptr_ncd + tail_start * dst_data_size, - IW - tail_start, 0, IH); + reduce_kernel_process(in_ptr_ncd + tail_start * src_data_size, + out_ptr_ncd + tail_start * dst_data_size, + IW - tail_start, 0, IH); }); } else if (!ReduceC && ReduceD && ReduceH && !ReduceW) { size_t IWB = IW / blk_size; @@ -2352,10 +727,10 @@ void Reduce::reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr) { // step1: !ReduceD && ReduceH && !ReduceW uint8_t *prc_ptr_n = 
&vec_reduceDH_prc[0]; init_dst_data(prc_ptr_n, prc_size); - parallel_for2d(ID, IWB, [&](size_t id, size_t iwb){ + parallel_for2d(ID, IWB, [&](size_t id, size_t iwb) { size_t pd = id, pwb = iwb; reduce_kernel_process(in_ptr_n + (id * IH * IW + iwb * blk_size) * src_data_size, - prc_ptr_n + (pd * PW + pwb * blk_size) * prc_data_size, blk_size, 0, IH); + prc_ptr_n + (pd * PW + pwb * blk_size) * prc_data_size, blk_size, 0, IH); }); // step2: ReduceD reduce_stride = PW; @@ -2372,7 +747,7 @@ void Reduce::reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr) { size_t tail_start = IWB * blk_size; parallel_for(IW - tail_start, [&](size_t i_tail) { reduce_kernel_process(in_ptr_n + (tail_start + i_tail) * src_data_size, out_ptr_n + (tail_start + i_tail) * dst_data_size, - 1, 0, ID * IH); + 1, 0, ID * IH); }); } else { parallel_for(IC, [&](size_t ic) { @@ -2450,7 +825,8 @@ void Reduce::reduce_BLK(const uint8_t *in_ptr, uint8_t *out_ptr) { apply_post_kernel = !apply_division; } parallel_for2d(ICB, ID, [&](size_t icb, size_t id) { - size_t ocb = icb, od = id; GET_PTR_NCD_BASE_PTR_N_BLK; + size_t ocb = icb, od = id; + GET_PTR_NCD_BASE_PTR_N_BLK; reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW * blk_size); }); } else if (ReduceC && ReduceD && ReduceH && ReduceW) { @@ -2604,9 +980,9 @@ void Reduce::reduce_BLK_concern_padding(const uint8_t *in_ptr, uint8_t *out_ptr) } inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, size_t work_amount, - size_t reduce_w, size_t work_batch, const int *tab_idx) { - const float divisor = apply_division ? 
static_cast(IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW)) : 1; - auto arg = jit_reduce_call_args(); + size_t reduce_w, size_t work_batch, const int *tab_idx) { + auto arg = JitReduceCallArgs(); + arg.src = static_cast(in_p); arg.idx = tab_idx; arg.dst = static_cast(out_p); @@ -2614,28 +990,28 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s arg.work_batch = work_batch; arg.reduce_w = reduce_w; arg.reduce_stride = reduce_stride; - arg.can_divide = apply_division ? 1 : 0; - arg.divisor = &divisor; + arg.can_divide = apply_division ? 1lu : 0lu; + arg.divisor = in_out_divisor; (*reduce_kernel)(&arg); } inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { const uint8_t *in_ptr = fuse_low_precision ? static_cast(&intermediate_buf[0]) : nullptr; - const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW); - const float divisor = static_cast(integerDivisor); if (layout == ReduceLayoutType::reduce_ncsp) { + const auto work_amount = OD * OH * OW; parallel_for2d(OB, OC, [&](size_t ob, size_t oc) { const uint8_t *in_p = in_ptr + (ob * OC + oc) * OD * OH * OW * intermediate_data_size; - uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size; - auto arg = jit_reduce_post_call_args(); + uint8_t *out_p = out_ptr + (ob * OC + oc) * work_amount * dst_data_size; + auto arg = JitReducePostCallArgs(); arg.src = static_cast(in_p); arg.dst = static_cast(out_p); arg.oc_off = oc * sizeof(float); arg.channel_size = OC; - arg.work_amount = OD * OH * OW; - arg.divisor = &divisor; + arg.work_amount = work_amount; + arg.divisor = in_out_divisor; arg.post_op_data = static_cast(postOpsDataPtrs.data()); + (*reduce_post_kernel)(&arg); }); } else if (layout == ReduceLayoutType::reduce_nspc) { @@ -2643,33 +1019,38 @@ inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { size_t OP = OB * OC >= num_threads ? 
OB * OC : OB * OC * OD; if (OP < num_threads && OW > blk_size) OP *= OH; - size_t work_amount = OB * OC * OD * OH * OW / OP; + const auto work_amount = OB * OC * OD * OH * OW / OP; parallel_for(OP, [&](size_t op) { const uint8_t *in_p = in_ptr + op * work_amount * intermediate_data_size; uint8_t *out_p = out_ptr + op * work_amount * dst_data_size; - auto arg = jit_reduce_post_call_args(); + auto arg = JitReducePostCallArgs(); + arg.src = static_cast(in_p); arg.dst = static_cast(out_p); arg.oc_off = 0; arg.channel_size = OW; // OW is related to nspc-ncsp dimension reinterpret arg.work_amount = work_amount; - arg.divisor = &divisor; + arg.divisor = in_out_divisor; arg.post_op_data = static_cast(postOpsDataPtrs.data()); + (*reduce_post_kernel)(&arg); }); } else { size_t OCB = div_up(OC, blk_size); + const auto work_amount = OD * OH * OW * blk_size; parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) { const uint8_t *in_p = in_ptr + (ob * OCB + ocb) * OD * OH * OW * blk_size * intermediate_data_size; - uint8_t *out_p = out_ptr + (ob * OCB + ocb) * OD * OH * OW * blk_size * dst_data_size; - auto arg = jit_reduce_post_call_args(); + uint8_t *out_p = out_ptr + (ob * OCB + ocb) * work_amount * dst_data_size; + auto arg = JitReducePostCallArgs(); + arg.src = static_cast(in_p); arg.dst = static_cast(out_p); arg.reduce_c = ReduceC ? 
1 : 0; arg.oc_off = ocb * blk_size * sizeof(float); - arg.work_amount = OD * OH * OW * blk_size; - arg.divisor = &divisor; + arg.work_amount = work_amount; + arg.divisor = in_out_divisor; arg.post_op_data = static_cast(postOpsDataPtrs.data()); + (*reduce_post_kernel)(&arg); }); } @@ -2708,6 +1089,7 @@ inline void Reduce::output_info_restore(uint8_t **out_ptr) { } } +template void Reduce::nspc2ncsp(uint8_t *proc_ptr, uint8_t *out_ptr) { // dimension reinterpret after nspc reusing routine reduce_PLN // demote -- nspc -- ncsp @@ -2724,45 +1106,20 @@ void Reduce::nspc2ncsp(uint8_t *proc_ptr, uint8_t *out_ptr) { const size_t stride1 = DIM2 * DIM3 * DIM4; const size_t stride0 = stride1 * DIM1; - if (dst_data_size == 4) { - auto src_data = reinterpret_cast(proc_ptr); - auto dst_data = reinterpret_cast(out_ptr); - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b * stride0 + j * DIM1; - auto dst_off = b * stride0 + j; - for (size_t dim1 = 0; dim1 < DIM1; dim1++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - }); - } else if (dst_data_size == 2) { - auto src_data = reinterpret_cast(proc_ptr); - auto dst_data = reinterpret_cast(out_ptr); - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b * stride0 + j * DIM1; - auto dst_off = b * stride0 + j; - for (size_t dim1 = 0; dim1 < DIM1; dim1++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - }); - } else { - auto src_data = reinterpret_cast(proc_ptr); - auto dst_data = reinterpret_cast(out_ptr); - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b * stride0 + j * DIM1; - auto dst_off = b * stride0 + j; - for (size_t dim1 = 0; dim1 < DIM1; dim1++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - }); - } + auto src_data = reinterpret_cast(proc_ptr); + auto dst_data = reinterpret_cast(out_ptr); + parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { + 
auto src_off = b * stride0 + j * DIM1; + auto dst_off = b * stride0 + j; + for (size_t dim1 = 0; dim1 < DIM1; dim1++) { + dst_data[dst_off] = src_data[src_off]; + src_off++; + dst_off += stride1; + } + }); } +template void Reduce::blocked2ncsp(uint8_t *proc_ptr, uint8_t *out_ptr) { const size_t DIM0 = OB; const size_t DIM1 = OC; @@ -2773,70 +1130,26 @@ void Reduce::blocked2ncsp(uint8_t *proc_ptr, uint8_t *out_ptr) { const size_t src_stride0 = stride1 * div_up(OC, blk_size) * blk_size; const size_t dst_stride0 = stride1 * DIM1; - if (dst_data_size == 4) { - auto src_data = reinterpret_cast(proc_ptr); - auto dst_data = reinterpret_cast(out_ptr); - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b * src_stride0 + j * blk_size; - auto dst_off = b * dst_stride0 + j; - for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) { - for (size_t k = 0; k < blk_size; k++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - src_off += (stride1 - 1) * blk_size; - } - size_t tail = DIM1 % blk_size; - for (size_t k = 0; k < tail; k++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - }); - } else if (dst_data_size == 2) { - auto src_data = reinterpret_cast(proc_ptr); - auto dst_data = reinterpret_cast(out_ptr); - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b * src_stride0 + j * blk_size; - auto dst_off = b * dst_stride0 + j; - for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) { - for (size_t k = 0; k < blk_size; k++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - src_off += (stride1 - 1) * blk_size; - } - size_t tail = DIM1 % blk_size; - for (size_t k = 0; k < tail; k++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - }); - } else { - auto src_data = reinterpret_cast(proc_ptr); - auto dst_data = reinterpret_cast(out_ptr); - parallel_for2d(DIM0, stride1, [&](size_t b, 
size_t j) { - auto src_off = b * src_stride0 + j * blk_size; - auto dst_off = b * dst_stride0 + j; - for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) { - for (size_t k = 0; k < blk_size; k++) { - dst_data[dst_off] = src_data[src_off]; - src_off++; - dst_off += stride1; - } - src_off += (stride1 - 1) * blk_size; - } - size_t tail = DIM1 % blk_size; - for (size_t k = 0; k < tail; k++) { + auto src_data = reinterpret_cast(proc_ptr); + auto dst_data = reinterpret_cast(out_ptr); + parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { + auto src_off = b * src_stride0 + j * blk_size; + auto dst_off = b * dst_stride0 + j; + for (size_t dim1 = 0; dim1 + blk_size <= DIM1; dim1 += blk_size) { + for (size_t k = 0; k < blk_size; k++) { dst_data[dst_off] = src_data[src_off]; src_off++; dst_off += stride1; } - }); - } + src_off += (stride1 - 1) * blk_size; + } + size_t tail = DIM1 % blk_size; + for (size_t k = 0; k < tail; k++) { + dst_data[dst_off] = src_data[src_off]; + src_off++; + dst_off += stride1; + } + }); } inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) { @@ -2853,7 +1166,13 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) { break; case Algorithm::ReduceAnd: case Algorithm::ReduceProd: - if (output_prec == Precision::FP32) { + if (output_prec == Precision::FP64) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } else if (output_prec == Precision::I64) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } else if (output_prec == Precision::FP32) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); } else if (output_prec == Precision::I32) { @@ -2871,7 +1190,13 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) { } break; case Algorithm::ReduceMax: - if 
(output_prec == Precision::FP32) { + if (output_prec == Precision::FP64) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::lowest(); }); + } else if (output_prec == Precision::I64) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::min(); }); + } else if (output_prec == Precision::FP32) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::lowest(); }); } else if (output_prec == Precision::I32) { @@ -2889,7 +1214,13 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) { } break; case Algorithm::ReduceMin: - if (output_prec == Precision::FP32) { + if (output_prec == Precision::FP64) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } else if (output_prec == Precision::I64) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } else if (output_prec == Precision::FP32) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); } else if (output_prec == Precision::I32) { @@ -2907,15 +1238,16 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) { } break; default: - IE_THROW() << errorPrefix << " gets unsupported reduce mode."; + THROW_CPU_NODE_ERR << " gets unsupported reduce mode."; } } inline void Reduce::create_hybrid_working_memory() { auto rank = getInputShapeAtPort(REDUCE_DATA).getRank(); - memory::format_tag format = (layout == ReduceLayoutType::reduce_nspc) ? (rank == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc) - : (rank == 4 ? (mayiuse(cpu::x64::avx512_core) ? 
memory::format_tag::nChw16c : memory::format_tag::nChw8c) - : (mayiuse(cpu::x64::avx512_core) ? memory::format_tag::nCdhw16c : memory::format_tag::nCdhw8c)); + dnnl::memory::format_tag format = + (layout == ReduceLayoutType::reduce_nspc) ? (rank == 4 ? dnnl::memory::format_tag::nhwc : dnnl::memory::format_tag::ndhwc) + : (rank == 4 ? (x64::mayiuse(x64::avx512_core) ? dnnl::memory::format_tag::nChw16c : dnnl::memory::format_tag::nChw8c) + : (x64::mayiuse(x64::avx512_core) ? dnnl::memory::format_tag::nCdhw16c : dnnl::memory::format_tag::nCdhw8c)); auto prc_dims = rank == 4 ? std::vector{OB, OC, OH, OW} : std::vector{OB, OC, OD, OH, OW}; auto desc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(prc_dims), DnnlExtensionUtils::IEPrecisionToDataType(output_prec), format); prc_mem = dnnl::memory(desc, getEngine()); @@ -2956,7 +1288,7 @@ inline void Reduce::create_opt_working_memory() { } } -inline void Reduce::calc_process_dst_dims(std::vector &reduce_axes, const SizeVector &dst_dims) { +inline void Reduce::calc_process_dst_dims(std::vector &reduce_axes, const SizeVector &dst_dims) { std::set axes; SizeVector out_dims; process_dst_dims.clear(); @@ -2965,7 +1297,7 @@ inline void Reduce::calc_process_dst_dims(std::vector &reduce_axes, const S if (axis < 0) axis += src_dims.size(); if (static_cast(axis) > src_dims.size()) - IE_THROW() << errorPrefix << " exceeds data tensor dimension on index to reduce"; + THROW_CPU_NODE_ERR << " exceeds data tensor dimension on index to reduce"; axes.insert(static_cast(axis)); } for (size_t i = 0; i < src_dims.size(); i++) { @@ -2988,11 +1320,11 @@ inline void Reduce::calc_process_dst_dims(std::vector &reduce_axes, const S if (jit_mode && jit_beyond_5D) { if (std::accumulate(out_dims.begin(), out_dims.end(), size_t(1), std::multiplies()) != std::accumulate(dst_dims.begin(), dst_dims.end(), size_t(1), std::multiplies())) - IE_THROW() << errorPrefix << "gets incorrect number of output dimensions!"; + THROW_CPU_NODE_ERR << "gets 
incorrect number of output dimensions!"; } else { for (size_t i = 0; i < std::min(out_dims.size(), dst_dims.size()); i++) { if (out_dims[i] != dst_dims[i]) - IE_THROW() << errorPrefix << "gets incorrect number of output dimensions!"; + THROW_CPU_NODE_ERR << "gets incorrect number of output dimensions!"; } } } @@ -3098,8 +1430,8 @@ inline void Reduce::reduce_ref(const float *in_ptr, float *out_ptr) { case Algorithm::ReduceSumSquare: reduce_ref_process(in_ptr, out_ptr, 0, [](float old, float y)->float { return old + y * y; }); break; - default: - IE_THROW() << errorPrefix << "gets unsupported reduce mode."; + default: + THROW_CPU_NODE_ERR << "gets unsupported reduce mode."; } } @@ -3186,7 +1518,7 @@ inline void Reduce::reduce_ref_map(float *out_ptr, size_t work_amount_dst, size_ }); break; default: - IE_THROW() << errorPrefix << "gets unsupported reduce mode."; + THROW_CPU_NODE_ERR << "gets unsupported reduce mode."; } } @@ -3233,8 +1565,8 @@ void Reduce::setJITBeyond5D() { } } -std::vector Reduce::update_src_dims() { - std::vector reduce_axes = raw_axes; +std::vector Reduce::update_src_dims() { + std::vector reduce_axes = raw_axes; if (reduce_axes.size() < 1) return reduce_axes; @@ -3267,6 +1599,7 @@ std::vector Reduce::update_src_dims() { bool Reduce::canApplyJIT(const Precision &input_prec, const Precision &output_prec) const { static const Precision supportedPrecisions[] = { + Precision::I64, Precision::FP32, Precision::BF16, Precision::I32, @@ -3274,7 +1607,7 @@ bool Reduce::canApplyJIT(const Precision &input_prec, const Precision &output_pr Precision::U8 }; - return (mayiuse(cpu::x64::sse41)) && (getInputShapeAtPort(REDUCE_DATA).getRank() <= 5 || jit_beyond_5D) && + return (x64::mayiuse(x64::sse41)) && (getInputShapeAtPort(REDUCE_DATA).getRank() <= 5 || jit_beyond_5D) && std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), input_prec) != std::end(supportedPrecisions) && std::find(std::begin(supportedPrecisions), 
std::end(supportedPrecisions), output_prec) != std::end(supportedPrecisions); } @@ -3297,19 +1630,19 @@ int Reduce::getFusingAxis() const { } bool Reduce::canFuse(const NodePtr& node) const { - Precision input_prec = getOriginalInputPrecisionAtPort(REDUCE_DATA); - Precision output_prec = getOriginalOutputPrecisionAtPort(0); + const auto& input_prec = getOriginalInputPrecisionAtPort(REDUCE_DATA); + const auto& output_prec = getOriginalOutputPrecisionAtPort(0); if (!canApplyJIT(input_prec, output_prec) || jit_beyond_5D || algorithm == Algorithm::ReduceAnd || algorithm == Algorithm::ReduceOr) { return false; } + if (one_of(8, input_prec.size(), output_prec.size())) { + return false; + } + return canFuseSimpleOperation(node); } bool Reduce::created() const { return getType() == Type::Reduce; } - -} // namespace node -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h index 2f07cb196a7dfe..067ef41de7ffcb 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.h +++ b/src/plugins/intel_cpu/src/nodes/reduce.h @@ -4,92 +4,18 @@ #pragma once -#include #include -#include -#include -#include +#include "kernels/x64/reduce.hpp" + #include "executors/reduce_list.hpp" namespace ov { namespace intel_cpu { namespace node { -enum ReduceLayoutType { - reduce_ncsp, - reduce_nspc, - reduce_blocked -}; - -struct jit_reduce_config_params { - ReduceLayoutType layout; - Algorithm reduce_mode; - bool fuse_low_precision; - dnnl::memory::data_type src_dt; - dnnl::memory::data_type dst_dt; - int src_data_size; - int dst_data_size; -}; - -struct jit_reduce_call_args { - const void *src; - const int *idx; - void *dst; - size_t work_amount; - size_t work_batch; - size_t reduce_w = 2; // only used in planar layout [1: reduce width dimension] [0: reduce other dimension] [other value: N/A] - size_t reduce_stride; // only used in planar layout while reducing dimensions except for width - size_t can_divide; // if 
apply division in reduce_kernel [1: Yes] [0: No] - const float *divisor; // mean = sum / divisor -}; - -struct jit_reduce_post_call_args { - const void *src; - void *dst; - size_t work_amount; - size_t reduce_c = 2; // only used in blocked layout [1: reduce channel dimension] [0: reduce other dimension] [other value: N/A] - size_t oc_off; // offset in byte along channel on output tensor - size_t channel_size; // only for post ops fusion of nspc layout - const float *divisor; // mean = sum / divisor - const void** post_op_data; -}; - -struct jit_uni_reduce_kernel { - void (*ker_)(const jit_reduce_call_args *); - - void operator()(const jit_reduce_call_args *args) { - assert(ker_); - ker_(args); - } - - explicit jit_uni_reduce_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {} - virtual ~jit_uni_reduce_kernel() {} - - virtual void create_ker() = 0; - - jit_reduce_config_params jcp_; -}; - -struct jit_uni_reduce_post_kernel { - void (*ker_)(const jit_reduce_post_call_args *); - - void operator()(const jit_reduce_post_call_args *args) { - assert(ker_); - ker_(args); - } - - explicit jit_uni_reduce_post_kernel(jit_reduce_config_params jcp, const dnnl_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {} - virtual ~jit_uni_reduce_post_kernel() {} - - virtual void create_ker() = 0; - - jit_reduce_config_params jcp_; - const dnnl_primitive_attr &attr_; -}; - class Reduce : public Node { public: - Reduce(const std::shared_ptr& op, const GraphContext::CPtr context); + Reduce(const std::shared_ptr& op, const GraphContext::CPtr context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -105,7 +31,7 @@ class Reduce : public Node { } bool isExecutable() const override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: void reduce_type(const uint8_t 
*in_ptr, uint8_t *out_ptr); @@ -113,7 +39,7 @@ class Reduce : public Node { void reduce_BLK(const uint8_t *in_ptr, uint8_t *out_ptr); void reduce_BLK_concern_padding(const uint8_t *in_ptr, uint8_t *out_ptr); inline void reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, size_t work_amount, - size_t reduce_w = 2, size_t work_batch = 1, const int *tab_idx = NULL); + size_t reduce_w = 2, size_t work_batch = 1, const int *tab_idx = NULL); inline void reduce_kernel_post_process(uint8_t *out_ptr); inline void reduce_kernel_reassign(); inline void reduce_kernel_restore(); @@ -122,22 +48,24 @@ class Reduce : public Node { inline void init_dst_data(uint8_t *out_ptr, size_t dst_size); inline void create_hybrid_working_memory(); inline void create_opt_working_memory(); - inline void calc_process_dst_dims(std::vector &reduce_axes, const InferenceEngine::SizeVector &dst_dim); + inline void calc_process_dst_dims(std::vector &reduce_axes, const InferenceEngine::SizeVector &dst_dim); inline void set_reduce_dim_flags(); inline void reduce_ref(const float *in_ptr, float *out_ptr); void reduce_ref_process(const float *in_ptr, float *out_ptr, float init_value, std::function func); - void create_reduce_kernel(std::shared_ptr &kernel, const jit_reduce_config_params &jcp); + void create_reduce_kernel(std::shared_ptr> &kernel, const kernel::JitReduceConfigParams &jcp); inline void reduce_ref_map(float *out_ptr, size_t work_amount_dst, size_t reduced_dims_work_amount); + template void nspc2ncsp(uint8_t *proc_ptr, uint8_t *out_ptr); + template void blocked2ncsp(uint8_t *proc_ptr, uint8_t *out_ptr); void setPostOps(dnnl::primitive_attr &attr, const VectorDims &postOpDims, bool initWeights = false); void setJITBeyond5D(); - std::vector update_src_dims(); + std::vector update_src_dims(); bool canApplyJIT(const InferenceEngine::Precision &input_prec, const InferenceEngine::Precision &output_prec) const; size_t blk_size; - static const size_t REDUCE_DATA = 0; - static const size_t 
REDUCE_INDEXES = 1; + static constexpr size_t REDUCE_DATA = 0; + static constexpr size_t REDUCE_INDEXES = 1; bool jit_beyond_5D = false; bool jit_mode = true; bool keep_dims = true; @@ -161,16 +89,19 @@ class Reduce : public Node { size_t dst_size, prc_size, intermediate_size, tmp_size; size_t reduce_stride; uint8_t *tmp_ptr; - ReduceLayoutType layout; + kernel::ReduceLayoutType layout; InferenceEngine::Precision input_prec, output_prec, intermediate_prec, tmp_prec; InferenceEngine::SizeVector src_dims; InferenceEngine::SizeVector process_dst_dims; InferenceEngine::SizeVector axes_for_reduction; - std::vector raw_axes; + std::vector raw_axes; std::vector intermediate_buf; + float in_out_divisor_f32 = 1.f; + double in_out_divisor_f64 = 1.; + void* in_out_divisor; - jit_reduce_config_params jcp; - jit_reduce_config_params aux_jcp; + kernel::JitReduceConfigParams jcp; + kernel::JitReduceConfigParams aux_jcp; dnnl::primitive_attr attr; @@ -180,12 +111,12 @@ class Reduce : public Node { std::vector vec_reduceDH_prc; std::vector vec_reduceCDW_prc; - std::shared_ptr reduce_kernel; - std::shared_ptr reduce_aux_kernel; - std::shared_ptr reduce_tmp_kernel; - std::shared_ptr reduce_post_kernel; + std::shared_ptr> reduce_kernel; + std::shared_ptr> reduce_aux_kernel; + std::shared_ptr> reduce_tmp_kernel; + std::shared_ptr> reduce_post_kernel; - static const std::map& op, Reduce& node)>> initializers; + static const std::map& op, Reduce& node)>> initializers; std::string errorPrefix; diff --git a/src/plugins/intel_cpu/src/nodes/reference.cpp b/src/plugins/intel_cpu/src/nodes/reference.cpp index 490610b4f0c822..5fcf00ff196f27 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.cpp +++ b/src/plugins/intel_cpu/src/nodes/reference.cpp @@ -8,10 +8,9 @@ #include #include "openvino/runtime/tensor.hpp" #include "common/blocked_desc_creator.h" -#include +#include #include "common/cpu_memcpy.h" -using namespace dnnl; using namespace InferenceEngine; using namespace 
InferenceEngine::details; @@ -19,11 +18,11 @@ namespace ov { namespace intel_cpu { namespace node { -Reference::Reference(const std::shared_ptr& op, const GraphContext::CPtr context, +Reference::Reference(const std::shared_ptr& op, const GraphContext::CPtr& context, const std::string& errorMessage) : Node(op, context, NgraphShapeInferFactory(op, FULL_PORT_MASK)), ngraphOp(op), additionalErrorMessage(errorMessage) { if (!op->has_evaluate()) { - IE_THROW(NotImplemented) << "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented)"; + IE_THROW(NotImplemented) << "Cannot fallback on ngraph reference implementation (ov::Node::evaluate() is not implemented)"; } setType(Type::Reference); setTypeStr("Reference"); @@ -31,7 +30,7 @@ Reference::Reference(const std::shared_ptr& op, const GraphContext // RandomUniform should generate new sequence each run even if all inputs are constants. So that method Node::IsConstant() // doesn't return 'True' for RandomUniform with all constant inputs and the node generates new values for each inference, // we set 'NoConst' value for 'ConstantType' in ctor - if (ov::is_type(ngraphOp)) { + if (ov::is_type(ngraphOp)) { constant = ConstantType::NoConst; } } diff --git a/src/plugins/intel_cpu/src/nodes/reference.h b/src/plugins/intel_cpu/src/nodes/reference.h index 4c2a8a1310806f..59e1036617b9cc 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.h +++ b/src/plugins/intel_cpu/src/nodes/reference.h @@ -12,7 +12,7 @@ namespace node { class Reference : public Node { public: - Reference(const std::shared_ptr& op, const GraphContext::CPtr context, const std::string& errorMessage); + Reference(const std::shared_ptr& op, const GraphContext::CPtr& context, const std::string& errorMessage); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -29,7 +29,7 @@ class Reference : public Node { ov::TensorVector prepareOutputs() const; private: - const std::shared_ptr 
ngraphOp; + const std::shared_ptr ngraphOp; const std::string additionalErrorMessage; }; diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index f8a9de782c2c09..c802115aec45d6 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -29,12 +29,12 @@ bool Reorder::isExecutable() const { return Node::isExecutable() && !isOptimized; } -Reorder::Reorder(const std::shared_ptr& op, const GraphContext::CPtr context) : +Reorder::Reorder(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, PassThroughShapeInferFactory()) { IE_THROW() << "Can't create reorder node from ngraph node"; } -Reorder::Reorder(const std::string& name, const GraphContext::CPtr context) : +Reorder::Reorder(const std::string& name, const GraphContext::CPtr& context) : Node("Reorder", name, context) {} void Reorder::getSupportedDescriptors() { diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h index ef8e508fa08123..8fc7c71a27f6aa 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.h +++ b/src/plugins/intel_cpu/src/nodes/reorder.h @@ -17,8 +17,8 @@ namespace node { class Reorder : public Node { public: - Reorder(const std::shared_ptr& op, const GraphContext::CPtr context); - Reorder(const std::string& name, const GraphContext::CPtr context); + Reorder(const std::shared_ptr& op, const GraphContext::CPtr& context); + Reorder(const std::string& name, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; diff --git a/src/plugins/intel_cpu/src/nodes/reshape.cpp b/src/plugins/intel_cpu/src/nodes/reshape.cpp index 58b59b0dbfa2ab..8a64c34839ee02 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.cpp +++ b/src/plugins/intel_cpu/src/nodes/reshape.cpp @@ -3,30 +3,26 @@ // #include "reshape.h" -#include "utils.hpp" -#include -#include -#include -#include 
-#include -#include -#include -#include "utils/shape_inference/shape_inference_cpu.hpp" #include "common/cpu_memcpy.h" +#include +#include +#include +#include +#include -using namespace dnnl; using namespace InferenceEngine; namespace ov { namespace intel_cpu { namespace node { -bool Reshape::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Reshape::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!std::dynamic_pointer_cast(op) && - !std::dynamic_pointer_cast(op) && - !std::dynamic_pointer_cast(op)) { + if (!one_of(op->get_type_info(), + op::v1::Reshape::get_type_info_static(), + op::v0::Squeeze::get_type_info_static(), + op::v0::Unsqueeze::get_type_info_static())) { errorMessage = "Only opset1 Reshape, Squeeze, Unsqueeze operations are supported"; return false; } @@ -226,36 +222,49 @@ class ReshapeShapeInferFactory : public ShapeInferFactory { }; } // namespace -Reshape::Reshape(const std::shared_ptr& op, const GraphContext::CPtr context) : +Reshape::Reshape(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, ReshapeShapeInferFactory(op)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - errorPrefix = std::string(op->get_type_name()) + " node with name '" + getName() + "'"; - if (isDynamicNode()) { - auto checkSecondInput = [](const std::shared_ptr& op, const std::string opType) { + auto checkSecondInput = [](const std::shared_ptr& op, const std::string &opType) { if (op->get_input_partial_shape(1).is_dynamic()) { IE_THROW() << "CPU plug-in doesn't support " << opType << " node with non static second input"; } }; - if (std::dynamic_pointer_cast(op)) { - checkSecondInput(op, "Reshape"); - } else if (std::dynamic_pointer_cast(op)) { + if (op->get_type_info() == op::v1::Reshape::get_type_info_static()) { + checkSecondInput(op, getTypeStr()); + } else if (op->get_type_info() == 
op::v0::Squeeze::get_type_info_static()) { if (op->get_input_size() == 1) IE_THROW() << "CPU plug-in doesn't support Squeeze node with inputs num equal 1"; - checkSecondInput(op, "Squeeze"); - } else if (std::dynamic_pointer_cast(op)) { - checkSecondInput(op, "Unsqueeze"); + checkSecondInput(op, getTypeStr()); + } else if (op->get_type_info() == op::v0::Unsqueeze::get_type_info_static()) { + checkSecondInput(op, getTypeStr()); } else { IE_THROW() << "Unsupported operation type via reshape node"; } } } +template +bool Reshape::validateSecondInputValues(const void* inPtr) const { + const auto sndInput = reinterpret_cast(inPtr); + for (size_t i = 0; i < lastSecondInputValues.size(); i++) { + const auto inVal = static_cast(sndInput[i]); + if (lastSecondInputValues[i] != inVal) { + for (size_t i = 0; i < lastSecondInputValues.size(); i++) { + lastSecondInputValues[i] = inVal; + } + return true; + } + } + return false; +} + bool Reshape::needShapeInfer() const { if (inputShapesModified()) { return true; @@ -264,16 +273,12 @@ bool Reshape::needShapeInfer() const { if (lastSecondInputValues.empty()) { lastSecondInputValues.resize(mem.getStaticDims()[0], 0); } - const int32_t *sndInput = reinterpret_cast(mem.getData()); - for (size_t i = 0; i < lastSecondInputValues.size(); i++) { - if (lastSecondInputValues[i] != sndInput[i]) { - for (size_t i = 0; i < lastSecondInputValues.size(); i++) { - lastSecondInputValues[i] = sndInput[i]; - } - return true; - } + + switch (mem.getDesc().getPrecision()) { + case Precision::I64: return validateSecondInputValues(mem.getData()); + case Precision::I32: return validateSecondInputValues(mem.getData()); + default: THROW_CPU_NODE_ERR << "has unsupported second input data type."; } - return false; } void Reshape::getSupportedDescriptors() { @@ -287,9 +292,12 @@ void Reshape::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - InferenceEngine::Precision inPrec = getOriginalInputPrecisionAtPort(0); - 
InferenceEngine::Precision outPrec = getOriginalOutputPrecisionAtPort(0); - InferenceEngine::Precision secondInPrc = InferenceEngine::Precision::I32; + auto inPrec = getOriginalInputPrecisionAtPort(0); + Precision secondInPrc = Precision::I32; + if (getOriginalInputPrecisions().size() > 1) { + secondInPrc = getOriginalInputPrecisionAtPort(1); + } + const auto &outPrec = getOriginalOutputPrecisionAtPort(0); // Current reshape implementation is simple memory reinterpret, // same precision on input and output is required @@ -308,7 +316,7 @@ void Reshape::initSupportedPrimitiveDescriptors() { for (size_t i = 0; i < getParentEdges().size(); i++) { config.inConfs[i].inPlace(0 == i && canBeInPlace ? 0 : -1); config.inConfs[i].constant(false); - config.inConfs[i].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc((i > 0 ? secondInPrc : inPrec), getInputShapeAtPort(i))); + config.inConfs[i].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc((i == 0 ? inPrec : secondInPrc), getInputShapeAtPort(i))); } config.outConfs.resize(1); config.outConfs[0].inPlace(canBeInPlace ? 
0 : -1); diff --git a/src/plugins/intel_cpu/src/nodes/reshape.h b/src/plugins/intel_cpu/src/nodes/reshape.h index e62253e99fa8a3..125cd6cd5c661e 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.h +++ b/src/plugins/intel_cpu/src/nodes/reshape.h @@ -4,12 +4,7 @@ #pragma once -#include #include -#include -#include -#include -#include "input.h" namespace ov { namespace intel_cpu { @@ -17,7 +12,7 @@ namespace node { class Reshape : public Node { public: - Reshape(const std::shared_ptr& op, const GraphContext::CPtr context); + Reshape(const std::shared_ptr& op, const GraphContext::CPtr context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -29,12 +24,13 @@ class Reshape : public Node { void executeDynamicImpl(dnnl::stream strm) override; void execute(dnnl::stream strm) override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + template + bool validateSecondInputValues(const void* inPtr) const; -private: - mutable std::vector lastSecondInputValues; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - std::string errorPrefix; +private: + mutable std::vector lastSecondInputValues; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp index 004cccf763e90c..35c787981bd736 100644 --- a/src/plugins/intel_cpu/src/nodes/rnn.cpp +++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp @@ -17,8 +17,12 @@ #include "ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" - -#include +#include +#include +#include +#include +#include +#include #include #include @@ -388,7 +392,7 @@ RNN::RNN(const std::shared_ptr& op, const GraphContext::CPtr context) yIdx = 0; hoIdx = 1; coIdx = 2; } - auto rnnCellBase = std::dynamic_pointer_cast(op); + auto rnnCellBase = std::dynamic_pointer_cast(op); if (!rnnCellBase) THROW_ERROR << "does not have original layer for RNNCell."; diff --git 
a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp index 27796341f786b9..171cea2e4887be 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp @@ -3,16 +3,13 @@ // #include "scatter_update.h" -#include -#include -#include + #include #include "ie_parallel.hpp" -#include #include "common/cpu_memcpy.h" - -#include -#include +#include +#include +#include using namespace dnnl; using namespace InferenceEngine; @@ -21,11 +18,11 @@ namespace ov { namespace intel_cpu { namespace node { -bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto scatterElemUpd = ngraph::as_type_ptr(op); - auto scatterUpd = ngraph::as_type_ptr(op); - auto scatterNdUpd = ngraph::as_type_ptr(op); + auto scatterElemUpd = ov::as_type_ptr(op); + auto scatterUpd = ov::as_type_ptr(op); + auto scatterNdUpd = ov::as_type_ptr(op); if (!scatterElemUpd && !scatterUpd && !scatterNdUpd) { const std::string opType = op->get_type_name(); errorMessage = "Only opset" + opType == "ScatterNDUpdate" ? 
"4 " : "3 " + opType + " operation is supported"; @@ -41,18 +38,17 @@ bool ScatterUpdate::isExecutable() const { return !isInputTensorAtPortEmpty(DATA_ID); } -ScatterUpdate::ScatterUpdate(const std::shared_ptr& op, const GraphContext::CPtr context) +ScatterUpdate::ScatterUpdate(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, EMPTY_PORT_MASK)), dataSize(0lu), indicesSize(0lu), axisSize(0lu), dataPrec(Precision::UNSPECIFIED), indicesPrec(Precision::UNSPECIFIED), axisPrec(Precision::UNSPECIFIED) { std::string errorMessage; - if (isSupportedOperation(op, errorMessage)) { - errorPrefix = std::string(op->get_type_name()) + " node with name '" + getName() + "'"; - } else { + if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } + errorPrefix = std::string(op->get_type_name()) + " node with name '" + getName() + "'"; } void ScatterUpdate::getSupportedDescriptors() { @@ -291,7 +287,7 @@ void ScatterUpdate::execute(dnnl::stream strm) { size_t start = 0, end = 0; splitter(indicesBlockND[0], nthr, ithr, start, end); for (size_t i = start; i < end; i++) { - int64_t idxValue = getIndicesValue(indicesPtr, i); + int64_t idxValue = getIndicesValue(indicesPtr, i); if (idxValue >= static_cast(srcDimAxis) || idxValue < 0) { IE_THROW() << errorPrefix << " have indices value that points to non-existing output tensor element"; diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.h b/src/plugins/intel_cpu/src/nodes/scatter_update.h index 835077b89778b5..512110ef2dc646 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.h +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.h @@ -22,7 +22,7 @@ enum class ScatterUpdateMode { class ScatterUpdate : public Node { public: - ScatterUpdate(const std::shared_ptr& op, const GraphContext::CPtr context); + ScatterUpdate(const std::shared_ptr& op, const GraphContext::CPtr context); void getSupportedDescriptors() override; void 
initSupportedPrimitiveDescriptors() override; @@ -36,7 +36,7 @@ class ScatterUpdate : public Node { void executeDynamicImpl(dnnl::stream strm) override; bool isExecutable() const override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: void scatterUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, int axis, uint8_t *dstDataPtr); diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.cpp b/src/plugins/intel_cpu/src/nodes/shapeof.cpp index e3b9a8bcc81640..b0d6c36c5f6295 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.cpp +++ b/src/plugins/intel_cpu/src/nodes/shapeof.cpp @@ -41,11 +41,11 @@ class ShapeOfShapeInferFactory : public ShapeInferFactory { }; } // namespace -bool ShapeOf::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ShapeOf::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (!one_of(op->get_type_info(), - ngraph::op::v0::ShapeOf::get_type_info_static(), - ngraph::op::v3::ShapeOf::get_type_info_static())) { + ov::op::v0::ShapeOf::get_type_info_static(), + ov::op::v3::ShapeOf::get_type_info_static())) { errorMessage = "Node is not an instance of ShapeOf form the operation set v1 or v3."; return false; } @@ -55,35 +55,38 @@ bool ShapeOf::isSupportedOperation(const std::shared_ptr& op return true; } -ShapeOf::ShapeOf(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, ShapeOfShapeInferFactory()) { +ShapeOf::ShapeOf(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, ShapeOfShapeInferFactory()) { std::string errorMessage; - if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "ShapeOf layer with name '" + getName() + "' "; - if (op->get_input_partial_shape(0).size() == 0) - IE_THROW() << errorPrefix << "gets unsupported input 0D tensor 
(scalar)"; - } else { + if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } + if (op->get_input_partial_shape(0).size() == 0) { + THROW_CPU_NODE_ERR << "gets unsupported input 0D tensor (scalar)"; + } } void ShapeOf::getSupportedDescriptors() { if (getParentEdges().size() != 1) - IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); + THROW_CPU_NODE_ERR << "has incorrect number of input edges: " << getParentEdges().size(); if (getChildEdges().empty()) - IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getChildEdges().size(); + THROW_CPU_NODE_ERR << "has incorrect number of output edges: " << getChildEdges().size(); } void ShapeOf::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - Precision precision = getOriginalInputPrecisionAtPort(0); + const auto inPrc = getOriginalInputPrecisionAtPort(0); + const auto &outPrc = getOriginalOutputPrecisionAtPort(0); + if (!one_of(outPrc, Precision::I32, Precision::I64)) { + THROW_CPU_NODE_ERR << "has unsupported output precision: " << outPrc; + } const LayoutType dataFormats[4] = { LayoutType::ncsp, LayoutType::nspc, LayoutType::nCsp16c, LayoutType::nCsp8c }; for (const auto &df : dataFormats) { - addSupportedPrimDesc({{df, precision}}, - {{LayoutType::ncsp, Precision::I32}}, + addSupportedPrimDesc({{df, inPrc}}, + {{LayoutType::ncsp, outPrc}}, impl_desc_type::ref); } } @@ -96,14 +99,22 @@ void ShapeOf::execute(dnnl::stream strm) { auto inPtr = getParentEdgeAt(0)->getMemoryPtr(); auto outPtr = getChildEdgeAt(0)->getMemoryPtr(); auto inDims = inPtr->getStaticDims(); - size_t dimsCount = inDims.size(); - if (outPtr->getStaticDims().size() != 1 || dimsCount != outPtr->getStaticDims()[0]) - IE_THROW() << errorPrefix << "has inconsistent input shape and output size"; - - auto *dst = reinterpret_cast(getChildEdgeAt(0)->getMemoryPtr()->getData()); + const size_t dimsCount = inDims.size(); + 
if (outPtr->getStaticDims().size() != 1 || dimsCount != outPtr->getStaticDims()[0]) { + THROW_CPU_NODE_ERR << "has inconsistent input shape and output size"; + } - for (size_t i = 0; i < dimsCount; i++) { - dst[i] = inDims[i]; + const auto execPrc = outPtr->getDesc().getPrecision(); + if (execPrc == Precision::I64) { + auto dstData = reinterpret_cast(outPtr->getData()); + for (size_t i = 0; i < dimsCount; i++) { + dstData[i] = inDims[i]; + } + } else if (execPrc == Precision::I32) { + auto dstData = reinterpret_cast(outPtr->getData()); + for (size_t i = 0; i < dimsCount; i++) { + dstData[i] = inDims[i]; + } } } diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.h b/src/plugins/intel_cpu/src/nodes/shapeof.h index e313d3449e5f98..a200ed80fb5a83 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.h +++ b/src/plugins/intel_cpu/src/nodes/shapeof.h @@ -16,7 +16,7 @@ namespace node { class ShapeOf : public Node { public: - ShapeOf(const std::shared_ptr& op, const GraphContext::CPtr context); + ShapeOf(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -28,9 +28,6 @@ class ShapeOf : public Node { bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - -private: - std::string errorPrefix; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/shuffle_channels.cpp b/src/plugins/intel_cpu/src/nodes/shuffle_channels.cpp index 84a74766875f63..1cb34dfc5e0581 100644 --- a/src/plugins/intel_cpu/src/nodes/shuffle_channels.cpp +++ b/src/plugins/intel_cpu/src/nodes/shuffle_channels.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #define THROW_SHCH_ERROR IE_THROW() << "ShuffleChannels layer with name '" << getName() << "' " @@ -54,7 +55,7 @@ bool ShuffleChannels::ShuffleChannelsAttributes::operator==(const ShuffleChannel bool ShuffleChannels::isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto shuffleChannels = ov::as_type_ptr(op); + auto shuffleChannels = ov::as_type_ptr(op); if (!shuffleChannels) { errorMessage = "Only opset1 ShuffleChannels operation is supported"; return false; @@ -75,7 +76,7 @@ ShuffleChannels::ShuffleChannels(const std::shared_ptr& op, const if (inputShapes.size() != 1 || outputShapes.size() != 1) THROW_SHCH_ERROR << "has incorrect number of input/output edges."; - auto shuffleChannels = ov::as_type_ptr(op); + auto shuffleChannels = ov::as_type_ptr(op); attrs.group = shuffleChannels->get_group(); attrs.axis = shuffleChannels->get_axis(); attrs.dataRank = getInputShapeAtPort(0).getRank(); diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp index 710abbfffba80f..55c419d25d2152 100644 --- a/src/plugins/intel_cpu/src/nodes/split.cpp +++ b/src/plugins/intel_cpu/src/nodes/split.cpp @@ -3,19 +3,14 @@ // #include "split.h" + #include "common/cpu_memcpy.h" -#include "common/blocked_desc_creator.h" -#include -#include -#include #include -#include "utils/general_utils.h" -#include -#include "utils/ngraph_utils.hpp" +#include +#include +#include #include -#define THROW_ERROR IE_THROW() << "Split layer with name '" << getName() <<"' " - using namespace dnnl; using namespace InferenceEngine; @@ -23,13 +18,13 @@ namespace ov { namespace intel_cpu { namespace node { -bool Split::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Split::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), ngraph::op::v1::Split::get_type_info_static(), ngraph::op::v1::VariadicSplit::get_type_info_static())) { + if (!one_of(op->get_type_info(), op::v1::Split::get_type_info_static(), op::v1::VariadicSplit::get_type_info_static())) { errorMessage = "Only opset1 Split and VariadicSplit operations are supported"; return false; } - auto 
axisOp = ngraph::as_type_ptr(op->get_input_node_shared_ptr(1)); + auto axisOp = ov::as_type_ptr(op->get_input_node_shared_ptr(1)); if (!axisOp) { errorMessage = "Constant expected as the axis input."; return false; @@ -44,31 +39,36 @@ bool Split::isSupportedOperation(const std::shared_ptr& op, return true; } -Split::Split(const std::shared_ptr& op, const GraphContext::CPtr context) : +Split::Split(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, NgraphShapeInferFactory(op, PortMask(1, 2))) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - if (ngraph::as_type_ptr(op)) { + if (ov::as_type_ptr(op)) { INPUTS_NUM = 2; - } else if (ngraph::as_type_ptr(op)) { + } else if (ov::as_type_ptr(op)) { INPUTS_NUM = 3; - if (!ngraph::is_type(op->get_input_node_shared_ptr(2))) { + if (!ov::is_type(op->get_input_node_shared_ptr(2))) { this->splitLengths.resize(op->get_input_shape(2)[0]); this->constSplitLengths = false; } } const auto inRank = getInputShapeAtPort(0).getRank(); - auto axisOp = ngraph::as_type_ptr(op->get_input_node_shared_ptr(1)); - auto axis = axisOp->cast_vector()[0]; + auto axisOp = ov::as_type_ptr(op->get_input_node_shared_ptr(1)); + int64_t axis; + if (axisOp->get_element_type() == ov::element::i64) { + axis = axisOp->get_data_ptr()[0]; + } else { + axis = axisOp->cast_vector()[0]; + } if (axis < 0) { axis += inRank; } if (axis >= static_cast(inRank)) { - THROW_ERROR << "Split node with name '" << op->get_friendly_name() << "' has invalid value of axis parameter: " << axis; + THROW_CPU_NODE_ERR << "' has invalid value of axis parameter: " << axis; } this->axis = axis; } @@ -82,24 +82,24 @@ void Split::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const auto &srcShape = getInputShapeAtPort(0); - const auto &dstFirstDims = getOutputShapeAtPort(0).getDims(); + const auto& srcShape = getInputShapeAtPort(0); + const 
auto& dstFirstDims = getOutputShapeAtPort(0).getDims(); for (size_t i = 0; i < outputShapes.size(); i++) { const auto &o_Dims = outputShapes[i].getDims(); if (dstFirstDims.size() != o_Dims.size()) { - THROW_ERROR << "only supports output blobs with equal number of dimensions"; + THROW_CPU_NODE_ERR << "only supports output blobs with equal number of dimensions"; } for (size_t j = 0; j < dstFirstDims.size(); j++) { if (j == axis) continue; if (!dimsEqualWeak(o_Dims[j], dstFirstDims[j])) - THROW_ERROR << "has incorrect output dimensions"; + THROW_CPU_NODE_ERR << "has incorrect output dimensions"; } } - InferenceEngine::Precision inpPrecision = getOriginalInputPrecisionAtPort(0); - const auto axisPrecision = Precision::I32; + const auto& inpPrecision = getOriginalInputPrecisionAtPort(0); + const auto& axisPrecision = getOriginalInputPrecisionAtPort(1); // Set plain and tailC formats std::vector tdCreatorTypes{ LayoutType::ncsp, LayoutType::nspc }; @@ -233,7 +233,7 @@ bool Split::needPrepareParams() const { void Split::prepareParams() { const auto &srcMemPtr = getParentEdgesAtPort(0)[0]->getMemoryPtr(); if (!srcMemPtr || !srcMemPtr->isAllocated()) { - THROW_ERROR << "has not allocated input memory"; + THROW_CPU_NODE_ERR << "has not allocated input memory"; } if (!constSplitLengths) { @@ -248,7 +248,7 @@ void Split::prepareParams() { for (size_t port = 0; port < outputShapes.size(); ++port) { const auto &outMemPtr = this->getChildEdgesAtPort(port)[0]->getMemoryPtr(); if (!outMemPtr || !outMemPtr->isAllocated()) { - THROW_ERROR << "has not allocated destination memory"; + THROW_CPU_NODE_ERR << "has not allocated destination memory"; } if (outMemPtr->getShape().hasZeroDims()) { @@ -278,7 +278,7 @@ void Split::execute(dnnl::stream strm) { } if (dstMemPtrs.empty()) - THROW_ERROR << "Output data pointers have not been initialized."; + THROW_CPU_NODE_ERR << "Output data pointers have not been initialized."; const auto &srcMem = getParentEdgesAtPort(0)[0]->getMemory(); @@ 
-300,7 +300,7 @@ void Split::initOptimalPrimitiveDescriptor() { Node::initOptimalPrimitiveDescriptor(); auto selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) - THROW_ERROR << "Preferable primitive descriptor is not set."; + THROW_CPU_NODE_ERR << "Preferable primitive descriptor is not set."; auto config = selected_pd->getConfig(); canUseOptimizedNspc2Ncsp = false; @@ -462,7 +462,7 @@ std::vector Split::getRawDstMemPtrs() const { for (size_t i = 0; i < dstMemPtrs.size(); ++i) { result[i] = reinterpret_cast(dstMemPtrs[i].second->getData()); if (!result[i]) { - THROW_ERROR << "can't get child edge indx " << dstMemPtrs[i].first << " data."; + THROW_CPU_NODE_ERR << "can't get child edge indx " << dstMemPtrs[i].first << " data."; } } return result; diff --git a/src/plugins/intel_cpu/src/nodes/split.h b/src/plugins/intel_cpu/src/nodes/split.h index 5402d748832d7d..df3dbcc2f16ac9 100644 --- a/src/plugins/intel_cpu/src/nodes/split.h +++ b/src/plugins/intel_cpu/src/nodes/split.h @@ -4,9 +4,7 @@ #pragma once -#include #include -#include namespace ov { namespace intel_cpu { @@ -14,7 +12,7 @@ namespace node { class Split : public Node { public: - Split(const std::shared_ptr& op, const GraphContext::CPtr context); + Split(const std::shared_ptr& op, const GraphContext::CPtr& context); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void getSupportedDescriptors() override; diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp index aa4dae10df7d86..5e996dc88a2905 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp @@ -6,8 +6,8 @@ #include "ie_parallel.hpp" #include "common/cpu_memcpy.h" -#include "input.h" -#include +#include +#include #include #include "slice_shape_inference_utils.hpp" @@ -139,7 +139,7 @@ class StridedSliceShapeInferFactory : public ShapeInferFactory { } // 
namespace -StridedSlice::StridedSlice(const std::shared_ptr& op, const GraphContext::CPtr context) : +StridedSlice::StridedSlice(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, StridedSliceShapeInferFactory(op)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { @@ -151,10 +151,10 @@ StridedSlice::StridedSlice(const std::shared_ptr& op, const GraphConte if ((attrs.isStridedSliceOp && (inputShapes.size() < 3 || inputShapes.size() > 4)) || (!attrs.isStridedSliceOp && (inputShapes.size() < 4 || inputShapes.size() > 5))) { - IE_THROW() << errorPrefix << "has incorrect number of input edges"; + THROW_CPU_NODE_ERR << "has incorrect number of input edges"; } if (outputShapes.size() != 1) { - IE_THROW() << errorPrefix << "has incorrect number of output edges"; + THROW_CPU_NODE_ERR << "has incorrect number of output edges"; } if (inputShapes.size() > STRIDE_ID) { @@ -229,7 +229,7 @@ StridedSlice::StridedSlice(const std::shared_ptr& op, const GraphConte attrs.ellipsisPos1 = attrs.ellipsisMask[i] == 1 && attrs.ellipsisPos1 == -1 ? i : attrs.ellipsisPos1; } if (attrs.ellipsisMaskCounter > 1) - IE_THROW() << errorPrefix << "has incorrect 'Ellipsis_mask'. Only one non-zero bit is allowed"; + THROW_CPU_NODE_ERR << "has incorrect 'Ellipsis_mask'. 
Only one non-zero bit is allowed"; int newAxis = std::accumulate(attrs.newAxisMask.begin(), attrs.newAxisMask.end(), 0); int shrinkAxis = std::accumulate(attrs.shrinkAxisMask.begin(), attrs.shrinkAxisMask.end(), 0); @@ -242,7 +242,7 @@ StridedSlice::StridedSlice(const std::shared_ptr& op, const GraphConte if (!isConstantInput[type]) return; - const auto constNode = ov::as_type_ptr(op->get_input_node_shared_ptr(type)); + const auto constNode = ov::as_type_ptr(op->get_input_node_shared_ptr(type)); parameter = constNode->cast_vector(); auto size = constNode->get_shape()[0]; @@ -314,7 +314,7 @@ void StridedSlice::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const InferenceEngine::Precision dataPrecision = getOriginalInputPrecisionAtPort(DATA_ID); + const auto &dataPrecision = getOriginalInputPrecisionAtPort(DATA_ID); const InferenceEngine::Precision iPrecision = Precision::I32; attrs.dataSize = dataPrecision.size(); @@ -420,7 +420,7 @@ bool StridedSlice::needShapeInfer() const { void StridedSlice::execute(dnnl::stream strm) { if (!execPtr) - IE_THROW() << errorPrefix << "doesn't have compiled executor!"; + THROW_CPU_NODE_ERR << "doesn't have compiled executor!"; execPtr->exec(srcMemory, dstMemory); } diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.h b/src/plugins/intel_cpu/src/nodes/strided_slice.h index 3a8338c97e23a0..732f04b229d110 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.h +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.h @@ -14,7 +14,7 @@ namespace node { class StridedSlice : public Node { public: - StridedSlice(const std::shared_ptr& op, const GraphContext::CPtr context); + StridedSlice(const std::shared_ptr& op, const GraphContext::CPtr& context); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void getSupportedDescriptors() override; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp 
b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index c6e8f4c03161d2..ea4041b25618ec 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -6,19 +6,8 @@ #include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include +#include +#include #include #include "snippets/pass/matmul_to_brgemm.hpp" @@ -34,11 +23,13 @@ #include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp" #include "transformations/defs.hpp" +#include +#include +#include +#include + using namespace InferenceEngine; -using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu; -using namespace dnnl::impl::cpu::x64; -using namespace Xbyak; namespace ov { namespace intel_cpu { @@ -78,8 +69,7 @@ class SnippetShapeInferFactory : public ShapeInferFactory { Snippet::Snippet(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, SnippetShapeInferFactory(this)) { - host_isa = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ? - dnnl::impl::cpu::x64::avx512_core : dnnl::impl::cpu::x64::avx2; + host_isa = x64::mayiuse(x64::avx512_core) ? 
x64::avx512_core : x64::avx2; original_snippet = ov::as_type_ptr(op); if (!original_snippet) { IE_THROW(NotImplemented) << "Node is not an instance of snippets::op::Subgraph"; @@ -109,7 +99,8 @@ void Snippet::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const std::set supportedPrecisions = { Precision::FP32, Precision::I32, Precision::BF16, Precision::FP16, Precision::I8, Precision::U8 }; + const std::set supportedPrecisions = + { Precision::I64, Precision::FP32, Precision::I32, Precision::BF16, Precision::FP16, Precision::I8, Precision::U8 }; bool dimRanksAreEqual = true; for (size_t i = 0; dimRanksAreEqual && i < inputShapes.size(); i++) { @@ -157,7 +148,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { return std::make_shared(prc, shape, blocks, order, offset); } else if (lt == Blocked && shape.getRank() != 1 && (shape.getMinDims()[1] != Shape::UNDEFINED_DIM && shape.getMinDims()[1] > 1)) { - size_t blockSize = mayiuse(dnnl::impl::cpu::x64::avx512_core) ? 16 : 8; + size_t blockSize = x64::mayiuse(x64::avx512_core) ? 
16 : 8; VectorDims blocks = dims; VectorDims order(blocks.size()); @@ -188,7 +179,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { static_cast(InferenceEngine::Precision::BF16) : originalInputPrecision; if (supportedPrecisions.count(precision) == 0) - IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision."; + THROW_CPU_NODE_ERR << " doesn't support " << precision << " precision."; const auto equalPrecisions = getOriginalOutputPrecisions().size() == 1 && precision == getOriginalOutputPrecisionAtPort(0); @@ -207,7 +198,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { for (size_t i = 0; i < outputShapes.size(); i++) { auto precision = getOriginalOutputPrecisionAtPort(i); if (supportedPrecisions.count(precision) == 0) - IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision."; + THROW_CPU_NODE_ERR << " doesn't support " << precision << " precision."; BlockedMemoryDesc::CmpMask outputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; PortConfig portConfig; @@ -221,9 +212,9 @@ void Snippet::initSupportedPrimitiveDescriptors() { } impl_desc_type impl_type = impl_desc_type::unknown; - if (mayiuse(x64::avx512_core)) { + if (x64::mayiuse(x64::avx512_core)) { impl_type = impl_desc_type::jit_avx512; - } else if (mayiuse(x64::avx2)) { + } else if (x64::mayiuse(x64::avx2)) { impl_type = impl_desc_type::jit_avx2; } return {config, impl_type}; @@ -239,8 +230,9 @@ void Snippet::initSupportedPrimitiveDescriptors() { void Snippet::selectOptimalPrimitiveDescriptor() { selectPreferPrimitiveDescriptor(getImplPriority(), true); } -InferenceEngine::Precision Snippet::getRuntimePrecision() const { - std::vector inputPrecisions; + +Precision Snippet::getRuntimePrecision() const { + std::vector inputPrecisions; for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && 
!parentEdge->getParent()->isConstant()) { @@ -327,7 +319,7 @@ ov::PartialShape Snippet::canonicalizeBody() { dims.emplace_back(d == Shape::UNDEFINED_DIM ? -1 : d); ov::PartialShape shape(dims); ov::AxisVector blocking(blockedDesc->getOrder()); - ov::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); + ov::element::Type precision = details::convertPrecision(blockedDesc->getPrecision()); return snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; }; inputShapeIsBlocked.resize(inputShapes.size(), false); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 435b709b492f74..7486c1bf63d082 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -1,17 +1,12 @@ -// Copyright (C) 2020-2022 Intel Corporation +// Copyright (C) 2020-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include - -#include -#include -#include "emitters/x64/jit_snippets_emitters.hpp" - #include #include "snippets/op/subgraph.hpp" +#include "emitters/x64/jit_snippets_emitters.hpp" #include @@ -24,7 +19,7 @@ namespace node { /// precision: fp32 class Snippet : public Node { public: - Snippet(const std::shared_ptr& op, const GraphContext::CPtr context); + Snippet(const std::shared_ptr& op, const GraphContext::CPtr context); ~Snippet() override = default; void getSupportedDescriptors() override {}; diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp index be139aaf8d0c75..9f21ce7a343083 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp @@ -164,7 +164,7 @@ class IterCountPortHelper : public PortMapHelper { public: IterCountPortHelper(const MemoryPtr &to, const dnnl::engine& eng) { // Only scalar I32 tensor is supported - IE_ASSERT(to->getDataType() == memory::data_type::s32); + 
// IE_ASSERT(to->getDataType() == memory::data_type::s32); IE_ASSERT(to->getShape() == Shape(VectorDims{1})); mem_holder_dst = to->getPrimitive(); } diff --git a/src/plugins/intel_cpu/src/nodes/tile.cpp b/src/plugins/intel_cpu/src/nodes/tile.cpp index 05392e7f1506fd..bc6f57d8c15e2e 100644 --- a/src/plugins/intel_cpu/src/nodes/tile.cpp +++ b/src/plugins/intel_cpu/src/nodes/tile.cpp @@ -4,6 +4,8 @@ #include "tile.h" #include "common/cpu_memcpy.h" +#include +#include using namespace InferenceEngine; @@ -13,7 +15,7 @@ namespace node { bool Tile::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!ov::is_type(op)) { + if (!ov::is_type(op)) { errorMessage = "Only opset1 Tile operation is supported."; return false; } @@ -22,7 +24,7 @@ bool Tile::isSupportedOperation(const std::shared_ptr& op, std:: return false; } if (!isDynamicNgraphNode(op) && - !ov::is_type(op->get_input_node_ptr(TILE_REPEATS))) { + !ov::is_type(op->get_input_node_ptr(TILE_REPEATS))) { errorMessage = "Only constant 'Repeats' input is supported with static shapes."; return false; } @@ -32,18 +34,16 @@ bool Tile::isSupportedOperation(const std::shared_ptr& op, std:: return true; } -Tile::Tile(const std::shared_ptr& op, const GraphContext::CPtr context) : +Tile::Tile(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, NgraphShapeInferFactory(op, PortMask(TILE_REPEATS))) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - errorPrefix = "Tile node with name '" + getName() + "'"; - - if (ov::is_type(op->get_input_node_ptr(TILE_REPEATS))) { + if (auto repeatsOp = ov::as_type(op->get_input_node_ptr(TILE_REPEATS))) { constMap[TILE_REPEATS] = true; - repeats = originRepeats = ov::as_type(op->get_input_node_ptr(TILE_REPEATS))->cast_vector(); + repeats = originRepeats = repeatsOp->cast_vector(); while (repeats.size() < getInputShapeAtPort(TILE_INPUT).getRank()) { 
repeats.insert(repeats.begin(), 1lu); } @@ -61,24 +61,24 @@ void Tile::getSupportedDescriptors() { return result; }; if (getParentEdges().size() != 2) - IE_THROW() << errorPrefix << " has incorrect number of input edges. " + THROW_CPU_NODE_ERR << " has incorrect number of input edges. " "Expected: 2, Actual: " << getParentEdges().size(); if (getChildEdges().empty()) - IE_THROW() << errorPrefix << " has no output edges."; + THROW_CPU_NODE_ERR << " has no output edges."; const auto& dstDims0 = getOutputShapeAtPort(0).getDims(); for (size_t i = 1lu; i < outputShapes.size(); i++) { const auto& dstDims = getOutputShapeAtPort(i).getDims(); if (dstDims.size() != dstDims0.size()) - IE_THROW() << errorPrefix << " has output edges 0 and " << i << " with different ranks: " << dstDims0.size() << " and " << dstDims.size(); + THROW_CPU_NODE_ERR << " has output edges 0 and " << i << " with different ranks: " << dstDims0.size() << " and " << dstDims.size(); for (size_t j = 0; j < dstDims0.size(); j++) { if (dstDims0[j] != dstDims[j]) { - IE_THROW() << errorPrefix << " has output edges 0 and " << i << " with different dims: " << vec_to_string(dstDims0) << " and " + THROW_CPU_NODE_ERR << " has output edges 0 and " << i << " with different dims: " << vec_to_string(dstDims0) << " and " << vec_to_string(dstDims); } } } if (constMap[TILE_REPEATS] && getInputShapeAtPort(TILE_INPUT).getRank() > getOutputShapeAtPort(0).getRank()) - IE_THROW() << errorPrefix << " has incorrect input/output data shape rank. Input shape rank cannot be more than output shape rank. " + THROW_CPU_NODE_ERR << " has incorrect input/output data shape rank. Input shape rank cannot be more than output shape rank. 
" "Actual input shape size: " << getInputShapeAtPort(TILE_INPUT).getRank() << ", output shape size: " << getOutputShapeAtPort(0).getRank(); if (!isDynamicNode()) @@ -100,8 +100,13 @@ void Tile::prepareParams() { if (!constMap[TILE_REPEATS]) { const auto& repeatsMem = getParentEdgesAtPort(TILE_REPEATS)[0]->getMemory(); - const int32_t* repeatsData = reinterpret_cast(repeatsMem.getData()); - originRepeats.assign(repeatsData, repeatsData + repeatsMem.getStaticDims()[0]); + if (repeatsMem.getDesc().getPrecision() == Precision::I64) { + auto repeatsData = reinterpret_cast(repeatsMem.getData()); + originRepeats.assign(repeatsData, repeatsData + repeatsMem.getStaticDims()[0]); + } else { + auto repeatsData = reinterpret_cast(repeatsMem.getData()); + originRepeats.assign(repeatsData, repeatsData + repeatsMem.getStaticDims()[0]); + } repeats.assign(std::max(originRepeats.size(), getInputShapeAtPort(TILE_INPUT).getRank()), 1lu); const size_t offset = repeats.size() - originRepeats.size(); diff --git a/src/plugins/intel_cpu/src/nodes/tile.h b/src/plugins/intel_cpu/src/nodes/tile.h index 2edda6e0f887d5..eb8ba348168ccc 100644 --- a/src/plugins/intel_cpu/src/nodes/tile.h +++ b/src/plugins/intel_cpu/src/nodes/tile.h @@ -14,7 +14,7 @@ namespace node { class Tile : public Node, public TileBroadcastCommon { public: - Tile(const std::shared_ptr& op, const GraphContext::CPtr context); + Tile(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -39,8 +39,6 @@ class Tile : public Node, public TileBroadcastCommon { int tiles = 0; bool noTiling = false; VectorDims originRepeats; - - std::string errorPrefix; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/topk.cpp b/src/plugins/intel_cpu/src/nodes/topk.cpp index 80d7b42d3a1369..e7f4b9f1c53f81 100644 --- a/src/plugins/intel_cpu/src/nodes/topk.cpp +++ b/src/plugins/intel_cpu/src/nodes/topk.cpp @@ -4,14 +4,10 @@ 
#include "topk.h" -#include -#include -#include #include #include #include "emitters/x64/jit_load_store_emitters.hpp" #include "ie_parallel.hpp" -#include #include #include @@ -19,8 +15,6 @@ #include #include "common/cpu_memcpy.h" -#include - using namespace dnnl; using namespace InferenceEngine; using namespace dnnl::impl; @@ -1792,30 +1786,29 @@ struct jit_uni_topk_kernel_f32 : public jit_uni_topk_kernel, public jit_generato bool TopK::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), ov::op::v1::TopK::get_type_info_static(), - ov::op::v3::TopK::get_type_info_static(), - ov::op::v11::TopK::get_type_info_static())) { + if (!one_of(op->get_type_info(), op::v1::TopK::get_type_info_static(), + op::v3::TopK::get_type_info_static(), + op::v11::TopK::get_type_info_static())) { errorMessage = "Node is not an instance of the TopK from the operation sets v1, v3 or v11"; return false; } - auto topKOp = ov::as_type_ptr(op); + auto topKOp = ov::as_type_ptr(op); if (!isDynamicNgraphNode(op)) { - auto topKConst = std::dynamic_pointer_cast(topKOp->get_input_node_shared_ptr(TOPK_K)); - if (!topKConst) { + if (topKOp->get_input_node_shared_ptr(TOPK_K)->get_type_info() != ov::opset1::Constant::get_type_info_static()) { errorMessage = "Second tensor is not constant in static shape mode"; return false; } } - if (topKOp->get_mode() != ov::op::TopKMode::MAX && - topKOp->get_mode() != ov::op::TopKMode::MIN) { + if (topKOp->get_mode() != op::TopKMode::MAX && + topKOp->get_mode() != op::TopKMode::MIN) { errorMessage = "Unsupported mode."; return false; } - if (!one_of(topKOp->get_sort_type(), ov::op::TopKSortType::NONE, - ov::op::TopKSortType::SORT_VALUES, - ov::op::TopKSortType::SORT_INDICES)) { + if (!one_of(topKOp->get_sort_type(), op::TopKSortType::NONE, + op::TopKSortType::SORT_VALUES, + op::TopKSortType::SORT_INDICES)) { errorMessage = "Unsupported sort type."; return false; } @@ -1828,59 +1821,61 @@ bool 
TopK::isSupportedOperation(const std::shared_ptr& op, std:: TopK::TopK(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op, PortMask(TOPK_K))) { std::string errorMessage; - if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "TopK layer with name '" + getName() + "'"; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } - auto topKOp = ov::as_type_ptr(op); + auto topKOp = ov::as_type_ptr(op); - auto in_dims = topKOp->get_input_partial_shape(TOPK_DATA); - auto out_dims = topKOp->get_output_partial_shape(TOPK_DATA); - auto out_idx_dims = topKOp->get_output_partial_shape(TOPK_INDEX); - auto in_dims_size = in_dims.size(); + const auto& in_dims = topKOp->get_input_partial_shape(TOPK_DATA); + const auto& out_dims = topKOp->get_output_partial_shape(TOPK_DATA); + const auto& out_idx_dims = topKOp->get_output_partial_shape(TOPK_INDEX); + const auto in_dims_size = in_dims.size(); - if (!isDynamicNgraphNode(op)) { - auto topKConst = std::dynamic_pointer_cast(topKOp->get_input_node_shared_ptr(TOPK_K)); - if (!topKConst) { - IE_THROW() << errorPrefix << "gets non-constant second tensor in static shape mode!"; + top_k = 0; + if (!isDynamicNgraphNode(op)) { + if (auto topKL = ov::as_type(topKOp->get_input_node_ptr(TOPK_K))) { + if (topKL->get_element_type() == ov::element::i64) { + top_k = topKL->get_data_ptr()[0]; + } else { + top_k = topKL->cast_vector()[0]; } + } else { + THROW_CPU_NODE_ERR << " gets non-constant second tensor in static shape mode!"; } + } - axis = topKOp->get_axis(); - mode_max = topKOp->get_mode() == ov::op::TopKMode::MAX; - sort_index = topKOp->get_sort_type() == ov::op::TopKSortType::SORT_INDICES; + axis = topKOp->get_axis(); + mode_max = topKOp->get_mode() == op::TopKMode::MAX; + sort_index = topKOp->get_sort_type() == op::TopKSortType::SORT_INDICES; - stable = false; - if (!sort_index) { - const auto topKOpV11 = ngraph::as_type_ptr(op); - if 
(topKOpV11) { - stable = topKOpV11->get_stable(); - } + stable = false; + if (!sort_index) { + if (auto topKOpV11 = ov::as_type_ptr(op)) { + stable = topKOpV11->get_stable(); } + } - top_k = 0; - preset_params_done = false; - vec_idx_seq.clear(); - vec_idx_block.clear(); + preset_params_done = false; + vec_idx_seq.clear(); + vec_idx_block.clear(); - if (inputShapes.size() != 2 || outputShapes.size() < 2) - IE_THROW() << errorPrefix << " gets incorrect number of input/output edges!"; + if (inputShapes.size() != 2 || outputShapes.size() < 2) + THROW_CPU_NODE_ERR << " gets incorrect number of input/output edges!"; - if (getInputShapeAtPort(TOPK_DATA).getRank() != getOutputShapeAtPort(TOPK_DATA).getRank()) - IE_THROW() << errorPrefix << " gets incorrect number of input/output dimensions!"; + if (getInputShapeAtPort(TOPK_DATA).getRank() != getOutputShapeAtPort(TOPK_DATA).getRank()) + THROW_CPU_NODE_ERR << " gets incorrect number of input/output dimensions!"; - if (getInputShapeAtPort(TOPK_K).getRank() != 1) - IE_THROW() << errorPrefix << " gets incorrect index vector dimension! Index vector should be 1 dimension."; + if (getInputShapeAtPort(TOPK_K).getRank() != 1) + THROW_CPU_NODE_ERR << " gets incorrect index vector dimension! 
Index vector should be 1 dimension."; - if (out_dims != out_idx_dims) - IE_THROW() << errorPrefix << " gets incorrect output tensor dimension sizes!"; + if (out_dims != out_idx_dims) + THROW_CPU_NODE_ERR << " gets incorrect output tensor dimension sizes!"; - if (axis < 0) - axis += in_dims_size; - if (axis < 0 || axis >= static_cast(in_dims_size)) - IE_THROW() << errorPrefix << " gets incorrect input parameters dimensions and axis number!"; - } else { - IE_THROW(NotImplemented) << errorMessage; - } + if (axis < 0) + axis += in_dims_size; + if (axis < 0 || axis >= static_cast(in_dims_size)) + THROW_CPU_NODE_ERR << " gets incorrect input parameters dimensions and axis number!"; } void TopK::getSupportedDescriptors() {} @@ -1914,9 +1909,13 @@ void TopK::initSupportedPrimitiveDescriptors() { Precision::U8 }; + Precision inLenPrc = getOriginalInputPrecisionAtPort(TOPK_K); + if (!one_of(inLenPrc, Precision::I32, Precision::I64)) { + inLenPrc = Precision::I32; + } Precision dataPrecision = getOriginalOutputPrecisionAtPort(TOPK_DATA); if (dataPrecision == Precision::BF16 && !mayiuse(avx512_core)) - IE_THROW() << errorPrefix << " gets incorrect isa for BF16! AVX512 must be supported!"; + THROW_CPU_NODE_ERR << " gets incorrect isa for BF16! 
AVX512 must be supported!"; bool precisionSupported = std::find(std::begin(supportedPrecision), std::end(supportedPrecision), dataPrecision) != std::end(supportedPrecision); if (!precisionSupported) { @@ -1937,7 +1936,7 @@ void TopK::initSupportedPrimitiveDescriptors() { }; for (const auto &df : dataFomats) { - addSupportedPrimDesc({{df.first, dataPrecision}, {LayoutType::ncsp, Precision::I32}}, + addSupportedPrimDesc({{df.first, dataPrecision}, {LayoutType::ncsp, inLenPrc}}, {{df.second, dataPrecision}, {df.second, Precision::I32}}, impl_type); } @@ -1984,11 +1983,11 @@ void TopK::prepareParams() { auto dstMemPtr = getChildEdgeAt(TOPK_DATA)->getMemoryPtr(); auto srcMemPtr = getParentEdgeAt(TOPK_DATA)->getMemoryPtr(); if (!dstMemPtr || !dstMemPtr->isAllocated()) - IE_THROW() << errorPrefix << " has not allocated destination memory."; + THROW_CPU_NODE_ERR << " has not allocated destination memory."; if (!srcMemPtr || !srcMemPtr->isAllocated()) - IE_THROW() << errorPrefix << " has not allocate input memory."; + THROW_CPU_NODE_ERR << " has not allocate input memory."; if (getSelectedPrimitiveDescriptor() == nullptr) - IE_THROW() << errorPrefix << " has nullable preferable primitive descriptor"; + THROW_CPU_NODE_ERR << " has nullable preferable primitive descriptor"; src_dims = srcMemPtr->getDesc().getShape().getDims(); dst_dims = dstMemPtr->getDesc().getShape().getDims(); @@ -2000,10 +1999,9 @@ void TopK::prepareParams() { if (top_k != src_k) { top_k = src_k; } - } else { - top_k = reinterpret_cast(getParentEdgeAt(TOPK_K)->getMemoryPtr()->getData())[0]; } + if (jit_mode) { if (!preset_params_done) { preset_params(); @@ -2154,7 +2152,7 @@ void TopK::execute(dnnl::stream strm) { auto out_idx_ptr = reinterpret_cast(dst_idx); topk_ref(in_ptr, out_ptr, out_idx_ptr); } else { - IE_THROW() << errorPrefix << "only support plain layout on machine w/o sse42."; + THROW_CPU_NODE_ERR << "only support plain layout on machine w/o sse42."; } } } diff --git 
a/src/plugins/intel_cpu/src/nodes/topk.h b/src/plugins/intel_cpu/src/nodes/topk.h index f737857073c8fd..0e30d55ee30d4d 100644 --- a/src/plugins/intel_cpu/src/nodes/topk.h +++ b/src/plugins/intel_cpu/src/nodes/topk.h @@ -6,11 +6,6 @@ #include -#include -#include -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -80,7 +75,7 @@ struct jit_uni_topk_kernel { class TopK : public Node { public: - TopK(const std::shared_ptr& op, const GraphContext::CPtr context); + TopK(const std::shared_ptr& op, const GraphContext::CPtr context); ~TopK() override = default; void getSupportedDescriptors() override; @@ -119,14 +114,14 @@ class TopK : public Node { bool stable = false; bool mode_max = false; int axis = 0; - static const size_t TOPK_DATA = 0; - static const size_t TOPK_K = 1; - static const size_t TOPK_INDEX = 1; + static constexpr size_t TOPK_DATA = 0; + static constexpr size_t TOPK_K = 1; + static constexpr size_t TOPK_INDEX = 1; size_t O = 0, A = 0, I = 0; size_t blk_size = 0; size_t data_size = 0; size_t axis_dim = 0; - int top_k = 0; + int64_t top_k = 0; int dim = 0, before_num = 0; bool bubble_inplace = false; bool preset_params_done = false; diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp index 750ae6bf711ca0..7921772b18228d 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include using namespace dnnl; using namespace InferenceEngine; @@ -21,12 +23,12 @@ namespace node { bool Transpose::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (!one_of(op->get_type_info(), - ov::op::v1::Transpose::get_type_info_static())) { + op::v1::Transpose::get_type_info_static())) { errorMessage = "Node is not an instance of the Transpose operation from opset1."; return false; } - if (op->get_input_node_ptr(INPUT_ORDER_IDX)->get_type_info() 
!= ov::op::v0::Constant::get_type_info_static()) { + if (op->get_input_node_ptr(INPUT_ORDER_IDX)->get_type_info() != op::v0::Constant::get_type_info_static()) { // TODO: Support parameterized Order input for dynamic shapes. errorMessage = "Constant expected as the second input for static shapes."; return false; @@ -88,7 +90,7 @@ class TransposeShapeInferFactory : public ShapeInferFactory { public: TransposeShapeInferFactory(const std::shared_ptr& op) : m_op(op) {} ShapeInferPtr makeShapeInfer() const override { - if (const auto order = ov::as_type_ptr(m_op->get_input_node_shared_ptr(ov::op::v1::Transpose::ORDER))) { + if (const auto order = ov::as_type_ptr(m_op->get_input_node_shared_ptr(op::v1::Transpose::ORDER))) { const auto axes_vec = order->cast_vector(); return std::make_shared(m_op->get_output_partial_shape(0).rank().get_length(), axes_vec); } else { @@ -101,16 +103,20 @@ class TransposeShapeInferFactory : public ShapeInferFactory { }; } // namespace -Transpose::Transpose(const std::shared_ptr& op, const GraphContext::CPtr context) +Transpose::Transpose(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, TransposeShapeInferFactory(op)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; } - if (op->get_input_node_ptr(INPUT_ORDER_IDX)->get_type_info() == ov::op::v0::Constant::get_type_info_static()) { + if (auto inputOrder = ov::as_type(op->get_input_node_ptr(INPUT_ORDER_IDX))) { isInputOrderConst = true; - order = ov::as_type(op->get_input_node_ptr(INPUT_ORDER_IDX))->cast_vector(); + if (one_of(inputOrder->get_element_type(), ov::element::i64, ov::element::u64)) { + order = inputOrder->get_vector(); + } else { + order = inputOrder->cast_vector(); + } if (order.empty()) { size_t rank = getInputShapeAtPort(INPUT_DATA_IDX).getRank(); @@ -128,7 +134,11 @@ void Transpose::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - prec = 
getOriginalInputPrecisionAtPort(0); + const auto &dataPrc = getOriginalInputPrecisionAtPort(0); + auto orderPrc = getOriginalInputPrecisionAtPort(1); + if (!one_of(orderPrc, Precision::I32, Precision::I64)) { + orderPrc = Precision::I32; + } auto& creatorsMap = BlockedDescCreator::getCommonCreators(); @@ -139,7 +149,7 @@ void Transpose::initSupportedPrimitiveDescriptors() { config.inConfs[INPUT_DATA_IDX].constant(false); config.inConfs[INPUT_ORDER_IDX].constant(isInputOrderConst); config.inConfs[INPUT_ORDER_IDX].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc( - Precision::I32, getInputShapeAtPort(INPUT_ORDER_IDX))); + orderPrc, getInputShapeAtPort(INPUT_ORDER_IDX))); config.outConfs[0].inPlace(-1); config.outConfs[0].constant(false); transpose_context = std::make_shared(context, getImplPriority()); @@ -160,30 +170,30 @@ void Transpose::initSupportedPrimitiveDescriptors() { const auto& inputDataShape = getInputShapeAtPort(INPUT_DATA_IDX); const auto& outputDataShape = getOutputShapeAtPort(0); if (inputDataShape.getRank() == 4 || inputDataShape.getRank() == 5) { - config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, inputDataShape)); - config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, outputDataShape)); + config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(dataPrc, inputDataShape)); + config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(dataPrc, outputDataShape)); supportedPrimitiveDescriptorsBuilder(config, transposeParams); #if defined(OPENVINO_ARCH_X86_64) const auto& srcDims = inputDataShape.getDims(); if (srcDims[1] != Shape::UNDEFINED_DIM && srcDims[1] % 8 == 0) { - config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp8c)->createSharedDesc(prec, inputDataShape)); + config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp8c)->createSharedDesc(dataPrc, inputDataShape)); supportedPrimitiveDescriptorsBuilder(config, 
transposeParams); } if (srcDims[1] != Shape::UNDEFINED_DIM && srcDims[1] % 16 == 0) { - config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp16c)->createSharedDesc(prec, inputDataShape)); + config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nCsp16c)->createSharedDesc(dataPrc, inputDataShape)); supportedPrimitiveDescriptorsBuilder(config, transposeParams); } #endif // OPENVINO_ARCH_X86_64 - if (prec == Precision::FP32 || prec == Precision::I8 || prec == Precision::U8) { - config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(prec, inputDataShape)); - config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(prec, outputDataShape)); + if (one_of(dataPrc, Precision::FP32, Precision::I8, Precision::U8)) { + config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(dataPrc, inputDataShape)); + config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::nspc)->createSharedDesc(dataPrc, outputDataShape)); supportedPrimitiveDescriptorsBuilder(config, transposeParams); } } else { // general plain case - config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, inputDataShape)); - config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, outputDataShape)); + config.inConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(dataPrc, inputDataShape)); + config.outConfs[0].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(dataPrc, outputDataShape)); supportedPrimitiveDescriptorsBuilder(config, transposeParams); } } @@ -228,9 +238,15 @@ void Transpose::prepareParams() { transposeParams.permuteParams.dst_block_dims = dstDesc->getBlockDims(); if (!isInputOrderConst) { - auto orderPtr = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->getData()); - auto orderLen = getParentEdgeAt(0)->getMemoryPtr()->getSize(); - transposeParams.permuteParams.order.assign(orderPtr, orderPtr + orderLen); + auto mem = 
getParentEdgeAt(0)->getMemoryPtr(); + auto orderLen = mem->getSize(); + if (mem->getDesc().getPrecision() == Precision::I64) { + auto orderPtr = reinterpret_cast(mem->getData()); + transposeParams.permuteParams.order.assign(orderPtr, orderPtr + orderLen); + } else { + auto orderPtr = reinterpret_cast(mem->getData()); + transposeParams.permuteParams.order.assign(orderPtr, orderPtr + orderLen); + } } auto engine = getEngine(); diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h index 5fb7e9f76570bf..4f187270a2904a 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.h +++ b/src/plugins/intel_cpu/src/nodes/transpose.h @@ -18,9 +18,9 @@ namespace node { class Transpose : public Node { public: - Transpose(const std::shared_ptr& op, const GraphContext::CPtr context); + Transpose(const std::shared_ptr& op, const GraphContext::CPtr& context); - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; @@ -46,7 +46,6 @@ class Transpose : public Node { TransposeExecutorPtr execPtr = nullptr; dnnl::primitive prim; InferenceEngine::SizeVector order; - InferenceEngine::Precision prec; TransposeParams transposeParams; diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp index 5fbb3b4cebe2f0..f525a67d34b76f 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.cpp +++ b/src/plugins/intel_cpu/src/nodes/unique.cpp @@ -5,16 +5,15 @@ #include "unique.hpp" #include "ie_parallel.hpp" -#include #include "common/cpu_memcpy.h" #include +#include +#include using namespace InferenceEngine; using namespace ov::intel_cpu; using namespace ov::intel_cpu::node; -#define THROW_ERROR IE_THROW() << getTypeStr() << " node with name '" << 
getName() << "' " - bool Unique::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (!ov::is_type(op)) { @@ -40,21 +39,22 @@ Unique::Unique(const std::shared_ptr& op, const GraphContext::CPtr con } if (!one_of(op->get_input_size(), 1u, 2u) || op->get_output_size() != 4) - THROW_ERROR << "has incorrect number of input/output edges."; + THROW_CPU_NODE_ERR << "has incorrect number of input/output edges."; for (int i = 0; i < 4; i++) { definedOutputs[i] = !op->get_output_target_inputs(i).empty(); } sorted = ov::as_type_ptr(op)->get_sorted(); - if (op->get_input_size() > AXIS) { + auto dataShapeRank = op->get_input_partial_shape(IN_DATA).rank().get_length(); + if (op->get_input_size() > AXIS && dataShapeRank > 1) { flattened = false; axis = ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; if (axis < 0) { - axis += op->get_input_partial_shape(IN_DATA).rank().get_length(); + axis += dataShapeRank; } - if (axis < 0 || axis >= op->get_input_partial_shape(IN_DATA).rank().get_length()) { - THROW_ERROR << "has invalid axis value: " << ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; + if (axis < 0 || axis >= dataShapeRank) { + THROW_CPU_NODE_ERR << "has invalid axis value: " << ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; } } else { flattened = true; @@ -63,21 +63,23 @@ Unique::Unique(const std::shared_ptr& op, const GraphContext::CPtr con void Unique::initSupportedPrimitiveDescriptors() { dataPrecision = getOriginalInputPrecisionAtPort(IN_DATA); - if (dataPrecision != Precision::I32 && dataPrecision != Precision::I8 && dataPrecision != Precision::U8) { + if (dataPrecision != Precision::I64 && dataPrecision != Precision::I32 && dataPrecision != Precision::I8 && dataPrecision != Precision::U8) { dataPrecision = Precision::FP32; } dataTypeSize = dataPrecision.size(); - const InferenceEngine::Precision axisPrecision = Precision::I32; + Precision axisPrecision = Precision::I64; impl_desc_type 
implType = ref; std::vector inPortConfigs = { {LayoutType::ncsp, dataPrecision} }; - if (!flattened) { + if (getOriginalInputsNumber() > AXIS) { + axisPrecision = getOriginalInputPrecisionAtPort(AXIS); inPortConfigs.push_back({LayoutType::ncsp, axisPrecision}); } std::vector outPortConfigs; for (int i = 0; i < 4; i++) { - outPortConfigs.push_back({LayoutType::ncsp, i == 0 ? dataPrecision : axisPrecision}); + outputsPrc[i] = getOriginalOutputPrecisionAtPort(i); + outPortConfigs.push_back({LayoutType::ncsp, i == 0 ? dataPrecision : getOriginalOutputPrecisionAtPort(i)}); } addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); @@ -90,18 +92,18 @@ void Unique::createPrimitive() { void Unique::prepareParams() { auto dataMemPtr = getParentEdgeAt(IN_DATA)->getMemoryPtr(); if (!dataMemPtr || !dataMemPtr->isAllocated()) { - THROW_ERROR << " has not allocated input data memory."; + THROW_CPU_NODE_ERR << " has not allocated input data memory."; } for (int i = 0; i < 4; i++) { if (definedOutputs[i]) { auto dstMemPtr = getChildEdgeAt(i)->getMemoryPtr(); if (!dstMemPtr || !dstMemPtr->isAllocated()) { - THROW_ERROR << " has not allocated output memory at port " << i; + THROW_CPU_NODE_ERR << " has not allocated output memory at port " << i; } } } if (getSelectedPrimitiveDescriptor() == nullptr) { - THROW_ERROR << " has unidentified preferable primitive descriptor."; + THROW_CPU_NODE_ERR << " has unidentified preferable primitive descriptor."; } size_t srcLen = 1; @@ -111,9 +113,15 @@ void Unique::prepareParams() { auto dstDataShape = getParentEdgeAt(IN_DATA)->getMemoryPtr()->getStaticDims(); srcLen = dstDataShape[axis]; } - firstUniTmp.resize(srcLen, 0); - inToOutTmp.resize(srcLen); - occurTmp.resize(srcLen); + if (definedOutputs[FIRST_UNIQUE_IDX]) { + firstUniTmp.resize(srcLen, 0); + } + if (definedOutputs[INPUT_TO_UNIQ_IDX]) { + inToOutTmp.resize(srcLen); + } + if (definedOutputs[OCCURRENCES_NUM]) { + occurTmp.resize(srcLen); + } } template @@ -135,12 +143,14 @@ void 
Unique::execute(dnnl::stream strm) { OV_SWITCH(intel_cpu, flattenExec, this, dataPrecision, OV_CASE(Precision::FP32, float), OV_CASE(Precision::I32, int32_t), + OV_CASE(Precision::I64, int64_t), OV_CASE(Precision::I8, int8_t), OV_CASE(Precision::U8, uint8_t)) } else { OV_SWITCH(intel_cpu, slicedExec, this, dataPrecision, OV_CASE(Precision::FP32, float), OV_CASE(Precision::I32, int32_t), + OV_CASE(Precision::I64, int64_t), OV_CASE(Precision::I8, int8_t), OV_CASE(Precision::U8, uint8_t)) } @@ -168,7 +178,7 @@ void Unique::flattenTensorExec() { const size_t inputLen = getParentEdgeAt(IN_DATA)->getMemoryPtr()->getSize() / sizeof(T); std::vector uniDataTmp(inputLen); auto uniDataTmpPtr = uniDataTmp.data(); - int *firstTmpPtr = nullptr, *inToOutTmpPtr = nullptr, *occurTmpPtr = nullptr; + int64_t *firstTmpPtr = nullptr, *inToOutTmpPtr = nullptr, *occurTmpPtr = nullptr; if (definedOutputs[FIRST_UNIQUE_IDX]) { firstTmpPtr = firstUniTmp.data(); } @@ -266,16 +276,13 @@ void Unique::flattenTensorExec() { T* uniDataPtr = reinterpret_cast(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->getData()); cpu_parallel_memcpy(uniDataPtr, uniDataTmpPtr, uniqueLen * sizeof(T)); if (definedOutputs[FIRST_UNIQUE_IDX]) { - int *firstPtr = reinterpret_cast(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->getData()); - cpu_parallel_memcpy(firstPtr, firstUniTmp.data(), uniqueLen * sizeof(int)); + copyOutput(FIRST_UNIQUE_IDX, firstUniTmp.data(), uniqueLen); } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - auto inToOutPtr = reinterpret_cast(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->getData()); - cpu_parallel_memcpy(inToOutPtr, inToOutTmp.data(), inputLen * sizeof(int)); + copyOutput(INPUT_TO_UNIQ_IDX, inToOutTmpPtr, inputLen); } if (definedOutputs[OCCURRENCES_NUM]) { - auto occurPtr = reinterpret_cast(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->getData()); - cpu_parallel_memcpy(occurPtr, occurTmp.data(), uniqueLen * sizeof(int)); + copyOutput(OCCURRENCES_NUM, 
occurTmpPtr, uniqueLen); } } @@ -283,16 +290,17 @@ template void Unique::slicedTensorExec() { auto inDataMemPtr = getParentEdgeAt(IN_DATA)->getMemoryPtr(); auto srcDataPtr = reinterpret_cast(inDataMemPtr->getData()); - int *firstTmpPtr = nullptr, *inToOutTmpPtr = nullptr, *occurTmpPtr = nullptr; - if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr = firstUniTmp.data(); - } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr = inToOutTmp.data(); - } - if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr = occurTmp.data(); - } + + uint8_t *firstTmpPtr = nullptr, *inToOutTmpPtr = nullptr, *occurTmpPtr = nullptr; + if (definedOutputs[FIRST_UNIQUE_IDX]) { + firstTmpPtr = reinterpret_cast(firstUniTmp.data()); + } + if (definedOutputs[INPUT_TO_UNIQ_IDX]) { + inToOutTmpPtr = reinterpret_cast(inToOutTmp.data()); + } + if (definedOutputs[OCCURRENCES_NUM]) { + occurTmpPtr = reinterpret_cast(occurTmp.data()); + } const auto& srcDataShape = inDataMemPtr->getStaticDims(); @@ -309,14 +317,27 @@ void Unique::slicedTensorExec() { const auto srcOuterStep = innerLen * axisDim; if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[0] = 0; + if (outputsPrc[FIRST_UNIQUE_IDX] == Precision::I32) { + reinterpret_cast(firstTmpPtr)[0] = 0; + } else { + reinterpret_cast(firstTmpPtr)[0] = 0; + } } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[0] = 0; + if (outputsPrc[INPUT_TO_UNIQ_IDX] == Precision::I32) { + reinterpret_cast(inToOutTmpPtr)[0] = 0; + } else { + reinterpret_cast(inToOutTmpPtr)[0] = 0; + } } if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr[0] = 1; - std::fill(occurTmpPtr, occurTmpPtr + axisDim, 1); + if (outputsPrc[OCCURRENCES_NUM] == Precision::I32) { + auto dstMem = reinterpret_cast(occurTmpPtr); + std::fill(dstMem, dstMem + axisDim, 1); + } else { + auto dstMem = reinterpret_cast(occurTmpPtr); + std::fill(dstMem, dstMem + axisDim, 1); + } } uniqueLen = 1lu; @@ -346,17 +367,29 @@ void Unique::slicedTensorExec() { } if (!equal) { if 
(definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[uniqueLen] = a; + if (outputsPrc[FIRST_UNIQUE_IDX] == Precision::I32) { + reinterpret_cast(firstTmpPtr)[uniqueLen] = static_cast(a); + } else { + reinterpret_cast(firstTmpPtr)[uniqueLen] = static_cast(a); + } } uniqIdx[uniqueLen++] = a; } else { if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr[uIdx]++; + if (outputsPrc[OCCURRENCES_NUM] == Precision::I32) { + reinterpret_cast(occurTmpPtr)[uIdx]++; + } else { + reinterpret_cast(occurTmpPtr)[uIdx]++; + } } } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[a] = uIdx; + if (outputsPrc[INPUT_TO_UNIQ_IDX] == Precision::I32) { + reinterpret_cast(inToOutTmpPtr)[a] = uIdx; + } else { + reinterpret_cast(inToOutTmpPtr)[a] = uIdx; + } } } @@ -365,15 +398,15 @@ void Unique::slicedTensorExec() { dstDataShape[axis] = uniqueLen; redefineOutputMemory({ dstDataShape, {uniqueLen}, {axisDim}, {uniqueLen}}); - int *firstPtr = nullptr, *inToOutPtr = nullptr, *occurNPtr = nullptr; + uint8_t *firstPtr = nullptr, *inToOutPtr = nullptr, *occurNPtr = nullptr; if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstPtr = reinterpret_cast(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->getData()); + firstPtr = reinterpret_cast(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->getData()); } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutPtr = reinterpret_cast(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->getData()); + inToOutPtr = reinterpret_cast(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->getData()); } if (definedOutputs[OCCURRENCES_NUM]) { - occurNPtr = reinterpret_cast(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->getData()); + occurNPtr = reinterpret_cast(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->getData()); } T* dstDataPtr = reinterpret_cast(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->getData()); @@ -391,8 +424,6 @@ void Unique::slicedTensorExec() { }); } - const auto uniqueLenIB = uniqueLen * 
sizeof(T); - if (sorted) { const auto dstUniDataLen = dstOuterStep * outerLen; std::vector vDstBuff(dstUniDataLen); @@ -405,9 +436,9 @@ void Unique::slicedTensorExec() { std::vector colToSort(uniqueLen); T *dst1 = dstDataPtr, *dst2 = dstBuff; - int *first1 = firstPtr, *first2 = firstTmpPtr; - int *occurN1 = occurNPtr, *occurN2 = occurTmpPtr; - int *inToOut1 = inToOutPtr, *inToOut2 = inToOutTmpPtr; + uint8_t *first1 = firstPtr, *first2 = firstTmpPtr; + uint8_t *occurN1 = occurNPtr, *occurN2 = occurTmpPtr; + uint8_t *inToOut1 = inToOutPtr, *inToOut2 = inToOutTmpPtr; const bool defined3outputs = definedOutputs[FIRST_UNIQUE_IDX] || definedOutputs[OCCURRENCES_NUM] || definedOutputs[INPUT_TO_UNIQ_IDX]; @@ -432,15 +463,35 @@ void Unique::slicedTensorExec() { if (defined3outputs) { parallel_for(uniqueLen, [&](size_t u) { if (definedOutputs[FIRST_UNIQUE_IDX]) { - first1[u] = first2[colToSort[u].idx]; + if (outputsPrc[FIRST_UNIQUE_IDX] == Precision::I32) { + reinterpret_cast(first1)[u] = reinterpret_cast(first2)[colToSort[u].idx]; + } else { + reinterpret_cast(first1)[u] = reinterpret_cast(first2)[colToSort[u].idx]; + } } if (definedOutputs[OCCURRENCES_NUM]) { - occurN1[u] = occurN2[colToSort[u].idx]; + if (outputsPrc[OCCURRENCES_NUM] == Precision::I32) { + reinterpret_cast(occurN1)[u] = reinterpret_cast(occurN2)[colToSort[u].idx]; + } else { + reinterpret_cast(occurN1)[u] = reinterpret_cast(occurN2)[colToSort[u].idx]; + } } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - for (size_t ax = 0; ax < axisDim; ax++) { - if (inToOut2[ax] == colToSort[u].idx) { - inToOut1[ax] = u; + if (outputsPrc[INPUT_TO_UNIQ_IDX] == Precision::I32) { + auto inToOut1_i32 = reinterpret_cast(inToOut1); + auto inToOut2_i32 = reinterpret_cast(inToOut2); + for (size_t ax = 0; ax < axisDim; ax++) { + if (inToOut2_i32[ax] == colToSort[u].idx) { + inToOut1_i32[ax] = static_cast(u); + } + } + } else { + auto inToOut1_i64 = reinterpret_cast(inToOut1); + auto inToOut2_i64 = reinterpret_cast(inToOut2); + for 
(size_t ax = 0; ax < axisDim; ax++) { + if (inToOut2_i64[ax] == colToSort[u].idx) { + inToOut1_i64[ax] = static_cast(u); + } } } } @@ -464,23 +515,41 @@ void Unique::slicedTensorExec() { cpu_parallel_memcpy(dstDataPtr, dst1, dstUniDataLen * sizeof(T)); } if (definedOutputs[FIRST_UNIQUE_IDX] && first2 != firstPtr) { - cpu_parallel_memcpy(firstPtr, first2, uniqueLenIB); + const auto cpyLen = uniqueLen * (outputsPrc[FIRST_UNIQUE_IDX] == Precision::I32 ? sizeof(int32_t) : sizeof(int64_t)); + cpu_parallel_memcpy(firstPtr, first2, cpyLen); } if (definedOutputs[INPUT_TO_UNIQ_IDX] && inToOut2 != inToOutPtr) { - cpu_parallel_memcpy(inToOutPtr, inToOut2, axisDim * sizeof(int)); + const auto cpyLen = axisDim * (outputsPrc[INPUT_TO_UNIQ_IDX] == Precision::I32 ? sizeof(int32_t) : sizeof(int64_t)); + cpu_parallel_memcpy(inToOutPtr, inToOut2, cpyLen); } if (definedOutputs[OCCURRENCES_NUM] && occurN2 != occurNPtr) { - cpu_parallel_memcpy(occurNPtr, occurN2, uniqueLenIB); + const auto cpyLen = uniqueLen * (outputsPrc[OCCURRENCES_NUM] == Precision::I32 ? sizeof(int32_t) : sizeof(int64_t)); + cpu_parallel_memcpy(occurNPtr, occurN2, cpyLen); } } else { if (definedOutputs[FIRST_UNIQUE_IDX]) { - cpu_parallel_memcpy(firstPtr, firstUniTmp.data(), uniqueLenIB); + const auto cpyLen = uniqueLen * (outputsPrc[FIRST_UNIQUE_IDX] == Precision::I32 ? sizeof(int32_t) : sizeof(int64_t)); + cpu_parallel_memcpy(firstPtr, firstUniTmp.data(), cpyLen); } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - cpu_parallel_memcpy(inToOutPtr, inToOutTmp.data(), axisDim * sizeof(int)); + const auto cpyLen = axisDim * (outputsPrc[INPUT_TO_UNIQ_IDX] == Precision::I32 ? sizeof(int32_t) : sizeof(int64_t)); + cpu_parallel_memcpy(inToOutPtr, inToOutTmp.data(), cpyLen); } if (definedOutputs[OCCURRENCES_NUM]) { - cpu_parallel_memcpy(occurNPtr, occurTmp.data(), uniqueLenIB); + const auto cpyLen = uniqueLen * (outputsPrc[OCCURRENCES_NUM] == Precision::I32 ? 
sizeof(int32_t) : sizeof(int64_t)); + cpu_parallel_memcpy(occurNPtr, occurTmp.data(), cpyLen); } } } + +void Unique::copyOutput(size_t outIdx, const int64_t* srcPtr, size_t len) { + const auto outMem = getChildEdgesAtPort(outIdx)[0]->getMemoryPtr(); + if (outMem->getDataType() == dnnl::memory::data_type::s64) { + cpu_parallel_memcpy(outMem->getData(), srcPtr, len * sizeof(int64_t)); + } else if (outMem->getDataType() == dnnl::memory::data_type::s32) { + auto outPtr = reinterpret_cast(outMem->getData()); + parallel_for(len, [&](size_t i) { + outPtr[i] = static_cast(srcPtr[i]); + }); + } +} diff --git a/src/plugins/intel_cpu/src/nodes/unique.hpp b/src/plugins/intel_cpu/src/nodes/unique.hpp index 65b8636abe3d01..57d174b4078fee 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.hpp +++ b/src/plugins/intel_cpu/src/nodes/unique.hpp @@ -27,6 +27,8 @@ class Unique : public Node { bool needShapeInfer() const override { return false; } private: + void copyOutput(size_t outIdx, const int64_t* srcPtr, size_t len); + template void flattenTensorExec(); template @@ -37,14 +39,15 @@ class Unique : public Node { template struct slicedExec; - std::vector firstUniTmp; - std::vector inToOutTmp; - std::vector occurTmp; + std::vector firstUniTmp; + std::vector inToOutTmp; + std::vector occurTmp; bool sorted = false; bool flattened = true; int axis = 0; bool definedOutputs[4] = { false, false, false, false }; + InferenceEngine::Precision outputsPrc[4] = { InferenceEngine::Precision::I32 }; InferenceEngine::Precision dataPrecision; int64_t dataTypeSize = 1l; size_t uniqueLen = 1lu; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 9038d660fb525b..d75122e527b988 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -9,7 +9,6 @@ #include "transformations/transformation_pipeline.h" #include "itt.h" -#include "extension_mngr.h" #include "extension.h" #include "serialize.h" #include 
"threading/ie_executor_manager.hpp" @@ -17,15 +16,12 @@ #include "ie_icore.hpp" #include "ie_plugin_config.hpp" #include "ie_system_conf.h" -#include "threading/ie_cpu_streams_info.hpp" #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" -#include #include #include "performance_heuristics.hpp" -#include "openvino/runtime/properties.hpp" #include "weights_cache.hpp" #include "utils/denormals.hpp" @@ -36,7 +32,6 @@ #endif #include -#include using namespace InferenceEngine; @@ -155,7 +150,7 @@ static bool streamsSet(const std::map& config) { config.count(ov::num_streams.name()); } -void Engine::ApplyPerformanceHints(std::map &config, const std::shared_ptr& ngraphFunc) const { +void Engine::ApplyPerformanceHints(std::map &config, const std::shared_ptr& ngraphFunc) const { auto getNumStreamsLatency = [&]() { return std::pair(CONFIG_VALUE(CPU_THROUGHPUT_NUMA), ov::util::to_string(ov::streams::NUMA)); }; @@ -272,7 +267,7 @@ void Engine::ApplyPerformanceHints(std::map &config, c } } -void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr& ngraphFunc) { +void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr& ngraphFunc) { const auto perf_hint_name = config.perfHintsConfig.ovPerfHint; // save hints parameters to model rt_info ov::AnyMap hints_props; @@ -421,6 +416,19 @@ static Config::SnippetsMode getSnippetsMode(const std::map& modelConfig, Config& engineConfig) { + engineConfig.enableNativeI64 = false; + const auto i64prop = modelConfig.find(InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64prop != modelConfig.end()) { + if (i64prop->second == PluginConfigParams::YES) { + engineConfig.enableNativeI64 = true; + } else if (i64prop->second != PluginConfigParams::NO) { + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_CPU_NATIVE_I64 << ": " << i64prop->second + << ". 
Expected only YES or NO values."; + } + } +} + InferenceEngine::IExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std::map &orig_config) { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Engine::LoadExeNetworkImpl"); @@ -454,6 +462,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std const bool enableLPT = shouldEnableLPT(config, engConfig); ov::element::Type inferencePrecision = getInferencePrecision(config, engConfig); const Config::SnippetsMode snippetsMode = getSnippetsMode(config, engConfig); + setI64Mode(config, engConfig); auto nGraphFunc = clonedNetwork.getFunction(); @@ -729,6 +738,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma const bool enableLPT = (lptProp != config.end() && lptProp->second == PluginConfigParams::YES) /* enabled in the orig_config*/ || Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled */; const Config::SnippetsMode snippetsMode = getSnippetsMode(config, conf); + setI64Mode(config, conf); auto model = network.getFunction(); if (model == nullptr) { @@ -744,7 +754,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma transformation.UpToCpuSpecificOpSet(); transformation.CpuSpecificOpSet(); }, - [&](const std::shared_ptr& op) { + [&](const std::shared_ptr& op) { std::unique_ptr ptr; try { ptr.reset(Node::factory().create(op, context)); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index b2cd223db3e7b2..7ea7256b8e87fa 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -2,24 +2,19 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include "ngraph/op/fake_quantize.hpp" 
-#include "ngraph/pass/manager.hpp" #include "common/pass/reshape_fc_fusion.hpp" #include "common/pass/align_matmul_input_ranks.hpp" -#include "transformations/common_optimizations/reshape_prelu.hpp" #include "common/pass/convert_broadcast_to_tiles.hpp" #include "common/pass/convert_tile_to_seq_tiles.hpp" #include "common/pass/convert_matmul_to_fc.hpp" #include "common/pass/convert_to_power_static.hpp" #include "common/pass/convert_to_leaky_relu.hpp" #include "common/pass/convert_to_swish_cpu.hpp" -#include "transformations/convert_precision.hpp" -#include "transformations/utils/utils.hpp" #include "common/pass/rnn_sequences_optimization.hpp" -#include "transformations/common_optimizations/reshape_sequence_fusion.hpp" #include "common/pass/ngram_fusion.hpp" -#include "transformations/defs.hpp" +#include +#include "openvino/pass/manager.hpp" +#include "transformations/common_optimizations/reshape_sequence_fusion.hpp" #include "itt.hpp" @@ -44,7 +39,6 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphF // after transformation "MoveEltwiseUpThroughDataMov" there can be reshaped sequences that should be eliminated or fused CPU_REGISTER_PASS_COMMON(manager, ov::pass::ReshapeSequenceFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions_map {{ ngraph::element::i64, ngraph::element::i32 }}); CPU_REGISTER_PASS_COMMON(manager, NgramFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp new file mode 100644 index 00000000000000..5a76f59f2412c7 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp @@ -0,0 +1,155 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + + +#include 
"convert_precision_i64_i32.hpp" +#include +#include "transformations/utils/utils.hpp" +#include "cpu_types.h" + +#include + +bool isNativelySupported(const ov::Node::type_info_t &type) { + static const std::unordered_set i64Ops = { + ov::opset12::Add::get_type_info_static(), + ov::op::v1::Broadcast::get_type_info_static(), + ov::op::v3::Broadcast::get_type_info_static(), + ov::opset12::Concat::get_type_info_static(), + ov::opset12::Constant::get_type_info_static(), + ov::opset12::Convert::get_type_info_static(), + ov::opset12::CumSum::get_type_info_static(), + ov::opset12::Divide::get_type_info_static(), + ov::opset12::Equal::get_type_info_static(), + ov::opset12::FloorMod::get_type_info_static(), + ov::op::v1::Gather::get_type_info_static(), + ov::op::v7::Gather::get_type_info_static(), + ov::op::v8::Gather::get_type_info_static(), + ov::op::v5::GatherND::get_type_info_static(), + ov::op::v8::GatherND::get_type_info_static(), + ov::opset12::Greater::get_type_info_static(), + ov::opset12::Less::get_type_info_static(), + ov::opset12::Maximum::get_type_info_static(), + ov::opset12::Minimum::get_type_info_static(), + ov::opset12::Multiply::get_type_info_static(), + ov::opset12::NonMaxSuppression::get_type_info_static(), + ov::opset12::NonZero::get_type_info_static(), + ov::opset12::OneHot::get_type_info_static(), + ov::opset12::Parameter::get_type_info_static(), + ov::opset12::ReduceL1::get_type_info_static(), + ov::opset12::ReduceL2::get_type_info_static(), + ov::opset12::ReduceLogicalAnd::get_type_info_static(), + ov::opset12::ReduceMax::get_type_info_static(), + ov::opset12::ReduceMean::get_type_info_static(), + ov::opset12::ReduceMin::get_type_info_static(), + ov::opset12::ReduceProd::get_type_info_static(), + ov::opset12::ReduceSum::get_type_info_static(), + ov::opset12::Reshape::get_type_info_static(), + ov::opset12::Result::get_type_info_static(), + ov::opset12::ScatterElementsUpdate::get_type_info_static(), + 
ov::opset12::ScatterNDUpdate::get_type_info_static(), + ov::opset12::ScatterUpdate::get_type_info_static(), + ov::opset12::Select::get_type_info_static(), + ov::opset12::ShapeOf::get_type_info_static(), + ov::opset12::Slice::get_type_info_static(), + ov::opset12::Split::get_type_info_static(), + ov::opset12::Sqrt::get_type_info_static(), + ov::opset12::SquaredDifference::get_type_info_static(), + ov::opset12::Squeeze::get_type_info_static(), + ov::opset12::StridedSlice::get_type_info_static(), + ov::opset12::Subtract::get_type_info_static(), + ov::opset12::Tile::get_type_info_static(), + ov::opset12::Transpose::get_type_info_static(), + ov::opset12::Unique::get_type_info_static(), + ov::opset12::Unsqueeze::get_type_info_static(), + ov::opset12::VariadicSplit::get_type_info_static() + }; + + return i64Ops.find(type) != i64Ops.end(); +} + +std::shared_ptr changeConstantPrecision(std::shared_ptr& constant) { + const auto* srcData = constant->get_data_ptr(); + const auto size = shape_size(constant->get_shape()); + + auto newConstant = std::make_shared(ov::element::i32, constant->get_shape()); + newConstant->output(0).set_names(constant->output(0).get_names()); + auto* dstData = const_cast(reinterpret_cast(newConstant->get_data_ptr())); + if (dstData == nullptr) { + throw ngraph::ngraph_error("Can't get destination data pointer"); + } + + for (size_t i = 0; i < size; ++i) { + if (srcData[i] >= std::numeric_limits::max()) { + dstData[i] = std::numeric_limits::max(); + } else if (srcData[i] <= std::numeric_limits::lowest()) { + dstData[i] = std::numeric_limits::lowest(); + } else { + dstData[i] = static_cast(srcData[i]); + } + } + return newConstant; +} + +bool ov::intel_cpu::ConvertPrecisionI64ToI32::run_on_model(const std::shared_ptr &model) { + const auto orderedOps = model->get_ordered_ops(); + for (const auto& op : orderedOps) { + if (isNativelySupported(op->get_type_info()) || TypeFromName(op->get_type_name()) == Type::Unknown) { + continue; + } + + bool 
convertForOutputsRequired = false; + for (const auto& input : op->inputs()) { + if (input.get_element_type() == ov::element::i64) { + auto parentOutput = input.get_source_output(); + auto parentNode = parentOutput.get_node_shared_ptr(); + if (ov::is_type(parentNode) && + parentNode->get_rt_info().find("convert_i32_i64") != parentNode->get_rt_info().end()) { + input.replace_source_output(parentNode->input_value(0)); + } else if (auto constOp = ov::as_type_ptr(parentNode)) { + auto newConst = changeConstantPrecision(constOp); + input.replace_source_output(newConst); + newConst->set_friendly_name(constOp->get_friendly_name()); + } else { + auto convert = std::make_shared(input.get_source_output(), ov::element::i32); + convert->output(0).add_names(parentOutput.get_names()); + input.replace_source_output(convert); + } + convertForOutputsRequired = true; + } + } + + if (convertForOutputsRequired) { + // Propagate i32 precision into outputs. + op->validate_and_infer_types(); + for (auto& output : op->outputs()) { + if (output.get_element_type() == ov::element::i32) { + auto targetInputs = output.get_target_inputs(); + auto convert = std::make_shared(output, ov::element::i64); + + auto& rt_info = convert->get_rt_info(); + rt_info["convert_i32_i64"] = ""; + for (const auto& targetInput : targetInputs) { + targetInput.replace_source_output(convert); + } + + auto& convertTensor = convert->output(0).get_tensor(); + const std::string newName = ov::op::util::get_ie_output_name(output); + if (ov::descriptor::get_ov_tensor_legacy_name(convertTensor).empty()) { + ov::descriptor::set_ov_tensor_legacy_name(convertTensor, newName); + } + if (!output.get_names().empty()) { + convertTensor.set_names(output.get_names()); + } + } + } + } + + if (auto multisubgraph_op = ov::as_type_ptr(op)) { + for (size_t idx = 0; idx < multisubgraph_op->get_internal_subgraphs_size(); ++idx) { + run_on_model(multisubgraph_op->get_function(static_cast(idx))); + } + } + } + + return true; +} diff --git 
a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp new file mode 100644 index 00000000000000..a3aa5a6f35455e --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp @@ -0,0 +1,21 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace intel_cpu { +class ConvertPrecisionI64ToI32: public ov::pass::ModelPass { +public: + OPENVINO_RTTI("ConvertPrecisionI64ToI32", "0"); + + ConvertPrecisionI64ToI32() = default; + + bool run_on_model(const std::shared_ptr& model) override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 1ec5d40071d73e..f833edda7658a2 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2022-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -25,6 +25,7 @@ #include "transformations/common_optimizations/fq_mul_fusion.hpp" #include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" +#include "transformations/common_optimizations/reshape_prelu.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" #include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp" #include "transformations/common_optimizations/augru_cell_fusion.hpp" @@ -53,8 +54,6 @@ #include "transformations/op_conversions/convert_slice_to_strided_slice.hpp" #include 
"transformations/op_conversions/convert_space_to_batch.hpp" #include "transformations/op_conversions/convert_space_to_depth.hpp" -#include "transformations/op_conversions/convert_subtract.hpp" -#include "transformations/op_conversions/convert_ti_to_sequences.hpp" #include "transformations/op_conversions/detection_output_downgrade.hpp" #include "transformations/op_conversions/detection_output_upgrade.hpp" #include "transformations/op_conversions/eye_decomposition.hpp" @@ -98,11 +97,7 @@ #include "transformations/snippets/x64/pass/snippets_mark_skipped.hpp" #include "transformations/cpu_opset/x64/pass/mha_fusion.hpp" #include "transformations/cpu_opset/x64/pass/convert_to_interaction.hpp" -#include "transformations/cpu_opset/arm/pass/convert_group_conv.hpp" -#include "transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp" -#include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp" -#include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp" -#include "transformations/cpu_opset/common/pass/decompose_integer_divide.hpp" +#include "transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp" #include "transformations/cpu_opset/common/pass/convert_fq_rnn_to_quantized_rnn.hpp" #include "transformations/cpu_opset/common/pass/insert_convert_after_extension.hpp" #include "transformations/cpu_opset/common/pass/move_eltwise_up_data_movement.hpp" @@ -127,7 +122,7 @@ namespace intel_cpu { using const_node_ptr = const std::shared_ptr; -bool Transformations::fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions) { +bool Transformations::fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions) { const auto& from = node->get_output_element_type(0); auto it = precisions.find(from); if (it == precisions.end()) @@ -139,7 +134,7 @@ bool Transformations::fuse_type_to_convert(const std::shared_ptr& // is converted to be 1 for boolean, but 0 for u8. 
Thus an Abs and Ceil node should be added before the // Convert node for this scenario. if (convert->input(0).get_element_type().is_real() && - convert->get_convert_element_type() == ngraph::element::boolean && to.is_integral_number()) { + convert->get_convert_element_type() == ov::element::boolean && to.is_integral_number()) { auto abs = std::make_shared(convert->input_value(0).get_node_shared_ptr()); auto ceil = std::make_shared(abs); auto new_convert = std::make_shared(ceil, to); @@ -208,11 +203,10 @@ void Transformations::PreLpt(const std::vector& defaultPrecis if (useLpt) { CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions); } + bool supportI64 = config.enableNativeI64; - auto get_convert_precisions = []() { + auto get_convert_precisions = [&]() { precisions_map map = { - {ov::element::i64, ov::element::i32}, - {ov::element::u64, ov::element::i32}, {ov::element::i16, ov::element::i32}, {ov::element::u16, ov::element::i32}, {ov::element::u32, ov::element::i32}, @@ -223,12 +217,21 @@ void Transformations::PreLpt(const std::vector& defaultPrecis {ov::element::u4, ov::element::u8} }; - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) + if (supportI64) { + map.insert({ov::element::u64, ov::element::i64}); + } else { + map.insert({ov::element::u64, ov::element::i32}); + map.insert({ov::element::i64, ov::element::i32}); + } + + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) { map.insert({ov::element::bf16, ov::element::f32}); + } return map; }; - static const auto precisions = get_convert_precisions(); + + const auto precisions = get_convert_precisions(); type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}}; CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); @@ -263,8 +266,13 @@ void Transformations::PreLpt(const std::vector& defaultPrecis // Common ConvertPrecision pass handles only a limited set of opevino operations to 
match the list of precisions supported by the plugin. // However, if the extension operation produces an output precision that is not natively supported, this may lead to inconsistency during // element type propagation. This transformation is called before the ConvertPrecision pass to align the actual precisions with the list of supported ones. - CPU_REGISTER_PASS_COMMON(manager, ov::pass::InsertConvertAfterExtension); + if (!supportI64) { + CPU_REGISTER_PASS_COMMON(manager, ov::pass::InsertConvertAfterExtension); + } CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions, type_to_fuse); + if (supportI64) { + CPU_REGISTER_PASS_X64(manager, ConvertPrecisionI64ToI32); + } CPU_REGISTER_PASS_COMMON(manager, ov::pass::EliminateConvert); CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h index 57ad2e95e122af..290011951aa264 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h @@ -62,7 +62,7 @@ class Transformations { void Snippets(void); - static bool fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions); + static bool fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions); }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/utils/blob_dump.cpp b/src/plugins/intel_cpu/src/utils/blob_dump.cpp index dce76d115d0908..af4b32babce63e 100644 --- a/src/plugins/intel_cpu/src/utils/blob_dump.cpp +++ b/src/plugins/intel_cpu/src/utils/blob_dump.cpp @@ -166,6 +166,12 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) const { const void *ptr = memory->getData(); switch (desc.getPrecision()) { + case Precision::FP64 : { + auto *blob_ptr = reinterpret_cast(ptr); + for (size_t i = 0; i < data_size; i++) + stream << 
blob_ptr[desc.getElementOffset(i)] << std::endl; + break; + } case Precision::FP32 : { auto *blob_ptr = reinterpret_cast(ptr); for (size_t i = 0; i < data_size; i++) @@ -180,6 +186,12 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) const { } break; } + case Precision::I64: { + auto *blob_ptr = reinterpret_cast(ptr); + for (size_t i = 0; i < data_size; i++) + stream << blob_ptr[desc.getElementOffset(i)] << std::endl; + break; + } case Precision::I32: { auto *blob_ptr = reinterpret_cast(ptr); for (size_t i = 0; i < data_size; i++) diff --git a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp index 870b081ba277cb..d28b1aeda33931 100644 --- a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp +++ b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp @@ -102,7 +102,8 @@ inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine: case InferenceEngine::Precision::I8: case InferenceEngine::Precision::I32: case InferenceEngine::Precision::BF16: - case InferenceEngine::Precision::FP32: { + case InferenceEngine::Precision::FP32: + case InferenceEngine::Precision::I64: { break; } case InferenceEngine::Precision::FP64: { @@ -113,11 +114,13 @@ inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine: precision = InferenceEngine::Precision::U8; break; } + case InferenceEngine::Precision::U64: { + precision = InferenceEngine::Precision::I64; + break; + } case InferenceEngine::Precision::U16: case InferenceEngine::Precision::I16: - case InferenceEngine::Precision::U32: - case InferenceEngine::Precision::I64: - case InferenceEngine::Precision::U64: { + case InferenceEngine::Precision::U32: { precision = InferenceEngine::Precision::I32; break; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp index 769aea85731bdb..2eff2b4792495e 100644 --- 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp @@ -8,6 +8,7 @@ using namespace LayerTestsDefinitions; using namespace LayerTestsDefinitions::ComparisonParams; +using namespace InferenceEngine; namespace { @@ -20,13 +21,6 @@ std::map, std::vector>> inputShapes = { {{2, 1, 1, 3, 1}, {{1}, {1, 3, 4}, {2, 1, 3, 4}, {1, 1, 1, 1, 1}}}, }; -std::vector inputsPrecisions = { - InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16, - InferenceEngine::Precision::I32, - InferenceEngine::Precision::BOOL, -}; - std::vector comparisonOpTypes = { ngraph::helpers::ComparisonTypes::EQUAL, ngraph::helpers::ComparisonTypes::NOT_EQUAL, @@ -43,17 +37,29 @@ std::vector secondInputTypes = { std::map additional_config = {}; -const auto ComparisonTestParams = ::testing::Combine( - ::testing::ValuesIn(CommonTestUtils::combineParams(inputShapes)), - ::testing::ValuesIn(inputsPrecisions), - ::testing::ValuesIn(comparisonOpTypes), - ::testing::ValuesIn(secondInputTypes), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config)); - -INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs, ComparisonLayerTest, ComparisonTestParams, ComparisonLayerTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs, ComparisonLayerTest, + ::testing::Combine( + ::testing::ValuesIn(CommonTestUtils::combineParams(inputShapes)), + ::testing::ValuesIn(std::vector{Precision::FP32, Precision::I32, Precision::I64}), + ::testing::ValuesIn(comparisonOpTypes), + ::testing::ValuesIn(secondInputTypes), + ::testing::Values(Precision::UNSPECIFIED), + ::testing::Values(Precision::UNSPECIFIED), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + 
ComparisonLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(nightly_CompareWithRefs, ComparisonLayerTest, + ::testing::Combine( + ::testing::ValuesIn(CommonTestUtils::combineParams(inputShapes)), + ::testing::ValuesIn(std::vector{Precision::FP16, Precision::BOOL}), + ::testing::ValuesIn(comparisonOpTypes), + ::testing::ValuesIn(secondInputTypes), + ::testing::Values(Precision::UNSPECIFIED), + ::testing::Values(Precision::UNSPECIFIED), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ComparisonLayerTest::getTestCaseName); std::vector inputShapesIsOps = { @@ -80,11 +86,11 @@ std::vector comparisonOpTypesIs = { const auto ComparisonTestParamsIs = ::testing::Combine( ::testing::ValuesIn(inputShapesIsOps), - ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(Precision::FP32), ::testing::ValuesIn(comparisonOpTypesIs), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(Precision::UNSPECIFIED), + ::testing::Values(Precision::UNSPECIFIED), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/concat.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/concat.cpp index 9111bf532ce88e..017e217e29ded9 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/concat.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/concat.cpp @@ -5,7 +5,6 @@ #include #include "single_layer_tests/concat.hpp" -#include "common_test_utils/test_constants.hpp" using namespace LayerTestsDefinitions; @@ -20,15 +19,11 @@ std::vector>> inShapes = { {{10, 10, 10, 10}, {10, 10, 10, 10}, {10, 10, 10, 10}, {10, 10, 10, 10}, {10, 10, 10, 10}} }; - 
-std::vector netPrecisions = {InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16}; - INSTANTIATE_TEST_SUITE_P(smoke_NoReshape, ConcatLayerTest, ::testing::Combine( ::testing::ValuesIn(axes), ::testing::ValuesIn(inShapes), - ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), ::testing::Values(InferenceEngine::Layout::ANY), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp index 8192c4089c4c97..afbb218fc2bd21 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp @@ -48,7 +48,7 @@ std::vector> inShapesDynamicLargeUpperBound = std::vector netPrecisions = { ov::element::f32, ov::element::f16, - ov::element::i32, + ov::element::i32 }; std::vector secondaryInputTypes = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/minimum_maximum.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/minimum_maximum.cpp deleted file mode 100644 index 6f3c729fd18688..00000000000000 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/minimum_maximum.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include "single_layer_tests/minimum_maximum.hpp" -#include "common_test_utils/test_constants.hpp" - -using namespace LayerTestsDefinitions; - -namespace { - -const std::vector>> inShapes = { - {{2}, {1}}, - {{1, 1, 1, 3}, {1}}, - {{1, 2, 4}, {1}}, - {{1, 4, 4}, {1}}, - {{1, 4, 4, 1}, {1}}, - {{256, 56}, {256, 56}}, - {{8, 
1, 6, 1}, {7, 1, 5}}, -}; - -const std::vector netPrecisions = { - InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16, -}; - -const std::vector opType = { - ngraph::helpers::MinMaxOpType::MINIMUM, - ngraph::helpers::MinMaxOpType::MAXIMUM, -}; - -const std::vector inputType = { - ngraph::helpers::InputLayerType::CONSTANT, - ngraph::helpers::InputLayerType::PARAMETER, -}; - -INSTANTIATE_TEST_SUITE_P(smoke_maximum, MaxMinLayerTest, - ::testing::Combine( - ::testing::ValuesIn(inShapes), - ::testing::ValuesIn(opType), - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::ValuesIn(inputType), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - MaxMinLayerTest::getTestCaseName); - -} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/non_max_suppression.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/non_max_suppression.cpp index 18a8aa36044458..402ccbd94cddfe 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/non_max_suppression.cpp @@ -39,4 +39,19 @@ const auto nmsParams = ::testing::Combine(::testing::ValuesIn(inShapeParams), ::testing::Values(CommonTestUtils::DEVICE_CPU) ); +const auto nmsParams_i64 = ::testing::Combine(::testing::ValuesIn(inShapeParams), + ::testing::Combine(::testing::Values(Precision::FP32), + ::testing::Values(Precision::I64), + ::testing::Values(Precision::FP32)), + ::testing::ValuesIn(maxOutBoxPerClass), + ::testing::ValuesIn(threshold), + ::testing::ValuesIn(threshold), + ::testing::ValuesIn(sigmaThreshold), + ::testing::ValuesIn(encodType), + 
::testing::ValuesIn(sortResDesc), + ::testing::ValuesIn(outType), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + INSTANTIATE_TEST_SUITE_P(smoke_NmsLayerTest, NmsLayerTest, nmsParams, NmsLayerTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_NmsLayerTest_i64, NmsLayerTest, nmsParams_i64, NmsLayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/range.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/range.cpp index 70403b61629d28..6d128ec3b37164 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/range.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/range.cpp @@ -17,7 +17,8 @@ const std::vector step = { 1.0f, 0.1f }; const std::vector netPrecisions = { InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16 + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I64 }; INSTANTIATE_TEST_SUITE_P(smoke_Basic, RangeLayerTest, diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reduce_ops.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reduce_ops.cpp index e5ae486545f926..a8c1a713f55fdd 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reduce_ops.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reduce_ops.cpp @@ -13,9 +13,7 @@ namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, - InferenceEngine::Precision::I64, - InferenceEngine::Precision::I32, - InferenceEngine::Precision::U64 + InferenceEngine::Precision::I32 }; const std::vector keepDims = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp index 7d3923f8be55dc..719e834a3f32cb 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp @@ -12,7 +12,8 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16 + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I64 }; INSTANTIATE_TEST_SUITE_P(smoke_ReshapeCheck, ReshapeLayerTest, diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_ND_update.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_ND_update.cpp index 28698967cbd17a..61f893d896f856 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_ND_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_ND_update.cpp @@ -16,6 +16,7 @@ const std::vector inputPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, InferenceEngine::Precision::I32, + InferenceEngine::Precision::I64 }; const std::vector idxPrecisions = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp index 3a2033805d57b5..b633a3fc516522 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp @@ -27,6 +27,7 @@ const std::vector inputPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, 
InferenceEngine::Precision::I32, + InferenceEngine::Precision::I64 }; const std::vector idxPrecisions = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_update.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_update.cpp index 60a3f488c0040f..6afbf6fe39587f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_update.cpp @@ -15,7 +15,7 @@ namespace { const std::vector inputPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, - InferenceEngine::Precision::I32, + InferenceEngine::Precision::I32 }; const std::vector idxPrecisions = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/select.cpp index 5f76a8462c51fb..22ab4942f69eb1 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/select.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/select.cpp @@ -13,9 +13,9 @@ const std::vector inputPrecision = { InferenceEngine::Precision::I8, InferenceEngine::Precision::I16, InferenceEngine::Precision::I32, - InferenceEngine::Precision::FP32 + InferenceEngine::Precision::FP32, // CPU plug-in doesn't support I64 and U64 precisions at the moment - // InferenceEngine::Precision::I64 + InferenceEngine::Precision::I64 }; const std::vector>> noneShapes = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp index 3e00f12b22284f..c98a60447ea06e 100644 --- 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/squeeze_unsqueeze.cpp @@ -30,7 +30,8 @@ std::map, std::vector>> emptyAxesVectors = const std::vector netPrecisions = { InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16 + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I64 }; const std::vector opTypes = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/tile.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/tile.cpp index 52eff3f90d0a39..a21f16f6e1fcba 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/tile.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/tile.cpp @@ -15,7 +15,8 @@ const std::vector netPrecisions = { InferenceEngine::Precision::U8, InferenceEngine::Precision::I32, InferenceEngine::Precision::BF16, - InferenceEngine::Precision::FP32 + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I64 }; const std::vector netTPrecisions = { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 4b79a3bf83e417..9aa442dd236cdf 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -74,10 +74,11 @@ std::vector disabledTestPatterns() { R"(.*OVCompiledModelBaseTest.*(CanGetInputsInfoAndCheck|canSetConfigToCompiledModel).*)", R"(.*Behavior.*CorrectConfigCheck.*(canSetConfigAndCheckGetConfig|canSetConfigTwiceAndCheckGetConfig).*CPU_BIND_THREAD=YES.*)", // Issue: 72021 Unreasonable abs_threshold for comparing bf16 results - 
R"(.*smoke_Reduce.*type=(Prod|Min).*netPRC=(BF|bf)16.*)", + R"(.*smoke_Reduce.*type=Prod.*netPRC=(BF|bf)16.*)", // TODO: 56520 Accuracy mismatch - R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=(I64|I32).*)", - R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=U64.*)", + R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=I32.*)", + R"(.*fusing.*ReduceCPULayerTest.*netPRC=(i|u)64.*CPU_NATIVE_I64=YES.*)", + R"(.*smoke.*Split.*(4D|5D).*netPRC=i8.*)", // Not implemented yet: R"(.*Behavior.*ExecutableNetworkBaseTest.*canSetConfigToExecNet.*)", R"(.*Behavior.*OVCompiledModelBaseTest.*canSetConfigToCompiledModel.*)", diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/broadcast.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/broadcast.cpp index 84812e1d048bc6..ff796fc21701d5 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/broadcast.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/broadcast.cpp @@ -6,39 +6,41 @@ #include "ngraph_functions/builders.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include +#include using namespace CPUTestUtils; +using namespace ov::test; namespace CPULayerTestsDefinitions { using BroadcastLayerTestParamsSet = typename std::tuple< - std::vector, // Shapes + std::vector, // Shapes std::vector, // Target shapes std::vector, // Axes mapping ov::op::BroadcastType, // Broadcast mode - ov::element::Type_t, // Network precision + ElementType, // Network precision std::vector, // Const inputs - std::string>; // Device name + ov::AnyMap>; // Additional network configuration using BroadcastLayerCPUTestParamsSet = typename std::tuple< BroadcastLayerTestParamsSet, CPUSpecificParams>; class BroadcastLayerCPUTest : public testing::WithParamInterface, - virtual public ov::test::SubgraphBaseTest, public CPUTestsBase { + virtual public SubgraphBaseTest, public CPUTestsBase { public: static std::string getTestCaseName(testing::TestParamInfo obj) { BroadcastLayerTestParamsSet basicParamsSet; 
CPUSpecificParams cpuParams; std::tie(basicParamsSet, cpuParams) = obj.param; - std::vector inputShapes; + std::vector inputShapes; std::vector targetShapes, axesMapping; ov::op::BroadcastType mode; - ov::element::Type_t netPrecision; + ElementType netPrecision; std::vector isConstInputs; - std::string deviceName; - std::tie(inputShapes, targetShapes, axesMapping, mode, netPrecision, isConstInputs, deviceName) = basicParamsSet; + ov::AnyMap additionalConfig; + std::tie(inputShapes, targetShapes, axesMapping, mode, netPrecision, isConstInputs, additionalConfig) = basicParamsSet; std::ostringstream result; result << "IS=("; @@ -56,7 +58,13 @@ class BroadcastLayerCPUTest : public testing::WithParamInterface inputPrecisions = { - ov::element::f32, - ov::element::bf16, - ov::element::i32, - ov::element::i8 +const std::vector inputPrecisions = { + ElementType::f32, + ElementType::bf16, + ElementType::i32, + ElementType::i8 +}; + +const ov::AnyMap emptyConfig = {}; +const ov::AnyMap i64Config = { + {InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES} }; + /* ============= */ /* INSTANCES */ @@ -221,7 +236,7 @@ const std::vector CPUParams4D = { cpuParams_nhwc }; -const std::vector> staticInputShapes4D = { +const std::vector> staticInputShapes4D = { { {{}, { // Static shapes @@ -247,7 +262,20 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape4D, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::Values(std::vector{true, true}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), + ::testing::ValuesIn(CPUParams4D)), + BroadcastLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_StaticShape4D_I64, BroadcastLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::Values(staticInputShapes4D[0]), + ::testing::ValuesIn(std::vector>{{1, 16, 3, 3}, {1, 16, 1, 3}}), + ::testing::Values(std::vector{}), + 
::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::Values(ElementType::i64), + ::testing::Values(std::vector{true, true}), + ::testing::Values(i64Config)), ::testing::ValuesIn(CPUParams4D)), BroadcastLayerCPUTest::getTestCaseName); @@ -260,11 +288,11 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape4DE, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::EXPLICIT), ::testing::ValuesIn(inputPrecisions), ::testing::Values(std::vector{true, true}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); -const std::vector> staticInputShapesScalar = { +const std::vector> staticInputShapesScalar = { { {{}, { // Static shapes @@ -283,11 +311,11 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape4DScalar, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::Values(std::vector{true, true}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); -const std::vector> dynamicInputShapes4D = { +const std::vector> dynamicInputShapes4D = { { { // Origin dynamic shapes {ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20)}, @@ -317,11 +345,23 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4D, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(std::vector>{{true, true}, {false, true}}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); -const std::vector> dynamicInputShapesScalar = { +INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4D_I64, BroadcastLayerCPUTest, + ::testing::Combine(::testing::Combine( + 
::testing::ValuesIn(dynamicInputShapes4D), + ::testing::ValuesIn(std::vector>{{8, 16, 1, 7}, {8, 16, 10, 7}}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(std::vector>{{true, true}, {false, true}}), + ::testing::Values(i64Config)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + BroadcastLayerCPUTest::getTestCaseName); + +const std::vector> dynamicInputShapesScalar = { { { // Origin dynamic shapes {-1}, @@ -341,12 +381,12 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4DScalar, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(std::vector>{{true, true}, {false, true}}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); // 5D -const std::vector> staticInputShapes5D = { +const std::vector> staticInputShapes5D = { { {{}, { // Static shapes @@ -355,7 +395,7 @@ const std::vector> staticInputShapes5D = { } } }; -const std::vector> dynamicInputShapes5D = { +const std::vector> dynamicInputShapes5D = { { { // Origin dynamic shapes {ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20)}, @@ -396,7 +436,20 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape5D, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::Values(std::vector{true, true}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), + ::testing::ValuesIn(CPUParams5D)), + BroadcastLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_StaticShape5D_I64, BroadcastLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(staticInputShapes5D), + ::testing::ValuesIn(std::vector>{{1, 16, 1, 1, 3}, {1, 16, 3, 1, 
3}}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::Values(ElementType::i64), + ::testing::Values(std::vector{true, true}), + ::testing::Values(i64Config)), ::testing::ValuesIn(CPUParams5D)), BroadcastLayerCPUTest::getTestCaseName); @@ -409,7 +462,7 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape5DScalar, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::Values(std::vector{true, true}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); @@ -422,7 +475,7 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape5D, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(std::vector>{{true, true}, {false, true}}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); @@ -435,12 +488,12 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape5DScalar, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(std::vector>{{true, true}, {false, true}}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); // 1D -const std::vector> dynamicShapes1D = { +const std::vector> dynamicShapes1D = { { { // Origin dynamic shapes {-1}, @@ -460,7 +513,7 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShapes1D, BroadcastLayerCPUTest, ::testing::Values(ov::op::BroadcastType::NUMPY), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(std::vector>{{false, true}}), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyConfig)), 
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); /* ========= */ diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp index b9764711d99204..976f9549b32fc9 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp @@ -5,8 +5,8 @@ #include "activation.hpp" #include "gtest/gtest.h" #include "test_utils/cpu_test_utils.hpp" +#include -using namespace InferenceEngine; using namespace CPUTestUtils; using namespace ngraph::helpers; using namespace ov::test; @@ -17,9 +17,10 @@ std::string ActivationLayerCPUTest::getTestCaseName(const testing::TestParamInfo std::vector inputShapes; std::vector activationShapes; std::pair> activationTypeAndConstValue; - InferenceEngine::Precision netPrecision, inPrecision, outPrecision; + ElementType netPrecision, inPrecision, outPrecision; CPUTestUtils::CPUSpecificParams cpuParams; - std::tie(inputShapes, activationShapes, activationTypeAndConstValue, netPrecision, inPrecision, outPrecision, cpuParams) = obj.param; + ov::AnyMap config; + std::tie(inputShapes, activationShapes, activationTypeAndConstValue, netPrecision, inPrecision, outPrecision, config, cpuParams) = obj.param; std::ostringstream result; result << LayerTestsDefinitions::activationNames[activationTypeAndConstValue.first] << "_"; @@ -39,20 +40,28 @@ std::string ActivationLayerCPUTest::getTestCaseName(const testing::TestParamInfo } result << "AS=" << CommonTestUtils::vec2str(activationShapes) << "_"; result << "ConstantsValue=" << CommonTestUtils::vec2str(activationTypeAndConstValue.second) << "_"; - result << "netPRC=" << netPrecision.name() << "_"; - result << "inPRC=" << inPrecision.name() << "_"; - result << "outPRC=" << outPrecision.name() << "_"; + result << "netPRC=" << 
netPrecision << "_"; + result << "inPRC=" << inPrecision << "_"; + result << "outPRC=" << outPrecision << "_"; result << CPUTestUtils::CPUTestsBase::getTestCaseName(cpuParams); + if (!config.empty()) { + result << "_PluginConf"; + for (const auto& configItem : config) { + result << "_" << configItem.first << "="; + configItem.second.print(result); + } + } + return result.str(); } -void ActivationLayerCPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { +void ActivationLayerCPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { int32_t startFrom = 0; uint32_t range = 0; int32_t resolution = 0; - if (activationType == ActivationTypes::Exp && netPrecision == Precision::BF16) { + if (activationType == ActivationTypes::Exp && netPrecision == ElementType::bf16) { startFrom = 0; range = 2; resolution = 32768; @@ -93,18 +102,16 @@ void ActivationLayerCPUTest::SetUp() { std::vector inputShapes; std::vector activationShapes; std::pair> activationTypeAndConstValue; - InferenceEngine::Precision inPrecision, outPrecision; CPUTestUtils::CPUSpecificParams cpuParams; - std::tie(inputShapes, activationShapes, activationTypeAndConstValue, netPrecision, inPrecision, outPrecision, cpuParams) = this->GetParam(); + + std::tie(inputShapes, activationShapes, activationTypeAndConstValue, netPrecision, inType, outType, configuration, cpuParams) = this->GetParam(); std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + activationType = activationTypeAndConstValue.first; auto constantsValue = activationTypeAndConstValue.second; - inType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrecision); - outType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(outPrecision); - selectedType = getPrimitiveType() + "_" + netPrecision.name(); - #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) + selectedType = getPrimitiveType() + "_" + netPrecision.name(); # if defined(OPENVINO_ARCH_ARM) if (activationType == 
ngraph::helpers::ActivationTypes::GeluErf) // @todo tmp fallback to ref, gelu erf is disabled for 32bit ARM selectedType = std::string("ref_") + netPrecision.name(); @@ -114,17 +121,30 @@ void ActivationLayerCPUTest::SetUp() { inputShapes.front().first.rank().get_length() > 5) // @todo tmp fallback to ref, remove after 6D+ ranks are properly supported selectedType = std::string("ref_") + netPrecision.name(); #else + selectedType = getPrimitiveType(); if (activationType == ngraph::helpers::ActivationTypes::Log) // @todo tmp fallback to ref, remove after Log is supported in emitters - selectedType = std::string("ref_") + netPrecision.name(); + selectedType = std::string("ref"); + + if (netPrecision == ElementType::i64 || netPrecision == ElementType::u64) { + auto i64It = configuration.find(InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64It == configuration.end() || i64It->second == InferenceEngine::PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i64); + } + } else if (netPrecision == ElementType::boolean) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i8); + } else { + selectedType = makeSelectedTypeStr(selectedType, netPrecision); + } #endif init_input_shapes(inputShapes); - auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); - auto params = ngraph::builder::makeDynamicParams(ngPrc, {inputDynamicShapes.front()}); - auto activation = ngraph::builder::makeActivation(params[0], ngPrc, activationType, activationShapes, constantsValue); + auto params = ngraph::builder::makeDynamicParams(netPrecision, {inputDynamicShapes.front()}); + auto activation = ngraph::builder::makeActivation(params[0], netPrecision, activationType, activationShapes, constantsValue); activation->get_rt_info() = getCPUInfo(); - function = std::make_shared(ngraph::NodeVector{activation}, params, 
"Activation"); + function = std::make_shared(ov::NodeVector{activation}, params, "Activation"); } TEST_P(ActivationLayerCPUTest, CompareWithRefs) { @@ -160,8 +180,8 @@ const std::map>>& activationType return activationTypes; } -const std::vector& netPrc() { - static const std::vector netPrc{Precision::FP32}; +const std::vector& netPrc() { + static const std::vector netPrc{ElementType::f32}; return netPrc; } @@ -245,9 +265,9 @@ const std::map>>& activationType return activationTypesDynamicMath; } -const std::vector& netPrecisions() { - static const std::vector netPrecisions { - InferenceEngine::Precision::FP32 +const std::vector& netPrecisions() { + static const std::vector netPrecisions { + ElementType::f32 }; return netPrecisions; diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.hpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.hpp index b7881fae053691..b443011d019894 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.hpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.hpp @@ -9,7 +9,6 @@ #include "shared_test_classes/base/ov_subgraph.hpp" #include #include "test_utils/cpu_test_utils.hpp" -#include "gtest/gtest.h" namespace CPULayerTestsDefinitions { @@ -17,9 +16,10 @@ using ActivationLayerCPUTestParamSet = std::tuple, // Input shapes std::vector, // Activation shapes std::pair>, // Activation type and constant value - InferenceEngine::Precision, // Net precision - InferenceEngine::Precision, // Input precision - InferenceEngine::Precision, // Output precision + ov::test::ElementType, // Net precision + ov::test::ElementType, // Input precision + ov::test::ElementType, // Output precision + ov::AnyMap, // Additional network configuration CPUTestUtils::CPUSpecificParams>; class ActivationLayerCPUTest : public testing::WithParamInterface, @@ -27,13 +27,13 @@ class ActivationLayerCPUTest : public 
testing::WithParamInterface &obj); - void generate_inputs(const std::vector& targetInputStaticShapes) override; + void generate_inputs(const std::vector& targetInputStaticShapes) override; protected: void SetUp() override; private: - InferenceEngine::Precision netPrecision = InferenceEngine::Precision::UNSPECIFIED; + ov::test::ElementType netPrecision = ov::test::ElementType::undefined; ngraph::helpers::ActivationTypes activationType = ngraph::helpers::None; }; @@ -43,7 +43,7 @@ const std::vector activationShapes(); const std::map>>& activationTypes(); -const std::vector& netPrc(); +const std::vector& netPrc(); /* ============= Activation (1D) ============= */ const std::vector& cpuParams3D(); @@ -62,7 +62,7 @@ const std::vector>& basic5D(); const std::map>>& activationTypesDynamicMath(); -const std::vector& netPrecisions(); +const std::vector& netPrecisions(); const std::vector& cpuParamsDynamicMath(); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.cpp index bac15ee7f0152a..c5e259aa9b6f17 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.cpp @@ -3,22 +3,21 @@ // #include "conversion.hpp" - -#include "gtest/gtest.h" #include "test_utils/cpu_test_utils.hpp" +#include using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ngraph::helpers; using namespace ov::test; namespace CPULayerTestsDefinitions { std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo obj) { InputShape inputShape; - InferenceEngine::Precision inPrc, outPrc; + ElementType inPrc, outPrc; CPUSpecificParams cpuParams; - std::tie(inputShape, inPrc, outPrc, cpuParams) = obj.param; + ov::AnyMap config; + std::tie(inputShape, inPrc, outPrc, config, cpuParams) = obj.param; std::ostringstream result; @@ -27,26 +26,34 
@@ std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo primitive has to be changed + // TODO: remove the WA after I32 is supported in snippets (ticket: 99803) + if (inPrc == ElementType::i32 || inPrc == ElementType::i64 || outPrc == ElementType::i32 || outPrc == ElementType::i64) + primitive = "unknown"; + + if (inPrc == ElementType::i64 || inPrc == ElementType::u64) { + auto i64Flag = configuration.find(PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64Flag == configuration.end() || i64Flag->second == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(primitive, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(primitive, ElementType::i64); + } + } else if (inPrc == ElementType::u8) { + selectedType = makeSelectedTypeStr(primitive, ElementType::i8); + } else { + selectedType = makeSelectedTypeStr(primitive, inPrc); + } for (size_t i = 0; i < shapes.second.size(); i++) { - targetStaticShapes.push_back(std::vector{shapes.second[i]}); + targetStaticShapes.push_back(std::vector{shapes.second[i]}); } inputDynamicShapes.push_back(shapes.first); - auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrc); - auto targetPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(outPrc); - ParameterVector params = builder::makeDynamicParams(ngPrc, inputDynamicShapes); - auto conversion = ngraph::builder::makeConversion(params.front(), targetPrc, helpers::ConversionTypes::CONVERT); + ov::ParameterVector params = ngraph::builder::makeDynamicParams(inPrc, inputDynamicShapes); + auto conversion = ngraph::builder::makeConversion(params.front(), outPrc, ngraph::helpers::ConversionTypes::CONVERT); - function = makeNgraphFunction(ngPrc, params, conversion, "ConversionCPU"); + function = makeNgraphFunction(inPrc, params, conversion, "ConversionCPU"); } -void ConvertCPULayerTest::generate_inputs(const std::vector& targetInputStaticShapes) { - if (outPrc != Precision::BOOL) { +void 
ConvertCPULayerTest::generate_inputs(const std::vector& targetInputStaticShapes) { + if (outPrc != ElementType::boolean) { SubgraphBaseTest::generate_inputs(targetInputStaticShapes); return; } @@ -104,13 +119,13 @@ void ConvertCPULayerTest::generate_inputs(const std::vector& targ size_t size = shape_size(shape); ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(funcInputs[0].get_element_type(), shape, 2 * size); - if (inPrc == Precision::FP32) { + if (inPrc == ElementType::f32) { auto* rawBlobDataPtr = static_cast(tensor.data()); for (size_t i = 0; i < size; ++i) { rawBlobDataPtr[i] = rawBlobDataPtr[i] / size - 1; } - } else if (inPrc == Precision::BF16) { - auto* rawBlobDataPtr = static_cast(tensor.data()); + } else if (inPrc == ElementType::bf16) { + auto* rawBlobDataPtr = static_cast(tensor.data()); for (size_t i = 0; i < size; ++i) { rawBlobDataPtr[i] = rawBlobDataPtr[i] / size - 1; } @@ -162,13 +177,13 @@ const std::vector& inShapes_4D_dynamic() { return inShapes_4D_dynamic; } -const std::vector& precisions() { - static const std::vector precisions = { - Precision::U8, - Precision::I8, - Precision::I32, - Precision::FP32, - Precision::BF16 +const std::vector& precisions() { + static const std::vector precisions = { + ElementType::u8, + ElementType::i8, + ElementType::i32, + ElementType::f32, + ElementType::bf16 }; return precisions; } diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.hpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.hpp index 10c331a0ff255d..b08a8dbf8eed31 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.hpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/conversion.hpp @@ -9,35 +9,33 @@ #include "shared_test_classes/base/ov_subgraph.hpp" #include #include "test_utils/cpu_test_utils.hpp" -#include "gtest/gtest.h" - -using namespace InferenceEngine; -using namespace ngraph; -using namespace 
CPUTestUtils; -using namespace ov::test; namespace CPULayerTestsDefinitions { -using convertLayerTestParamsSet = std::tuple; +using convertLayerTestParamsSet = std::tuple< + ov::test::InputShape, // input shapes + ov::test::ElementType, // input precision + ov::test::ElementType, // output precision + ov::AnyMap, // Additional plugin configuration + CPUTestUtils::CPUSpecificParams +>; class ConvertCPULayerTest : public testing::WithParamInterface, - virtual public SubgraphBaseTest, public CPUTestsBase { + virtual public ov::test::SubgraphBaseTest, public CPUTestUtils::CPUTestsBase { public: static std::string getTestCaseName(testing::TestParamInfo obj); - static bool isInOutPrecisionSupported(InferenceEngine::Precision inPrc, InferenceEngine::Precision outPrc); + static bool isInOutPrecisionSupported(ov::test::ElementType inPrc, ov::test::ElementType outPrc); protected: void SetUp() override; - void generate_inputs(const std::vector& targetInputStaticShapes) override; + void generate_inputs(const std::vector& targetInputStaticShapes) override; private: - InferenceEngine::Precision inPrc, outPrc; + ov::test::ElementType inPrc, outPrc; }; namespace Conversion { - const std::vector& inShapes_4D_static(); - const std::vector& inShapes_4D_dynamic(); - const std::vector& precisions(); + const std::vector& inShapes_4D_static(); + const std::vector& inShapes_4D_dynamic(); + const std::vector& precisions(); + } // namespace Conversion -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp index d81de3af743ee1..72b0fc163adbe5 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp @@ -5,6 +5,7 @@ #include "eltwise.hpp" #include 
"gtest/gtest.h" #include "test_utils/cpu_test_utils.hpp" +#include using namespace InferenceEngine; using namespace CPUTestUtils; @@ -28,7 +29,7 @@ std::string EltwiseLayerCPUTest::getTestCaseName(testing::TestParamInfo& targetInputStaticShapes) { +void EltwiseLayerCPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { inputs.clear(); const auto& funcInputs = function->inputs(); for (size_t i = 0; i < funcInputs.size(); ++i) { @@ -93,7 +94,16 @@ void EltwiseLayerCPUTest::SetUp() { std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; std::tie(postOpMgrPtr, fusedOps) = fusingParams; - selectedType = makeSelectedTypeStr(getPrimitiveType(), netType); + if (inType == ElementType::i64 || inType == ElementType::u64) { + auto i64Flag = configuration.find(PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64Flag == configuration.end() || i64Flag->second == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr(getPrimitiveType(), netType); + } #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) if (eltwiseType == POWER) { selectedType = std::regex_replace(selectedType, std::regex("acl"), "ref"); @@ -103,7 +113,7 @@ void EltwiseLayerCPUTest::SetUp() { shapes.resize(2); switch (opType) { case CommonTestUtils::OpType::SCALAR: { - std::vector identityShapes(shapes[0].second.size(), {1}); + std::vector identityShapes(shapes[0].second.size(), {1}); shapes[1] = {{}, identityShapes}; break; } @@ -120,13 +130,13 @@ void EltwiseLayerCPUTest::SetUp() { configuration.insert(additional_config.begin(), additional_config.end()); auto parameters = ngraph::builder::makeDynamicParams(netType, {inputDynamicShapes.front()}); - std::shared_ptr secondaryInput; + std::shared_ptr secondaryInput; if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { 
secondaryInput = ngraph::builder::makeDynamicParams(netType, {inputDynamicShapes.back()}).front(); - parameters.push_back(std::dynamic_pointer_cast(secondaryInput)); + parameters.push_back(std::dynamic_pointer_cast(secondaryInput)); } else { auto pShape = inputDynamicShapes.back(); - ngraph::Shape shape; + ov::Shape shape; if (pShape.is_static()) { shape = pShape.get_shape(); } else { @@ -138,16 +148,24 @@ void EltwiseLayerCPUTest::SetUp() { } } } + if (netType == ElementType::i32) { - auto data_tensor = generate_eltwise_input(ElementType::i32, shape); + auto data_tensor = generate_eltwise_input(netType, shape); auto data_ptr = reinterpret_cast(data_tensor.data()); - std::vector data(data_ptr, data_ptr + ngraph::shape_size(shape)); + std::vector data(data_ptr, data_ptr + ov::shape_size(shape)); secondaryInput = ngraph::builder::makeConstant(netType, shape, data); - } else { + } else if (netType == ElementType::i64) { + auto data_tensor = generate_eltwise_input(netType, shape); + auto data_ptr = reinterpret_cast(data_tensor.data()); + std::vector data(data_ptr, data_ptr + ov::shape_size(shape)); + secondaryInput = ngraph::builder::makeConstant(netType, shape, data); + } else if (netType == ElementType::f32 || netType == ElementType::bf16) { auto data_tensor = generate_eltwise_input(ElementType::f32, shape); auto data_ptr = reinterpret_cast(data_tensor.data()); - std::vector data(data_ptr, data_ptr + ngraph::shape_size(shape)); + std::vector data(data_ptr, data_ptr + ov::shape_size(shape)); secondaryInput = ngraph::builder::makeConstant(netType, shape, data); + } else { + IE_THROW() << "Unsupported data type."; } } auto eltwise = ngraph::builder::makeEltwise(parameters[0], secondaryInput, eltwiseType); @@ -270,8 +288,8 @@ const std::vector& secondaryInputTypes() { return secondaryInputTypes; } -const std::vector>& inShapes_4D_1D() { - static const std::vector> inShapes_4D_1D = { +const std::vector>& inShapes_4D_1D() { + static const std::vector> inShapes_4D_1D = { 
{{2, 17, 5, 4}, {4}}, {{1, 3, 3, 3}, {3}}, }; @@ -293,8 +311,8 @@ const std::vector& cpuParams_4D_1D_Parameter_mode() { return cpuParams_4D_1D_Parameter_mode; } -const std::vector>& inShapes_5D_1D() { - static const std::vector> inShapes_5D_1D = { +const std::vector>& inShapes_5D_1D() { + static const std::vector> inShapes_5D_1D = { {{2, 17, 5, 4, 10}, {10}}, {{1, 3, 3, 3, 3}, {3}}, }; diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.cpp index fbd3f9bb3e485f..a48e3b5f39eaf2 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.cpp @@ -4,12 +4,11 @@ #include "reduce.hpp" -#include "gtest/gtest.h" -#include "test_utils/cpu_test_utils.hpp" +#include +#include using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ngraph::helpers; using namespace ov::test; namespace CPULayerTestsDefinitions { @@ -26,8 +25,9 @@ std::string ReduceCPULayerTest::getTestCaseName(testing::TestParamInfo inputShapes; + ov::AnyMap config; - std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inputShapes) = basicParams; + std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inputShapes, config) = basicParams; std::ostringstream result; result << "IS=("; @@ -49,7 +49,15 @@ std::string ReduceCPULayerTest::getTestCaseName(testing::TestParamInfo 1) + FAIL() << "In reduce op if op type is scalar, 'axis' input's must contain 1 element"; + break; + case CommonTestUtils::OpType::VECTOR: + shapeAxes.push_back(axes.size()); + break; + default: + FAIL() << "Reduce op doesn't support operation type: " << opType; } - auto reductionAxesNode = std::dynamic_pointer_cast( - std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape(shapeAxes), axes)); + auto reductionAxesNode = std::dynamic_pointer_cast( + 
std::make_shared(ElementType::i64, ov::Shape(shapeAxes), axes)); const auto reduce = ngraph::builder::makeReduce(paramOuts[0], reductionAxesNode, keepDims, reductionType); - selectedType = getPrimitiveType() + "_" + - (inPrc == ElementType::boolean ? "I8" : InferenceEngine::details::convertPrecision(inPrc).name()); + if (inPrc == ElementType::i64 || inPrc == ElementType::u64) { + auto i64It = configuration.find(PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64It == configuration.end() || i64It->second == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i64); + } + } else if (inPrc == ElementType::boolean) { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i8); + } else { + selectedType = makeSelectedTypeStr(getPrimitiveType(), inPrc); + } // hybrid layouts if (inFmts.size() != 0 && outFmts.size() == 0) { size_t outShapeSize = inputDynamicShapes[0].size() - axes.size(); switch (outShapeSize) { - case 0: - case 1: - outFmts.push_back(x); - break; - case 2: - outFmts.push_back(nc); - break; - case 3: - outFmts.push_back(tnc); - break; - case 4: - outFmts.push_back(nchw); - break; - default: - FAIL() << "Invaid outShapeSize: " << outShapeSize; + case 0: + case 1: + outFmts.push_back(x); + break; + case 2: + outFmts.push_back(nc); + break; + case 3: + outFmts.push_back(tnc); + break; + case 4: + outFmts.push_back(nchw); + break; + default: + FAIL() << "Invaid outShapeSize: " << outShapeSize; } } @@ -135,23 +153,25 @@ void ReduceCPULayerTest::generate_inputs(const std::vector& targe const auto& funcInput = funcInputs[i]; ov::Tensor tensor; if (reductionType == ngraph::helpers::ReductionType::Prod) { - tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), - targetInputStaticShapes[i], - 10, - 5); + tensor = utils::create_and_fill_tensor(funcInput.get_element_type(), 
targetInputStaticShapes[i], 10, 1); if (netPrecision == ElementType::f32) { - auto* rawBlobDataPtr = static_cast(tensor.data()); + auto *rawBlobDataPtr = static_cast(tensor.data()); for (size_t i = 0; i < tensor.get_size(); ++i) { rawBlobDataPtr[i] /= 10.f; } } else if (netPrecision == ElementType::bf16) { - auto* rawBlobDataPtr = static_cast(tensor.data()); + auto *rawBlobDataPtr = static_cast(tensor.data()); for (size_t i = 0; i < tensor.get_size(); ++i) { rawBlobDataPtr[i] /= 10.f; } + } else if (netPrecision == ElementType::i64) { + // auto *rawBlobDataPtr = static_cast(tensor.data()); + // for (size_t i = 0; i < tensor.get_size(); ++i) { + // rawBlobDataPtr[i] /= 10; + // } } } else { - tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]); + tensor = utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]); } inputs.insert({funcInput.get_node_shared_ptr(), tensor}); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.hpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.hpp index 5325093c222313..dbcfaf9666c5cf 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.hpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/reduce.hpp @@ -5,43 +5,38 @@ #pragma once #include "shared_test_classes/base/ov_subgraph.hpp" -#include "ngraph_functions/builders.hpp" -#include "test_utils/cpu_test_utils.hpp" -#include #include "test_utils/fusing_test_utils.hpp" -using namespace CPUTestUtils; -using namespace ov::test; - namespace CPULayerTestsDefinitions { typedef std::tuple< - std::vector, // Axis to reduce order - CommonTestUtils::OpType, // Scalar or vector type axis - bool, // Keep dims - ngraph::helpers::ReductionType, // Reduce operation type - ElementType, // Net precision - ElementType, // Input precision - ElementType, // Output precision - std::vector // Input shapes + 
std::vector, // Axis to reduce order + CommonTestUtils::OpType, // Scalar or vector type axis + bool, // Keep dims + ngraph::helpers::ReductionType, // Reduce operation type + ov::test::ElementType, // Net precision + ov::test::ElementType, // Input precision + ov::test::ElementType, // Output precision + std::vector, // Input shapes + ov::AnyMap // Additional network configuration > basicReduceParams; typedef std::tuple< basicReduceParams, - CPUSpecificParams, - fusingSpecificParams> ReduceLayerCPUTestParamSet; + CPUTestUtils::CPUSpecificParams, + CPUTestUtils::fusingSpecificParams> ReduceLayerCPUTestParamSet; class ReduceCPULayerTest : public testing::WithParamInterface, - virtual public SubgraphBaseTest, public CpuTestWithFusing { + virtual public ov::test::SubgraphBaseTest, public CPUTestUtils::CpuTestWithFusing { public: static std::string getTestCaseName(testing::TestParamInfo obj); protected: void SetUp() override; - void generate_inputs(const std::vector& targetInputStaticShapes) override; + void generate_inputs(const std::vector& targetInputStaticShapes) override; private: ngraph::helpers::ReductionType reductionType; - ElementType netPrecision; + ov::test::ElementType netPrecision; }; namespace Reduce { @@ -51,8 +46,8 @@ const std::vector>& axes(); const std::vector>& axesND(); const std::vector& opTypes(); const std::vector& reductionTypes(); -const std::vector& inpOutPrc(); +const std::vector& inpOutPrc(); const std::vector& reductionTypesInt32(); } // namespace Reduce -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp index f90af8c07d008e..fec1a010176af6 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.cpp @@ -14,13 +14,12 @@ using namespace ov::test; namespace CPULayerTestsDefinitions { std::string TransposeLayerCPUTest::getTestCaseName(testing::TestParamInfo obj) { - Precision netPrecision; + ElementType netPrecision; InputShape inputShapes; std::vector inputOrder; - std::string targetDevice; CPUSpecificParams cpuParams; - std::map additionalConfig; - std::tie(inputShapes, inputOrder, netPrecision, targetDevice, additionalConfig, cpuParams) = obj.param; + ov::AnyMap config; + std::tie(inputShapes, inputOrder, netPrecision, config, cpuParams) = obj.param; std::ostringstream result; result << "IS=" << CommonTestUtils::partialShape2str({inputShapes.first}) << "_"; @@ -30,34 +29,40 @@ std::string TransposeLayerCPUTest::getTestCaseName(testing::TestParamInfo(ov::element::i64, ov::Shape({inputOrder.size()}), inputOrder); + const auto inputOrderOp = std::make_shared(ov::element::i64, + ov::Shape({inputOrder.size()}), + inputOrder); const auto transpose = std::make_shared(params[0], inputOrderOp); transpose->get_rt_info() = getCPUInfo(); const ov::ResultVector results{std::make_shared(transpose)}; @@ -72,8 +77,8 @@ TEST_P(TransposeLayerCPUTest, CompareWithRefs) { } namespace Transpose { -const std::vector& netPrecisionsPerChannels() { - static const std::vector netPrecisionsPerChannels = {Precision::I8, Precision::FP32}; +const std::vector& netPrecisionsPerChannels() { + static const std::vector netPrecisionsPerChannels = {ElementType::i8, ElementType::f32}; return netPrecisionsPerChannels; } diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.hpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.hpp index 6d07d4a0d22943..00089021a3b1b5 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.hpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/transpose.hpp @@ -8,24 +8,18 
@@ #include "ngraph_functions/builders.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "test_utils/cpu_test_utils.hpp" -#include "gtest/gtest.h" - - -using namespace InferenceEngine; -using namespace CPUTestUtils; -using namespace ov::test; namespace CPULayerTestsDefinitions { typedef std::tuple< - InputShape, // Input shapes - std::vector, // Input order - InferenceEngine::Precision, // Net precision - std::string, // Target device name - std::map, // Additional network configuration - CPUSpecificParams> TransposeLayerCPUTestParamSet; + ov::test::InputShape, // Input shapes + std::vector, // Input order + ov::test::ElementType, // Net precision + ov::AnyMap, // Additional plugin configuration + CPUTestUtils::CPUSpecificParams +> TransposeLayerCPUTestParamSet; class TransposeLayerCPUTest : public testing::WithParamInterface, - public ov::test::SubgraphBaseTest, public CPUTestsBase { + public ov::test::SubgraphBaseTest, public CPUTestUtils::CPUTestsBase { public: static std::string getTestCaseName(testing::TestParamInfo obj); protected: @@ -33,10 +27,10 @@ class TransposeLayerCPUTest : public testing::WithParamInterface& netPrecisionsPerChannels(); - const std::vector& dynamicInputShapes4DC16(); - const std::vector& dynamicInputShapes4DC32(); - const std::vector& dynamicInputShapes4D(); + const std::vector& netPrecisionsPerChannels(); + const std::vector& dynamicInputShapes4DC16(); + const std::vector& dynamicInputShapes4DC32(); + const std::vector& dynamicInputShapes4D(); const std::vector>& inputOrder4D(); } // namespace Transpose -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/concat.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/concat.cpp index 1cb242daaf55f1..bbacc1c38c7c7a 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/concat.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/single_layer_tests/concat.cpp @@ -5,6 +5,7 @@ #include "shared_test_classes/base/ov_subgraph.hpp" #include "ngraph_functions/builders.hpp" #include "test_utils/cpu_test_utils.hpp" +#include using namespace ov::test; using namespace CPUTestUtils; @@ -12,9 +13,10 @@ using namespace CPUTestUtils; namespace CPULayerTestsDefinitions { typedef std::tuple< - size_t, // Concat axis + int64_t, // Concat axis std::vector, // Input shapes ElementType, // Network precision + ov::AnyMap, // Additional config CPUSpecificParams > concatCPUTestParams; @@ -22,11 +24,12 @@ class ConcatLayerCPUTest : public testing::WithParamInterface obj) { - int axis; + int64_t axis; std::vector inputShapes; ElementType netPrecision; + ov::AnyMap additionalConfig; CPUSpecificParams cpuParams; - std::tie(axis, inputShapes, netPrecision, cpuParams) = obj.param; + std::tie(axis, inputShapes, netPrecision, additionalConfig, cpuParams) = obj.param; std::ostringstream result; result << "IS="; @@ -46,6 +49,15 @@ class ConcatLayerCPUTest : public testing::WithParamInterfacesecond == InferenceEngine::PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr(selectedType, netPrecision); + } init_input_shapes(inputShape); auto params = ngraph::builder::makeDynamicParams(netPrecision, inputDynamicShapes); auto paramOuts = ngraph::helpers::convert2OutputVector( - ngraph::helpers::castOps2Nodes(params)); - auto concat = std::make_shared(paramOuts, axis); + ngraph::helpers::castOps2Nodes(params)); + auto concat = std::make_shared(paramOuts, axis); function = makeNgraphFunction(netPrecision, params, concat, "ConcatCPU"); } @@ -118,23 +139,41 @@ const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref const std::vector netPrecisions = { ElementType::i8, ElementType::i32, + 
ElementType::i64, ElementType::f32, ElementType::bf16 }; +const ov::AnyMap emptyConfig = {}; +const std::vector i64Config = { + {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES}}, + {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::NO}} +}; + INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block8_static, ConcatLayerCPUTest, ::testing::Combine( ::testing::Values(1, -2, 3), ::testing::Values(static_shapes_to_test_representation({{2, 16, 3, 5}, {2, 16, 3, 5}})), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)), ConcatLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block8_I64_static, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1, -2, 3), + ::testing::Values(static_shapes_to_test_representation({{2, 16, 3, 5}, {2, 16, 3, 5}})), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), + ::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)), + ConcatLayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block16_static, ConcatLayerCPUTest, ::testing::Combine( ::testing::Values(1, 2, -1), ::testing::Values(static_shapes_to_test_representation({{3, 32, 3, 5}, {3, 32, 3, 5}})), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked16_4D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -162,9 +201,19 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block_dynamic_axis_1, ConcatLayerCPU ::testing::Values(1, -3), ::testing::ValuesIn(inputShapes4D_Block_axis1), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked8_4D_ref, blocked16_4D_ref)), ConcatLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block_dynamic_axis_1_I64, ConcatLayerCPUTest, + ::testing::Combine( + 
::testing::Values(1, -3), + ::testing::ValuesIn(inputShapes4D_Block_axis1), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), + ::testing::Values(blocked8_4D_ref, blocked16_4D_ref)), + ConcatLayerCPUTest::getTestCaseName); + const std::vector> inputShapes4D_axis1 = { { {{-1, -1, -1, -1}, {{2, 32, 0, 7}, {2, 32, 5, 7}, {2, 32, 5, 7}, {1, 18, 10, 2}, {2, 32, 5, 7}, {3, 8, 1, 8}, {2, 0, 5, 7}}}, @@ -193,6 +242,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_dynamic_axis_1, ConcatLayerCPUTest, ::testing::Values(1), ::testing::ValuesIn(inputShapes4D_axis1), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_4D_ref, planarChannels_4D)), ConcatLayerCPUTest::getTestCaseName); @@ -219,6 +269,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block_dynamic_axis_2, ConcatLayerCPU ::testing::Values(2), ::testing::ValuesIn(inputShapes4D_Block_axis2), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked8_4D_ref, blocked16_4D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -240,6 +291,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_dynamic_axis_2, ConcatLayerCPUTest, ::testing::Values(2, -2), ::testing::ValuesIn(inputShapes4D_axis2), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_4D_ref, planarChannels_4D)), ConcatLayerCPUTest::getTestCaseName); @@ -261,6 +313,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block_dynamic_axis_3, ConcatLayerCPU ::testing::Values(3), ::testing::ValuesIn(inputShapes4D_Block_axis3), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked8_4D_ref, blocked16_4D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -287,6 +340,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_dynamic_axis_3, ConcatLayerCPUTest, ::testing::Values(3, -1), ::testing::ValuesIn(inputShapes4D_axis3), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), 
::testing::Values(planar_4D_ref, planarChannels_4D)), ConcatLayerCPUTest::getTestCaseName); @@ -295,14 +349,25 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block8_static, ConcatLayerCPUTest, ::testing::Values(2, 3, -2), ::testing::Values(static_shapes_to_test_representation({{2, 16, 3, 5, 7}, {2, 16, 3, 5, 7}})), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)), ConcatLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block8_I64_static, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(2, 3, -2), + ::testing::Values(static_shapes_to_test_representation({{2, 16, 3, 5, 7}, {2, 16, 3, 5, 7}})), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), + ::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)), + ConcatLayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block16_static, ConcatLayerCPUTest, ::testing::Combine( ::testing::Values(2, 3, 4), ::testing::Values(static_shapes_to_test_representation({{2, 32, 3, 5, 7}, {2, 32, 3, 5, 7}})), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked16_5D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -324,9 +389,19 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block_dynamic_axis_1, ConcatLayerCPU ::testing::Values(1), ::testing::ValuesIn(inputShapes5D_Block_axis1), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked8_5D_ref, blocked16_5D_ref)), ConcatLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block_dynamic_axis_1_I64, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::ValuesIn(inputShapes5D_Block_axis1), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), + ::testing::Values(blocked8_5D_ref, blocked16_5D_ref)), + ConcatLayerCPUTest::getTestCaseName); + const std::vector> 
inputShapes5D_axis1 = { { {{-1, -1, -1, -1, -1}, {{2, 5, 5, 7, 6}, {1, 3, 10, 2, 8}, {3, 4, 1, 8, 10}}}, @@ -345,6 +420,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_dynamic_axis_1, ConcatLayerCPUTest, ::testing::Values(1), ::testing::ValuesIn(inputShapes5D_axis1), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_5D_ref, planarChannels_5D)), ConcatLayerCPUTest::getTestCaseName); @@ -366,6 +442,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block_dynamic_axis_2, ConcatLayerCPU ::testing::Values(-3), ::testing::ValuesIn(inputShapes5D_Block_axis2), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked8_5D_ref, blocked16_5D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -387,6 +464,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_dynamic_axis_2, ConcatLayerCPUTest, ::testing::Values(2), ::testing::ValuesIn(inputShapes5D_axis2), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_5D_ref, planarChannels_5D)), ConcatLayerCPUTest::getTestCaseName); @@ -408,6 +486,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block_dynamic_axis_3, ConcatLayerCPU ::testing::Values(3), ::testing::ValuesIn(inputShapes5D_Block_axis3), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked8_5D_ref, blocked16_5D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -429,6 +508,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_dynamic_axis_3, ConcatLayerCPUTest, ::testing::Values(3), ::testing::ValuesIn(inputShapes5D_axis3), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_5D_ref, planarChannels_5D)), ConcatLayerCPUTest::getTestCaseName); @@ -450,6 +530,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block_dynamic_axis_4, ConcatLayerCPU ::testing::Values(4), ::testing::ValuesIn(inputShapes5D_Block_axis4), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), 
::testing::Values(blocked8_5D_ref, blocked16_5D_ref)), ConcatLayerCPUTest::getTestCaseName); @@ -471,6 +552,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_dynamic_axis_4, ConcatLayerCPUTest, ::testing::Values(4), ::testing::ValuesIn(inputShapes5D_axis4), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_5D_ref, planarChannels_5D)), ConcatLayerCPUTest::getTestCaseName); @@ -515,16 +597,27 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat_byBatch_static, ConcatLayerCPUTest, ::testing::Values(0), ::testing::ValuesIn(inputShapes_byBatch_static), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Concat_byBatch_I64_static, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0), + ::testing::ValuesIn(inputShapes_byBatch_static), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), - ConcatLayerCPUTest::getTestCaseName); + ConcatLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Concat_byBatch_dynamic, ConcatLayerCPUTest, ::testing::Combine( ::testing::Values(0), ::testing::ValuesIn(inputShapes_byBatch_dynamic), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), - ConcatLayerCPUTest::getTestCaseName); + ConcatLayerCPUTest::getTestCaseName); const std::vector> inputShapes3D_axis1 = { static_shapes_to_test_representation({{2, 4, 5}, {2, 4, 5}}), @@ -545,6 +638,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat_3D_axis1, ConcatLayerCPUTest, ::testing::Values(1), ::testing::ValuesIn(inputShapes3D_axis1), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), ConcatLayerCPUTest::getTestCaseName); @@ -567,6 +661,7 @@ 
INSTANTIATE_TEST_SUITE_P(smoke_Concat_3D_axis2, ConcatLayerCPUTest, ::testing::Values(2), ::testing::ValuesIn(inputShapes3D_axis2), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), ConcatLayerCPUTest::getTestCaseName); @@ -589,6 +684,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat_2D_axis1, ConcatLayerCPUTest, ::testing::Values(1), ::testing::ValuesIn(inputShapes2D_axis1), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), ConcatLayerCPUTest::getTestCaseName); @@ -617,14 +713,25 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat_1D_static, ConcatLayerCPUTest, ::testing::Values(0), ::testing::ValuesIn(inputShapes1D_static), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), ConcatLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Concat_1D_I64_static, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0), + ::testing::ValuesIn(inputShapes1D_static), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), + ConcatLayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_Concat_1D_dynamic, ConcatLayerCPUTest, ::testing::Combine( ::testing::Values(0), ::testing::ValuesIn(inputShapes1D_dynamic), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), ConcatLayerCPUTest::getTestCaseName); @@ -643,6 +750,7 @@ INSTANTIATE_TEST_SUITE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest, {{1, 16, -1, -1}, {{1, 16, 5, 7}, {1, 16, 16, 2}, {1, 16, 2, 8}}}, }), ::testing::Values(ElementType::f32), + ::testing::Values(emptyConfig), ::testing::Values(planar_4D, blocked8_4D)), ConcatLayerCPUTest::getTestCaseName); @@ -660,6 +768,7 @@ 
INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block16inPlace_0, ConcatLayerCPUTest {{1, 32, -1, -1}, {{1, 32, 5, 7}, {1, 32, 16, 2}, {1, 32, 2, 8}}}, }), ::testing::Values(ElementType::f32), + ::testing::Values(emptyConfig), ::testing::Values(blocked16_4D)), ConcatLayerCPUTest::getTestCaseName); @@ -677,6 +786,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat4D_CPU_Block16inPlace_1, ConcatLayerCPUTest {{1, 32, -1, -1}, {{1, 32, 5, 7}, {1, 32, 16, 2}, {1, 32, 2, 8}}}, }), ::testing::Values(ElementType::f32), + ::testing::Values(emptyConfig), ::testing::Values(blocked16_4D)), ConcatLayerCPUTest::getTestCaseName); @@ -694,6 +804,7 @@ INSTANTIATE_TEST_SUITE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest, {{1, 32, -1, -1, -1}, {{1, 32, 5, 7, 3}, {1, 32, 16, 2, 3}, {1, 32, 2, 8, 3}}}, }), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(planar_5D, blocked8_5D)), ConcatLayerCPUTest::getTestCaseName); @@ -711,6 +822,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest, {{1, 32, -1, -1, -1}, {{1, 32, 5, 7, 3}, {1, 32, 16, 2, 3}, {1, 32, 2, 8, 3}}}, }), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(blocked16_5D)), ConcatLayerCPUTest::getTestCaseName); @@ -721,6 +833,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Concat_inPlace, ConcatLayerCPUTest, static_shapes_to_test_representation({{1, 1, 1, 10}, {1, 1, 1, 10}}), static_shapes_to_test_representation({{1, 1, 5}, {1, 1, 5}})}), ::testing::ValuesIn(netPrecisions), + ::testing::Values(emptyConfig), ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), ConcatLayerCPUTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/cum_sum.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/cum_sum.cpp index 84ce22c180db14..237818624e5d2b 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/cum_sum.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/single_layer_tests/cum_sum.cpp @@ -5,42 +5,49 @@ #include "test_utils/cpu_test_utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "ngraph_functions/builders.hpp" +#include -using namespace ngraph; using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ov; -using namespace test; +using namespace ov::test; namespace CPULayerTestsDefinitions { using cumSumParams = std::tuple< - ngraph::element::Type, // data precision - InputShape, // input shape - std::int64_t, // axis - bool, // exclusive - bool>; // reverse + ElementType, // data precision + InputShape, // input shape + std::int64_t, // axis + bool, // exclusive + bool, // reverse + ov::AnyMap>; // Additional network configuration class CumSumLayerCPUTest : public testing::WithParamInterface, public SubgraphBaseTest, public CPUTestsBase { public: static std::string getTestCaseName(testing::TestParamInfo obj) { - ngraph::element::Type inputPrecision; + ElementType inputPrecision; InputShape shapes; std::int64_t axis; bool exclusive; bool reverse; - std::tie(inputPrecision, shapes, axis, exclusive, reverse) = obj.param; + ov::AnyMap config; + std::tie(inputPrecision, shapes, axis, exclusive, reverse, config) = obj.param; - std::ostringstream results; - results << "IS=" << CommonTestUtils::partialShape2str({shapes.first}) << "_"; - results << "TS="; + std::ostringstream result; + result << "IS=" << CommonTestUtils::partialShape2str({shapes.first}) << "_"; + result << "TS="; for (const auto& item : shapes.second) { - results << CommonTestUtils::vec2str(item) << "_"; + result << CommonTestUtils::vec2str(item) << "_"; } - results << "Prc=" << inputPrecision << "_"; - results << "Axis=" << axis << "_" << (exclusive ? "exclusive" : "") << "_" << (reverse ? "reverse" : ""); - return results.str(); + result << "Prc=" << inputPrecision << "_"; + result << "Axis=" << axis << "_" << (exclusive ? "exclusive" : "") << "_" << (reverse ? 
"reverse" : ""); + + for (auto const& configItem : config) { + result << "_configItem=" << configItem.first << "_"; + configItem.second.print(result); + } + + return result.str(); } protected: @@ -50,7 +57,7 @@ class CumSumLayerCPUTest : public testing::WithParamInterface, std::int64_t axis; bool exclusive; bool reverse; - std::tie(inType, shapes, axis, exclusive, reverse) = this->GetParam(); + std::tie(inType, shapes, axis, exclusive, reverse, configuration) = this->GetParam(); if (inType == ElementType::bf16) rel_threshold = 0.05f; @@ -58,11 +65,11 @@ class CumSumLayerCPUTest : public testing::WithParamInterface, init_input_shapes({shapes}); auto params = ngraph::builder::makeDynamicParams(inType, inputDynamicShapes); - auto axisNode = ngraph::opset1::Constant::create(ngraph::element::i32, ngraph::Shape{}, std::vector{axis})->output(0); + auto axisNode = ov::op::v0::Constant::create(ElementType::i32, ov::Shape{}, std::vector{axis})->output(0); auto cumSum = ngraph::builder::makeCumSum(params[0], axisNode, exclusive, reverse); - function = std::make_shared(ngraph::NodeVector{ cumSum }, params, "CumSumLayerCPUTest"); - functionRefs = ngraph::clone_function(*function); + function = std::make_shared(ov::NodeVector{ cumSum }, params, "CumSumLayerCPUTest"); + functionRefs = ov::clone_model(*function); } }; @@ -71,10 +78,10 @@ TEST_P(CumSumLayerCPUTest, CompareWithRefs) { CheckPluginRelatedResults(compiledModel, "CumSum"); } -const ngraph::element::TypeVector inputPrecision = { - ngraph::element::i8, - ngraph::element::bf16, - ngraph::element::f32 +const std::vector inputPrecision = { + ElementType::i8, + ElementType::bf16, + ElementType::f32 }; const std::vector axes = { 0, 1, 2, 3, 4, 5, 6 }; @@ -112,12 +119,16 @@ const std::vector inShapes = { {{2, 4, 6, 5, 4, 3, 1}, {3, 5, 6, 6, 5, 3, 1}, {5, 7, 4, 6, 3, 7, 2}}} }; +const ov::AnyMap emptyConfig = {}; +const ov::AnyMap i64Config = {{PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES}}; + const 
auto testCasesAxis_0 = ::testing::Combine( ::testing::ValuesIn(inputPrecision), ::testing::ValuesIn(inShapes), ::testing::Values(axes[0]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); const auto testCasesAxis_1 = ::testing::Combine( @@ -125,7 +136,8 @@ const auto testCasesAxis_1 = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 1, inShapes.end())), ::testing::Values(axes[1]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); const auto testCasesAxis_2 = ::testing::Combine( @@ -133,7 +145,8 @@ const auto testCasesAxis_2 = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 2, inShapes.end())), ::testing::Values(axes[2]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); const auto testCasesAxis_3 = ::testing::Combine( @@ -141,7 +154,8 @@ const auto testCasesAxis_3 = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 3, inShapes.end())), ::testing::Values(axes[3]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); const auto testCasesAxis_4 = ::testing::Combine( @@ -149,7 +163,8 @@ const auto testCasesAxis_4 = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 4, inShapes.end())), ::testing::Values(axes[4]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); const auto testCasesAxis_5 = ::testing::Combine( @@ -157,7 +172,8 @@ const auto testCasesAxis_5 = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 5, inShapes.end())), ::testing::Values(axes[5]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + 
::testing::Values(emptyConfig) ); const auto testCasesAxis_6 = ::testing::Combine( @@ -165,7 +181,8 @@ const auto testCasesAxis_6 = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 6, inShapes.end())), ::testing::Values(axes[6]), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); const auto testCasesAxis_negative = ::testing::Combine( @@ -173,7 +190,8 @@ const auto testCasesAxis_negative = ::testing::Combine( ::testing::ValuesIn(std::vector(inShapes.begin() + 6, inShapes.end())), ::testing::ValuesIn(negativeAxes), ::testing::ValuesIn(exclusive), - ::testing::ValuesIn(reverse) + ::testing::ValuesIn(reverse), + ::testing::Values(emptyConfig) ); INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_axis_0, CumSumLayerCPUTest, testCasesAxis_0, CumSumLayerCPUTest::getTestCaseName); @@ -185,4 +203,41 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_axis_5, CumSumLayerCPUTest, INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_axis_6, CumSumLayerCPUTest, testCasesAxis_6, CumSumLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_negative_axes, CumSumLayerCPUTest, testCasesAxis_negative, CumSumLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_axis_0_I64, CumSumLayerCPUTest, + ::testing::Combine( + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inShapes), + ::testing::Values(axes[0]), + ::testing::ValuesIn(exclusive), + ::testing::ValuesIn(reverse), + ::testing::Values(i64Config)), + CumSumLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_axis_3_I64, CumSumLayerCPUTest, + ::testing::Combine( + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(std::vector(inShapes.begin() + 3, inShapes.end())), + ::testing::Values(axes[3]), + ::testing::ValuesIn(exclusive), + ::testing::ValuesIn(reverse), + ::testing::Values(i64Config)), + 
CumSumLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_axis_6_I64, CumSumLayerCPUTest, + ::testing::Combine( + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(std::vector(inShapes.begin() + 6, inShapes.end())), + ::testing::Values(axes[6]), + ::testing::ValuesIn(exclusive), + ::testing::ValuesIn(reverse), + ::testing::Values(i64Config)), + CumSumLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefsNumpy_negative_axes_I64, CumSumLayerCPUTest, + ::testing::Combine( + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(std::vector(inShapes.begin() + 6, inShapes.end())), + ::testing::ValuesIn(negativeAxes), + ::testing::ValuesIn(exclusive), + ::testing::ValuesIn(reverse), + ::testing::Values(i64Config)), + CumSumLayerCPUTest::getTestCaseName); + } // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather.cpp index 3e4326321e7732..8d6100673bdcd5 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather.cpp @@ -6,7 +6,9 @@ #include "ngraph_functions/builders.hpp" #include "test_utils/cpu_test_utils.hpp" #include +#include +using namespace InferenceEngine; using namespace CPUTestUtils; using namespace ov::test; @@ -15,10 +17,11 @@ namespace CPULayerTestsDefinitions { typedef std::tuple< std::vector, // Input shapes std::tuple, // Axis and Batch dim - ElementType, // Network precision + ElementType, // Data precision + ElementType, // Indices precision bool, // Is const Axis CPUSpecificParams, // CPU specific params - std::map // Additional config + ov::AnyMap // Additional config > GatherLayerTestCPUParams; class GatherLayerTestCPU : public testing::WithParamInterface, @@ -27,12 +30,12 @@ class GatherLayerTestCPU : public testing::WithParamInterface obj) { std::vector 
inputShapes; std::tuple axisAndBatchDims; - ElementType netPrecision; + ElementType dataPrc, idxPrc; bool isAxisConstant; CPUSpecificParams cpuParams; - std::map additionalConfig; + ov::AnyMap additionalConfig; - std::tie(inputShapes, axisAndBatchDims, netPrecision, isAxisConstant, cpuParams, additionalConfig) = obj.param; + std::tie(inputShapes, axisAndBatchDims, dataPrc, idxPrc, isAxisConstant, cpuParams, additionalConfig) = obj.param; std::ostringstream result; result << "IS=("; @@ -49,15 +52,16 @@ class GatherLayerTestCPU : public testing::WithParamInterfaceset_friendly_name("indices"); if (!isAxisConstant) { - params.push_back(std::make_shared(intInputsPrecision, inputDynamicShapes[2])); + params.push_back(std::make_shared(axisPrc, inputDynamicShapes[2])); params[2]->set_friendly_name("axis"); } auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); std::shared_ptr gatherNode; if (isAxisConstant) { gatherNode = std::make_shared(paramOuts[0], paramOuts[1], - ov::op::v0::Constant::create(intInputsPrecision, ov::Shape({1}), { axis }), batchDims); + ov::op::v0::Constant::create(axisPrc, ov::Shape({1}), { axis }), batchDims); } else { gatherNode = std::make_shared(paramOuts[0], paramOuts[1], paramOuts[2], batchDims); } - function = makeNgraphFunction(netPrecision, params, gatherNode, "GatherCPU"); + function = makeNgraphFunction(dataPrc, params, gatherNode, "GatherCPU"); } void generate_inputs(const std::vector& targetInputStaticShapes) override { @@ -225,24 +229,31 @@ TEST_P(GatherInPlaceLayerTestCPU, CompareWithRefs) { } namespace { -const std::vector netPrecisions = { +const std::vector dataPrcs = { ElementType::f32, ElementType::bf16, ElementType::i8 }; -std::vector> additionalConfig - = {{{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16, InferenceEngine::PluginConfigParams::NO}}, - {{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16, InferenceEngine::PluginConfigParams::YES}}}; +const std::vector bf16Config = 
{ + {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}}, + {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}} +}; + +const ov::AnyMap i64Config = { + {PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES} +}; + +const ov::AnyMap emptyConfig = {}; std::vector isAxisConst{true, false}; const CPUSpecificParams cpuParamsRef{{}, {}, {"ref_any"}, "ref_any"}; std::vector getCPUInfo() { std::vector resCPUParams; - if (InferenceEngine::with_cpu_x86_avx512f()) { + if (with_cpu_x86_avx512f()) { resCPUParams.push_back(CPUSpecificParams{{}, {}, {"jit_avx512"}, "jit_avx512"}); - } else if (InferenceEngine::with_cpu_x86_avx2()) { + } else if (with_cpu_x86_avx2()) { resCPUParams.push_back(CPUSpecificParams{{}, {}, {"jit_avx2"}, "jit_avx2"}); } else { resCPUParams.push_back(CPUSpecificParams{{}, {}, {"ref"}, "ref"}); @@ -284,10 +295,22 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_1D, GatherLayerTestCPU, ::testing::Combine( ::testing::ValuesIn(staticInputShapes1D), ::testing::Values(std::tuple{0, 0}), - ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(dataPrcs), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), + GatherLayerTestCPU::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_static_1D_i64, GatherLayerTestCPU, + ::testing::Combine( + ::testing::ValuesIn(staticInputShapes1D), + ::testing::Values(std::tuple{0, 0}), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i32, ElementType::i64), + ::testing::Values(true), + ::testing::Values(CPUSpecificParams{{}, {}, {"ref_any"}, "ref_any"}), + ::testing::Values(i64Config)), GatherLayerTestCPU::getTestCaseName); const std::vector> dynamicInputShapes1D = { @@ -301,16 +324,28 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_1D, GatherLayerTestCPU, ::testing::Combine( ::testing::ValuesIn(dynamicInputShapes1D), ::testing::Values(std::tuple{0, 0}), 
- ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(dataPrcs), + ::testing::Values(ElementType::i32), ::testing::Values(true, false), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), + GatherLayerTestCPU::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_dynamic_1D_i64, GatherLayerTestCPU, + ::testing::Combine( + ::testing::ValuesIn(dynamicInputShapes1D), + ::testing::Values(std::tuple{0, 0}), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn({ElementType::i32, ElementType::i64}), + ::testing::Values(true, false), + ::testing::Values(CPUSpecificParams{{}, {}, {"ref_any"}, "ref_any"}), + ::testing::Values(i64Config)), GatherLayerTestCPU::getTestCaseName); ///// 4D JIT ///// std::vector> get4DShapesJitStat(int maxBatchDims) { std::vector> result = {}; - if (InferenceEngine::with_cpu_x86_avx2()) { + if (with_cpu_x86_avx2()) { if (maxBatchDims == 2) { result = { { { {}, { {18, 2, 2, 1} } }, // Static shapes @@ -369,7 +404,7 @@ std::vector> get4DShapesJitStat(int maxBatchDi throw std::invalid_argument("Invalid test case. Not valid batch dims."); } } // AVX2 - if (InferenceEngine::with_cpu_x86_avx512f()) { + if (with_cpu_x86_avx512f()) { std::vector> tmp; if (maxBatchDims == 2) { tmp = { @@ -436,7 +471,7 @@ std::vector> get4DShapesJitStat(int maxBatchDi std::vector> get4DAxisBatchJitStat(ov::element::Type type, int maxBatchDims) { std::vector> result = {}; - if (InferenceEngine::with_cpu_x86_avx512f()) { + if (with_cpu_x86_avx512f()) { if (type.size() == 4 || type.size() == 2 || type.size() == 1) { if (maxBatchDims == 2) return std::vector>{{3, 0}, {3, 1}, {3, 2}, {2, 0}, {2, 1}, {2, 2}}; @@ -445,7 +480,7 @@ std::vector> get4DAxisBatchJitStat(ov::element::Type type, else throw std::invalid_argument("Invalid test case. 
Not valid batch dims."); } - } else if (InferenceEngine::with_cpu_x86_avx2()) { + } else if (with_cpu_x86_avx2()) { if (type.size() == 4) { if (maxBatchDims == 2) return std::vector>{{3, 0}, {3, 1}, {3, 2}, {2, 0}, {2, 1}, {2, 2}}; @@ -470,9 +505,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit32, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitStat(2)), ::testing::ValuesIn(get4DAxisBatchJitStat(ElementType::f32, 2)), ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::ValuesIn(additionalConfig)), + ::testing::ValuesIn(bf16Config)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit16, GatherLayerTestCPU, @@ -480,9 +516,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit16, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitStat(2)), ::testing::ValuesIn(get4DAxisBatchJitStat(ElementType::bf16, 2)), ::testing::Values(ElementType::bf16), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit8, GatherLayerTestCPU, @@ -490,9 +527,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit8, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitStat(2)), ::testing::ValuesIn(get4DAxisBatchJitStat(ElementType::i8, 2)), ::testing::Values(ElementType::i8), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); // batchDims == indicesRank @@ -501,9 +539,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit32_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitStat(3)), ::testing::ValuesIn(get4DAxisBatchJitStat(ElementType::f32, 3)), ::testing::Values(ElementType::f32), + 
::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::ValuesIn(additionalConfig)), + ::testing::ValuesIn(bf16Config)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit16_Bmax, GatherLayerTestCPU, @@ -511,9 +550,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit16_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitStat(3)), ::testing::ValuesIn(get4DAxisBatchJitStat(ElementType::bf16, 3)), ::testing::Values(ElementType::bf16), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit8_Bmax, GatherLayerTestCPU, @@ -521,15 +561,16 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_jit8_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitStat(3)), ::testing::ValuesIn(get4DAxisBatchJitStat(ElementType::i8, 3)), ::testing::Values(ElementType::i8), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); std::vector> get4DShapesJitDyn(int maxBatchDims) { std::vector> result = {}; - if (InferenceEngine::with_cpu_x86_avx2()) { + if (with_cpu_x86_avx2()) { if (maxBatchDims == 2) { result = { { { { ov::Dimension(5, 15), -1, -1, -1 }, // Dynamic shape 0 @@ -572,7 +613,7 @@ std::vector> get4DShapesJitDyn(int maxBatchDim throw std::invalid_argument("Invalid test case. 
Not valid batch dims."); } } - if (InferenceEngine::with_cpu_x86_avx512f()) { + if (with_cpu_x86_avx512f()) { std::vector> tmp; if (maxBatchDims == 2) { tmp = { @@ -623,7 +664,7 @@ std::vector> get4DShapesJitDyn(int maxBatchDim std::vector> get4DAxisBatchJitDyn(ov::element::Type type, int maxBatchDims) { std::vector> result = {}; - if (InferenceEngine::with_cpu_x86_avx512f()) { + if (with_cpu_x86_avx512f()) { if (type.size() == 4 || type.size() == 2 || type.size() == 1) { if (maxBatchDims == 2) return std::vector>{{3, 0}, {3, 1}, {3, 2}}; @@ -632,7 +673,7 @@ std::vector> get4DAxisBatchJitDyn(ov::element::Type type, i else throw std::invalid_argument("Invalid test case. Not valid batch dims."); } - } else if (InferenceEngine::with_cpu_x86_avx2()) { + } else if (with_cpu_x86_avx2()) { if (type.size() == 4 || type.size() == 2 || type.size() == 1) { if (maxBatchDims == 2) return std::vector>{{3, 0}, {3, 1}, {3, 2}}; @@ -650,9 +691,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit32, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitDyn(2)), ::testing::ValuesIn(get4DAxisBatchJitDyn(ElementType::f32, 2)), ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::i32), ::testing::ValuesIn(isAxisConst), ::testing::ValuesIn(getCPUInfo()), - ::testing::ValuesIn(additionalConfig)), + ::testing::ValuesIn(bf16Config)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit16, GatherLayerTestCPU, @@ -660,9 +702,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit16, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitDyn(2)), ::testing::ValuesIn(get4DAxisBatchJitDyn(ElementType::bf16, 2)), ::testing::Values(ElementType::bf16), + ::testing::Values(ElementType::i32), ::testing::ValuesIn(isAxisConst), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit8, GatherLayerTestCPU, @@ -670,9 +713,10 
@@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit8, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitDyn(2)), ::testing::ValuesIn(get4DAxisBatchJitDyn(ElementType::i8, 2)), ::testing::Values(ElementType::i8), + ::testing::Values(ElementType::i32), ::testing::ValuesIn(isAxisConst), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); // batchDims == indicesRank @@ -681,9 +725,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit32_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitDyn(3)), ::testing::ValuesIn(get4DAxisBatchJitDyn(ElementType::f32, 3)), ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::i32), ::testing::ValuesIn(isAxisConst), ::testing::ValuesIn(getCPUInfo()), - ::testing::ValuesIn(additionalConfig)), + ::testing::ValuesIn(bf16Config)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit16_Bmax, GatherLayerTestCPU, @@ -691,9 +736,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit16_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitDyn(3)), ::testing::ValuesIn(get4DAxisBatchJitDyn(ElementType::bf16, 3)), ::testing::Values(ElementType::bf16), + ::testing::Values(ElementType::i32), ::testing::ValuesIn(isAxisConst), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit8_Bmax, GatherLayerTestCPU, @@ -701,16 +747,17 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamic_4D_jit8_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesJitDyn(3)), ::testing::ValuesIn(get4DAxisBatchJitDyn(ElementType::i8, 3)), ::testing::Values(ElementType::i8), + ::testing::Values(ElementType::i32), ::testing::ValuesIn(isAxisConst), ::testing::ValuesIn(getCPUInfo()), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), 
GatherLayerTestCPU::getTestCaseName); ///// 4D REFERENCE ///// std::vector> get4DShapesRefStat(bool maxBatchDims) { std::vector> result = {}; - if (InferenceEngine::with_cpu_x86_avx2()) { + if (with_cpu_x86_avx2()) { if (!maxBatchDims) { result = { { { {}, { {10, 2, 9, 9} } }, // Static shapes @@ -767,7 +814,7 @@ std::vector> get4DShapesRefStat(bool maxBatchD }; } } - if (InferenceEngine::with_cpu_x86_avx512f()) { + if (with_cpu_x86_avx512f()) { std::vector> tmp; if (!maxBatchDims) { tmp = { @@ -832,8 +879,8 @@ std::vector> get4DShapesRefStat(bool maxBatchD std::vector> get4DAxisBatchRefStat(ov::element::Type type, bool maxBatchDims) { std::vector> result = {}; - if (InferenceEngine::with_cpu_x86_avx512f()) { - if (type.size() == 4) { + if (with_cpu_x86_avx512f()) { + if (type.size() == 4 || type.size() == 8) { if (!maxBatchDims) return std::vector>{{1, 0}, {1, 1}, {0, 0}}; else @@ -844,8 +891,8 @@ std::vector> get4DAxisBatchRefStat(ov::element::Type type, else return std::vector>{{2, 2}}; } - } else if (InferenceEngine::with_cpu_x86_avx2()) { - if (type.size() == 4) { + } else if (with_cpu_x86_avx2()) { + if (type.size() == 4 || type.size() == 8) { if (!maxBatchDims) return std::vector>{{1, 0}, {1, 1}, {0, 0}}; else @@ -860,14 +907,26 @@ std::vector> get4DAxisBatchRefStat(ov::element::Type type, return {}; } +INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref64, GatherLayerTestCPU, + ::testing::Combine( + ::testing::ValuesIn(get4DShapesRefStat(false)), + ::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::i64, false)), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn({ElementType::i32, ElementType::i64}), + ::testing::Values(true), + ::testing::Values(cpuParamsRef), + ::testing::Values(i64Config)), + GatherLayerTestCPU::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref32, GatherLayerTestCPU, ::testing::Combine( ::testing::ValuesIn(get4DShapesRefStat(false)), ::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::f32, false)), 
::testing::Values(ElementType::f32), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::Values(cpuParamsRef), - ::testing::ValuesIn(additionalConfig)), + ::testing::ValuesIn(bf16Config)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref16, GatherLayerTestCPU, @@ -875,9 +934,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref16, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesRefStat(false)), ::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::bf16, false)), ::testing::Values(ElementType::bf16), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::Values(cpuParamsRef), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref8, GatherLayerTestCPU, @@ -885,9 +945,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref8, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesRefStat(false)), ::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::i8, false)), ::testing::Values(ElementType::i8), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::Values(cpuParamsRef), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); // batchDims == indicesRank @@ -896,9 +957,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref32_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesRefStat(true)), ::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::f32, true)), ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::Values(cpuParamsRef), - ::testing::ValuesIn(additionalConfig)), + ::testing::ValuesIn(bf16Config)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref16_Bmax, GatherLayerTestCPU, @@ -906,9 +968,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref16_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesRefStat(true)), 
::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::bf16, true)), ::testing::Values(ElementType::bf16), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::Values(cpuParamsRef), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref8_Bmax, GatherLayerTestCPU, @@ -916,9 +979,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_static_4D_ref8_Bmax, GatherLayerTestCPU, ::testing::ValuesIn(get4DShapesRefStat(true)), ::testing::ValuesIn(get4DAxisBatchRefStat(ElementType::i8, true)), ::testing::Values(ElementType::i8), + ::testing::Values(ElementType::i32), ::testing::Values(true), ::testing::Values(cpuParamsRef), - ::testing::Values(additionalConfig[0])), + ::testing::Values(emptyConfig)), GatherLayerTestCPU::getTestCaseName); // InPlace diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather_nd.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather_nd.cpp index 41c01f9228bbe1..a1ac6b1f3ab245 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather_nd.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/gather_nd.cpp @@ -5,19 +5,20 @@ #include #include "shared_test_classes/base/ov_subgraph.hpp" #include "ngraph_functions/builders.hpp" +#include using namespace InferenceEngine; -using namespace ov; -using namespace test; +using namespace ov::test; namespace CPULayerTestsDefinitions { using GatherNDLayerCPUTestParamSet = std::tuple< InputShape, // Input shapes - std::pair>, // Indexes shape and values + std::pair>, // Indexes shape and values ElementType, // Input element type ElementType, // Indices element type - int // Batch dims + int, // Batch dims + ov::AnyMap // Additional config >; class GatherNDLayerCPUTest : public testing::WithParamInterface, @@ -25,10 +26,11 @@ class GatherNDLayerCPUTest : public testing::WithParamInterface obj) { InputShape shapes; - std::pair> indexes; + 
std::pair> indexes; ElementType dataElementType, idxElementType; int batchDims; - std::tie(shapes, indexes, dataElementType, idxElementType, batchDims) = obj.param; + ov::AnyMap config; + std::tie(shapes, indexes, dataElementType, idxElementType, batchDims, config) = obj.param; std::ostringstream results; results << "IS=" << CommonTestUtils::partialShape2str({shapes.first}) << "_"; @@ -39,7 +41,15 @@ class GatherNDLayerCPUTest : public testing::WithParamInterface(params[0], indexes_node, batchDims); + ngraph::ResultVector results{std::make_shared(gather_nd)}; + function = std::make_shared(results, params, "gatherND"); } }; @@ -73,16 +83,16 @@ class GatherND8LayerCPUTest : public testing::WithParamInterface> indexes; + std::pair> indexes; ElementType dataElementType, idxElementType; int batchDims; - std::tie(shapes, indexes, dataElementType, idxElementType, batchDims) = this->GetParam(); + std::tie(shapes, indexes, dataElementType, idxElementType, batchDims, configuration) = this->GetParam(); targetDevice = CommonTestUtils::DEVICE_CPU; init_input_shapes({shapes}); auto params = ngraph::builder::makeDynamicParams(dataElementType, inputDynamicShapes); - auto indexes_node = ngraph::opset3::Constant::create(idxElementType, indexes.first, indexes.second); + auto indexes_node = ov::op::v0::Constant::create(idxElementType, indexes.first, indexes.second); auto gather_nd = std::make_shared(params[0], indexes_node, batchDims); ngraph::ResultVector results{std::make_shared(gather_nd)}; function = std::make_shared(results, params, "gatherND"); @@ -120,10 +130,13 @@ const std::vector inputShapesDynamicBD_0 = { {{4, 5, 5, 5, 5}, {4, 5, 5, 8, 5}, {10, 8, 5, 5, 5}}}, // target }; -const std::vector>> indexesShapesBD_0 = { - std::pair>{{2, 2}, {3, 3, 2, 1}}, - std::pair>{{1, 2, 3}, {0, 1, 1, 1, 0, 2}}, - std::pair>{{2, 1, 1, 2}, {0, 2, 1, 1}}, +ov::AnyMap empty_config = {}; +ov::AnyMap config_i64 = {{PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES}}; + +const 
std::vector>> indexesShapesBD_0 = { + std::pair>{{2, 2}, {3, 3, 2, 1}}, + std::pair>{{1, 2, 3}, {0, 1, 1, 1, 0, 2}}, + std::pair>{{2, 1, 1, 2}, {0, 2, 1, 1}}, }; const auto subset_BD0 = ::testing::Combine( @@ -131,10 +144,20 @@ const auto subset_BD0 = ::testing::Combine( ::testing::ValuesIn(indexesShapesBD_0), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(indexesPrecisions), - ::testing::Values(0)); + ::testing::Values(0), + ::testing::Values(empty_config)); INSTANTIATE_TEST_SUITE_P(smoke_GatherND5DynamicBD_0, GatherNDLayerCPUTest, subset_BD0, GatherNDLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_GatherND8DynamicBD_0, GatherND8LayerCPUTest, subset_BD0, GatherNDLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_GatherND8DynamicBD_0_I64, GatherND8LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapesDynamicBD_0), + ::testing::ValuesIn(indexesShapesBD_0), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i64), + ::testing::Values(0), + ::testing::Values(config_i64)), + GatherNDLayerCPUTest::getTestCaseName); const std::vector inputShapesDynamicBD_1 = { {{3, -1, -1}, // dynamic @@ -144,10 +167,10 @@ const std::vector inputShapesDynamicBD_1 = { {{3, 5, 5, 5, 5}, {3, 8, 10, 10, 10}, {3, 8, 6, 8, 7}}}, // target }; -const std::vector>> indexesShapesBD_1 = { - std::pair>{{3, 2}, {0, 1, 2, 1, 0, 0}}, - std::pair>{{3, 2, 2}, {0, 1, 1, 1, 0, 2, 0, 1, 1, 1, 0, 2}}, - std::pair>{{3, 1, 1, 2}, {0, 2, 1, 1, 0, 2}}, +const std::vector>> indexesShapesBD_1 = { + std::pair>{{3, 2}, {0, 1, 2, 1, 0, 0}}, + std::pair>{{3, 2, 2}, {0, 1, 1, 1, 0, 2, 0, 1, 1, 1, 0, 2}}, + std::pair>{{3, 1, 1, 2}, {0, 2, 1, 1, 0, 2}}, }; const auto subset_BD1 = ::testing::Combine( @@ -155,10 +178,20 @@ const auto subset_BD1 = ::testing::Combine( ::testing::ValuesIn(indexesShapesBD_1), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(indexesPrecisions), - ::testing::Values(0)); + ::testing::Values(0), + 
::testing::Values(empty_config)); INSTANTIATE_TEST_SUITE_P(smoke_GatherND5DynamicBD_1, GatherNDLayerCPUTest, subset_BD1, GatherNDLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_GatherND8DynamicBD_1, GatherND8LayerCPUTest, subset_BD1, GatherNDLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_GatherND8DynamicBD_1_I64, GatherND8LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapesDynamicBD_1), + ::testing::ValuesIn(indexesShapesBD_1), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i64), + ::testing::Values(0), + ::testing::Values(config_i64)), + GatherNDLayerCPUTest::getTestCaseName); const std::vector inputShapesDynamicBD_2 = { {{2, 2, -1, -1, -1}, // dynamic @@ -168,10 +201,10 @@ const std::vector inputShapesDynamicBD_2 = { {{2, 2, 5, 5, 5}, {2, 2, 10, 10, 5}, {2, 2, 7, 8, 7}}}, // target }; -const std::vector>> indexesShapesBD_2 = { - std::pair>{{2, 2, 3}, {0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0}}, - std::pair>{{2, 2, 2, 3}, {0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, - 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0}}, +const std::vector>> indexesShapesBD_2 = { + std::pair>{{2, 2, 3}, {0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0}}, + std::pair>{{2, 2, 2, 3}, {0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, + 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0}}, }; const auto subset_BD2 = ::testing::Combine( @@ -179,11 +212,20 @@ const auto subset_BD2 = ::testing::Combine( ::testing::ValuesIn(indexesShapesBD_2), ::testing::ValuesIn(inputPrecisions), ::testing::ValuesIn(indexesPrecisions), - ::testing::Values(0)); + ::testing::Values(0), + ::testing::Values(empty_config)); INSTANTIATE_TEST_SUITE_P(smoke_GatherND5DynamicBD_2, GatherNDLayerCPUTest, subset_BD2, GatherNDLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_GatherND8DynamicBD_2, GatherND8LayerCPUTest, subset_BD2, GatherNDLayerCPUTest::getTestCaseName); - +INSTANTIATE_TEST_SUITE_P(smoke_GatherND8DynamicBD_2_I64, GatherND8LayerCPUTest, + ::testing::Combine( + 
::testing::ValuesIn(inputShapesDynamicBD_2), + ::testing::ValuesIn(indexesShapesBD_2), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i64), + ::testing::Values(0), + ::testing::Values(config_i64)), + GatherNDLayerCPUTest::getTestCaseName); } // namespace } // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/activation.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/activation.cpp index 20a5089288811f..a37d0732d6791d 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/activation.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/activation.cpp @@ -14,14 +14,17 @@ using namespace ov::test; namespace CPULayerTestsDefinitions { namespace Activation { +ov::AnyMap empty_config = {}; + /* ============= Activation (1D) ============= */ const auto basicCases3D = ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(basic3D())), ::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes())), ::testing::ValuesIn(netPrc()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams3D())) ); @@ -33,8 +36,9 @@ const auto basicCases4D = ::testing::Combine( ::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes())), ::testing::ValuesIn(netPrc()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams4D())) ); @@ -46,8 +50,9 @@ const auto basicCases5D = ::testing::Combine( 
::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes())), ::testing::ValuesIn(netPrc()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams5D())) ); @@ -58,8 +63,9 @@ const auto dynamicMathBasicCases = ::testing::Combine( ::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypesDynamicMath())), ::testing::ValuesIn(netPrecisions()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), ::testing::ValuesIn(cpuParamsDynamicMath()) ); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/conversion.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/conversion.cpp index cf2af8b8a70bba..33a7ef56f39ce6 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/conversion.cpp @@ -6,7 +6,7 @@ #include "shared_test_classes/single_layer/conversion.hpp" #include "test_utils/cpu_test_utils.hpp" -using namespace InferenceEngine; +// using namespace InferenceEngine; using namespace CPUTestUtils; using namespace ngraph::helpers; using namespace ov::test; @@ -29,11 +29,14 @@ std::vector memForm4D_dynamic = { CPUSpecificParams({nhwc}, {nhwc}, {}, expectedPrimitiveType()), }; +ov::AnyMap empty_config = {}; + INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_Dynamic, ConvertCPULayerTest, ::testing::Combine( ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(empty_config), 
::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -47,8 +50,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest, ConvertCPULayerTest, ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(empty_config), ::testing::ValuesIn(memForm4D_static_common)), ConvertCPULayerTest::getTestCaseName); } // namespace Conversion -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/reduce.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/reduce.cpp index 386417bcf0c258..c132c08c6629cf 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/reduce.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/reduce.cpp @@ -55,30 +55,34 @@ std::vector cpuParams_4D = { #endif }; +ov::AnyMap enpty_config = {}; + /* ================================ 1.1 No fusion - Arithmetic ================================ */ const auto params_OneAxis = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - testing::ValuesIn(opTypes()), - testing::ValuesIn(keepDims()), - testing::ValuesIn(reductionTypes()), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes)), + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::ValuesIn(keepDims()), + testing::ValuesIn(reductionTypes()), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(enpty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); const auto params_OneAxis_dynamic = testing::Combine( testing::Combine( - testing::Values(1), // ACL supports 
reduce against static dims only - testing::ValuesIn(opTypes()), - testing::ValuesIn(keepDims()), - testing::ValuesIn(reductionTypes()), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dynamic_3dims)), + testing::Values(1), // ACL supports reduce against static dims only + testing::ValuesIn(opTypes()), + testing::ValuesIn(keepDims()), + testing::ValuesIn(reductionTypes()), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dynamic_3dims), + testing::Values(enpty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); @@ -91,7 +95,8 @@ const auto params_MultiAxis_4D = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes)), + testing::ValuesIn(inputShapes), + testing::Values(enpty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), testing::Values(emptyFusingSpec)); @@ -104,20 +109,22 @@ const auto params_MultiAxis_4D_dynamic = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dynamic_2dims)), + testing::ValuesIn(inputShapes_dynamic_2dims), + testing::Values(enpty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), testing::Values(emptyFusingSpec)); const auto params_Int32 = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::ValuesIn(keepDims()), - testing::ValuesIn(reductionTypesInt32()), - testing::Values(ElementType::i32), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_Int32)), + testing::ValuesIn(axes()), + 
testing::Values(CommonTestUtils::OpType::VECTOR), + testing::ValuesIn(keepDims()), + testing::ValuesIn(reductionTypesInt32()), + testing::Values(ElementType::i32), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_Int32), + testing::Values(enpty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/transpose.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/transpose.cpp index 0684aaeaec622d..39a7fcaff419cd 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/transpose.cpp @@ -6,21 +6,21 @@ #include "shared_test_classes/single_layer/transpose.hpp" #include "test_utils/cpu_test_utils.hpp" -using namespace InferenceEngine; +// using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ngraph::helpers; +// using namespace ngraph::helpers; using namespace ov::test; namespace CPULayerTestsDefinitions { namespace Transpose { -std::map additional_config; +ov::AnyMap additional_config; const auto cpuParams_nhwc = CPUSpecificParams {{nhwc}, {}, {}, {}}; const auto cpuParams_nchw = CPUSpecificParams {{nchw}, {}, {}, {}}; -const std::vector netPrecisions = { - Precision::I8, - Precision::FP32 +const std::vector netPrecisions = { + ElementType::i8, + ElementType::f32 }; const std::vector> inputOrderPerChannels4D = { @@ -39,7 +39,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC16_Transpose, TransposeLayerCPUTes ::testing::ValuesIn(dynamicInputShapes4DC16()), ::testing::ValuesIn(inputOrder4D()), ::testing::ValuesIn(netPrecisions), - ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config), ::testing::ValuesIn(CPUParams4D)), TransposeLayerCPUTest::getTestCaseName); @@ -49,7 +48,6 @@ 
INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC32_Transpose, TransposeLayerCPUTes ::testing::ValuesIn(dynamicInputShapes4DC32()), ::testing::ValuesIn(inputOrder4D()), ::testing::ValuesIn(netPrecisions), - ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config), ::testing::ValuesIn(CPUParams4D)), TransposeLayerCPUTest::getTestCaseName); @@ -59,7 +57,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamicShapes4D_Transpose, TransposeLayerCPUTest, ::testing::ValuesIn(dynamicInputShapes4D()), ::testing::ValuesIn(inputOrder4D()), ::testing::ValuesIn(netPrecisions), - ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config), ::testing::Values(CPUSpecificParams{})), TransposeLayerCPUTest::getTestCaseName); @@ -69,7 +66,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC16_PermutePerChannels, TransposeLa ::testing::ValuesIn(dynamicInputShapes4DC16()), ::testing::ValuesIn(inputOrderPerChannels4D), ::testing::ValuesIn(netPrecisionsPerChannels()), - ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config), ::testing::Values(cpuParams_nhwc)), TransposeLayerCPUTest::getTestCaseName); @@ -79,7 +75,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC32_PermutePerChannels, TransposeLa ::testing::ValuesIn(dynamicInputShapes4DC32()), ::testing::ValuesIn(inputOrderPerChannels4D), ::testing::ValuesIn(netPrecisionsPerChannels()), - ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config), ::testing::Values(cpuParams_nhwc)), TransposeLayerCPUTest::getTestCaseName); @@ -89,10 +84,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamicShapes4D_PermutePerChannels, TransposeLaye ::testing::ValuesIn(dynamicInputShapes4D()), ::testing::ValuesIn(inputOrderPerChannels4D), ::testing::ValuesIn(netPrecisionsPerChannels()), - ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config), ::testing::Values(CPUSpecificParams{})), TransposeLayerCPUTest::getTestCaseName); } // namespace 
Transpose -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/activation.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/activation.cpp index 794437e770ef37..a7c0ac290cdac6 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/activation.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/activation.cpp @@ -5,8 +5,8 @@ #include "single_layer_tests/classes/activation.hpp" #include "shared_test_classes/single_layer/activation.hpp" #include "test_utils/cpu_test_utils.hpp" +#include -using namespace InferenceEngine; using namespace CPUTestUtils; using namespace ngraph::helpers; using namespace ov::test; @@ -15,10 +15,10 @@ namespace CPULayerTestsDefinitions { namespace Activation { namespace { -const std::vector& netPrc() { - static const std::vector netPrc { - Precision::FP32, - Precision::BF16, +const std::vector& netPrc() { + static const std::vector netPrc { + ElementType::f32, + ElementType::bf16, }; return netPrc; @@ -41,13 +41,25 @@ const std::vector& cpuParams3Dblocked() { return cpuParams3Dblocked; } +const std::map>>& activationTypes_i64() { + static const std::map>> activationTypes { + {Sqrt, {{}}} + }; + + return activationTypes; +} + +ov::AnyMap empty_config = {}; +ov::AnyMap config_i64 = {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES}}; + const auto blockedCases3D = ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(basic3D())), ::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypesBlocked())), ::testing::ValuesIn(netPrc()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + 
::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams3Dblocked())) ); @@ -67,13 +79,26 @@ const auto basicCases4D = ::testing::Combine( ::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes())), ::testing::ValuesIn(netPrc()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams4Dblocked())) ); INSTANTIATE_TEST_SUITE_P(smoke_Activation4D_Eltwise_CPU_Blocked, ActivationLayerCPUTest, basicCases4D, ActivationLayerCPUTest::getTestCaseName); +// INSTANTIATE_TEST_SUITE_P(smoke_Activation4D_Eltwise_CPU_Blocked_I64, ActivationLayerCPUTest, +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(basic4D())), +// ::testing::Values(activationShapes()), +// ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes_i64())), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(config_i64), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams4Dblocked())) +// ), ActivationLayerCPUTest::getTestCaseName); + /* ============= Activation (3D) ============= */ const std::vector& cpuParams5Dblocked() { static const std::vector cpuParams5Dblocked { @@ -88,13 +113,73 @@ const auto basicCases5D = ::testing::Combine( ::testing::Values(activationShapes()), ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes())), ::testing::ValuesIn(netPrc()), - ::testing::Values(Precision::FP32), - ::testing::Values(Precision::FP32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams5Dblocked())) ); INSTANTIATE_TEST_SUITE_P(smoke_Activation5D_Eltwise_CPU_Blocked, 
ActivationLayerCPUTest, basicCases5D, ActivationLayerCPUTest::getTestCaseName); +// INSTANTIATE_TEST_SUITE_P(smoke_Activation5D_Eltwise_CPU_Blocked_I64, ActivationLayerCPUTest, +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(basic5D())), +// ::testing::Values(activationShapes()), +// ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes_i64())), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(config_i64), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams5Dblocked())) +// ), ActivationLayerCPUTest::getTestCaseName); + +const auto basicCases3D = ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(basic3D())), + ::testing::Values(activationShapes()), + ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes())), + ::testing::ValuesIn(netPrc()), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(empty_config), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams3D())) +); + +// INSTANTIATE_TEST_SUITE_P(smoke_Activation3D_Eltwise_CPU_I64, ActivationLayerCPUTest, +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(basic3D())), +// ::testing::Values(activationShapes()), +// ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes_i64())), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(config_i64), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams3D()))), +// ActivationLayerCPUTest::getTestCaseName); + +// INSTANTIATE_TEST_SUITE_P(smoke_Activation4D_Eltwise_CPU_I64, ActivationLayerCPUTest, +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(basic4D())), +// ::testing::Values(activationShapes()), +// 
::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes_i64())), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(config_i64), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams4D()))), +// ActivationLayerCPUTest::getTestCaseName); + +// INSTANTIATE_TEST_SUITE_P(smoke_Activation5D_Eltwise_CPU, ActivationLayerCPUTest, +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(basic5D())), +// ::testing::Values(activationShapes()), +// ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes_i64())), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ElementType::i64), +// ::testing::Values(config_i64), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams5D()))), +// ActivationLayerCPUTest::getTestCaseName); + } // namespace } // namespace Activation } // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/conversion.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/conversion.cpp index 9206eca36d7352..a49c41b039d780 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/conversion.cpp @@ -5,10 +5,9 @@ #include "single_layer_tests/classes/conversion.hpp" #include "shared_test_classes/single_layer/conversion.hpp" #include "test_utils/cpu_test_utils.hpp" +#include -using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ngraph::helpers; using namespace ov::test; namespace CPULayerTestsDefinitions { @@ -20,11 +19,15 @@ std::vector memForm4D_dynamic = { CPUSpecificParams({nChw16c}, {nChw16c}, {}, "ref") }; +ov::AnyMap empty_config = {}; +ov::AnyMap config_i64 = {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, 
InferenceEngine::PluginConfigParams::YES}}; + INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_blocked_Dynamic, ConvertCPULayerTest, ::testing::Combine( ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(empty_config), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -36,9 +39,14 @@ std::vector memForm4D_static_blocked = { CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}) }; -const std::vector precisions_floating_point = { - Precision::FP32, - Precision::BF16 +const std::vector precisions_floating_point = { + ElementType::f32, + ElementType::bf16 +}; + +std::vector memForm4D_static_common = { + CPUSpecificParams({nchw}, {nchw}, {}, {}), + CPUSpecificParams({nhwc}, {nhwc}, {}, {}), }; INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_Blocked, ConvertCPULayerTest, @@ -46,6 +54,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_Blocked, ConvertCPULayerTest, ::testing::ValuesIn(inShapes_4D_blocked), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(empty_config), ::testing::ValuesIn(filterCPUSpecificParams(memForm4D_static_blocked))), ConvertCPULayerTest::getTestCaseName); @@ -53,7 +62,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Static, ConvertCPULayerT ::testing::Combine( ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions_floating_point), - ::testing::Values(Precision::BOOL), + ::testing::Values(ElementType::boolean), + ::testing::Values(empty_config), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertCPULayerTest::getTestCaseName); @@ -61,10 +71,47 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Dynamic, ConvertCPULayer ::testing::Combine( ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions_floating_point), - ::testing::Values(Precision::BOOL), + ::testing::Values(ElementType::boolean), + 
::testing::Values(empty_config), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, "ref"))), ConvertCPULayerTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_FromI64_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(precisions()), + ::testing::Values(config_i64), + ::testing::ValuesIn(memForm4D_dynamic)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_ToI64_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(precisions()), + ::testing::Values(ElementType::i64), + ::testing::Values(config_i64), + ::testing::ValuesIn(memForm4D_dynamic)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_FromI64, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_static()), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(precisions()), + ::testing::Values(config_i64), + ::testing::ValuesIn(memForm4D_static_common)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_ToI64, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(precisions()), + ::testing::Values(ElementType::i64), + ::testing::Values(config_i64), + ::testing::ValuesIn(memForm4D_static_common)), + ConvertCPULayerTest::getTestCaseName); + } // namespace } // namespace Conversion } // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/eltwise.cpp index ac8ff11d1e9b18..bd764b1be122fe 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/eltwise.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/eltwise.cpp @@ -8,6 +8,7 @@ #include "test_utils/fusing_test_utils.hpp" #include #include +#include using namespace InferenceEngine; using namespace CPUTestUtils; @@ -25,6 +26,8 @@ const std::vector& netType() { return netType; } +ov::AnyMap additional_config_i64 = {{PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES}}; + const std::vector& inShapes_4D_dyn_param_fusing() { static const std::vector inShapes_4D_dyn_param_fusing = { { @@ -172,6 +175,22 @@ const auto params_4D_Blocked_Blocked = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder_Blocked_Blocked, EltwiseLayerCPUTest, params_4D_Blocked_Blocked, EltwiseLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder_Blocked_I64, EltwiseLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D())), + ::testing::ValuesIn(eltwiseOpTypesBinInp()), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes()), + ::testing::Values(ElementType::i64), + ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::undefined), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config_i64)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Blocked())), + ::testing::Values(emptyFusingSpec)), + EltwiseLayerCPUTest::getTestCaseName); + const auto params_4D_fusing = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_fusing())), @@ -255,6 +274,22 @@ const auto params_5D_Blocked_Blocked = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_Blocked_Blocked, EltwiseLayerCPUTest, params_5D_Blocked_Blocked, EltwiseLayerCPUTest::getTestCaseName); +// INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_Blocked_Blocked_I64, 
EltwiseLayerCPUTest, +// ::testing::Combine( +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D())), +// ::testing::ValuesIn(eltwiseOpTypesBinInp()), +// ::testing::ValuesIn(secondaryInputTypes()), +// ::testing::ValuesIn(opTypes()), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ov::element::undefined), +// ::testing::Values(ov::element::undefined), +// ::testing::Values(CommonTestUtils::DEVICE_CPU), +// ::testing::Values(additional_config_i64)), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Blocked_Blocked())), +// ::testing::Values(emptyFusingSpec)), +// EltwiseLayerCPUTest::getTestCaseName); + const std::vector fusingParamsSet_I32{ fusingMultiplyAddPerChannel }; @@ -291,6 +326,22 @@ const auto params_4D_Blocked_Planar = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Blocked_Planar, EltwiseLayerCPUTest, params_4D_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); +// INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Blocked_Planar_I64, EltwiseLayerCPUTest, +// ::testing::Combine( +// ::testing::Combine( +// ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_Blocked_Planar())), +// ::testing::ValuesIn(eltwiseOpTypesBinInp()), +// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), +// ::testing::ValuesIn(opTypes()), +// ::testing::Values(ElementType::i64), +// ::testing::Values(ov::element::undefined), +// ::testing::Values(ov::element::undefined), +// ::testing::Values(CommonTestUtils::DEVICE_CPU), +// ::testing::Values(additional_config_i64)), +// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Planar())), +// ::testing::Values(emptyFusingSpec)), +// EltwiseLayerCPUTest::getTestCaseName); + const auto params_4D_Planar_Blocked = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_Planar_Blocked())), diff --git 
a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/reduce.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/reduce.cpp index 3e2384c5bfa420..190eac8a13f29f 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/reduce.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/reduce.cpp @@ -3,13 +3,10 @@ // #include "single_layer_tests/classes/reduce.hpp" -#include "shared_test_classes/single_layer/reduce_ops.hpp" -#include "test_utils/cpu_test_utils.hpp" -#include "test_utils/fusing_test_utils.hpp" +#include using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ngraph::helpers; using namespace ov::test; @@ -17,27 +14,35 @@ namespace CPULayerTestsDefinitions { namespace Reduce { namespace { -std::vector> inputShapes_dyn = { +std::vector> inputShapes = { + {{{}, {{2, 19, 2, 9}}}}, +}; + +std::vector> inputShapes_5D = { + {{{}, {{2, 19, 2, 2, 9}}}}, +}; + +std::vector> inputShapes_dyn = { {{{{1, 5}, 19, {1, 5}, {1, 10}}, {{2, 19, 2, 2}, {2, 19, 2, 9}}}}, }; -std::vector> inputShapes_5D_dyn = { +std::vector> inputShapes_5D_dyn = { {{{{1, 5}, 19, {1, 5}, {1, 5}, {1, 5}}, {{2, 19, 2, 2, 2}, {2, 19, 3, 2, 2}}}}, }; -std::vector> inputShapes_6D_dyn = { +std::vector> inputShapes_6D_dyn = { {{{{1, 5}, 19, {1, 5}, {1, 5}, {1, 5}, {1, 5}}, {{2, 19, 2, 2, 2, 2}, {2, 19, 2, 2, 3, 2}}}}, }; -std::vector> inputShapes_Int32_dyn = { +std::vector> inputShapes_Int32_dyn = { {{{{1, 5}, 19, {1, 5}, {1, 10}}, {{2, 19, 2, 2}, {2, 19, 2, 3}}}}, }; -std::vector> inputShapes_SmallChannel_dyn = { +std::vector> inputShapes_SmallChannel_dyn = { {{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}}, }; -std::vector> inputShapes_SingleBatch_dyn = { +std::vector> inputShapes_SingleBatch_dyn = { {{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}}, }; @@ -79,22 +84,44 @@ const std::vector> axesHW = { {2, 3} }; +std::vector cpuParams_4D_I64 = { + 
CPUSpecificParams({nChw16c}, {nChw8c}, {}, {}), + CPUSpecificParams({nchw}, {nchw}, {}, {}), + CPUSpecificParams({nhwc}, {nhwc}, {}, {}) +}; + std::vector cpuParams_5D = { CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}), CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}), CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}), }; +std::vector cpuParams_5D_I64 = { + CPUSpecificParams({nCdhw8c}, {nCdhw8c}, {}, {}), + CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}), + CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}) +}; + std::vector cpuParams_HybridLayout_4D = { CPUSpecificParams({nChw16c}, {}, {}, {}), CPUSpecificParams({nhwc}, {}, {}, {}) }; +std::vector cpuParams_HybridLayout_4D_I64 = { + CPUSpecificParams({nChw8c}, {}, {}, {}), + CPUSpecificParams({nhwc}, {}, {}, {}) +}; + std::vector cpuParams_HybridLayout_5D = { CPUSpecificParams({nCdhw16c}, {}, {}, {}), CPUSpecificParams({ndhwc}, {}, {}, {}) }; +std::vector cpuParams_HybridLayout_5D_I64 = { + CPUSpecificParams({nCdhw16c}, {}, {}, {}), + CPUSpecificParams({ndhwc}, {}, {}, {}) +}; + std::vector cpuParams_NHWC_4D = { CPUSpecificParams({nhwc}, {nhwc}, {}, {}) }; @@ -120,6 +147,14 @@ const std::vector fusingParamsSet { fusingScaleShift }; +const std::vector fusingParamsSet_I64 { + /* FQ */ + fusingFakeQuantizePerChannelRelu, + fusingFakeQuantizePerTensorRelu, + /* another patterns */ + fusingScaleShift +}; + // Exclude cases of fusingFakeQuantizePerChannelRelu, where FQ for non-1 channel fallbacks // to decomposed ngraph reference implementation, so such fusing tests are N/A const std::vector fusingParamsSet_KeepNoDims { @@ -132,16 +167,42 @@ const std::vector fusingParamsSet_KeepNoDims { fusingScaleShift }; +const std::vector fusingParamsSet_KeepNoDims_I64 { + /* FQ */ + fusingFakeQuantizePerTensorRelu, + /* another patterns */ + fusingScaleShift +}; + +ov::AnyMap empty_config = {}; +ov::AnyMap config_i64 = {{PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES}}; + +/* ================================ 1.1 No fusion - 
Arithmetic ================================ */ const auto params_OneAxis = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - testing::ValuesIn(opTypes()), - testing::ValuesIn(keepDims()), - testing::ValuesIn(reductionTypes()), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::ValuesIn(keepDims()), + testing::ValuesIn(reductionTypes()), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), + testing::Values(emptyCPUSpec), + testing::Values(emptyFusingSpec)); + +const auto params_OneAxis_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::ValuesIn(keepDims()), + testing::ValuesIn(reductionTypes()), + testing::Values(ElementType::i64, ElementType::u64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(config_i64)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); @@ -154,20 +215,36 @@ const auto params_MultiAxis_4D = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), testing::Values(emptyFusingSpec)); +const auto params_MultiAxis_4D_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND()), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(true), + testing::ValuesIn(reductionTypes()), + testing::Values(ElementType::i64), + testing::Values(ElementType::undefined), + 
testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(config_i64)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_I64, ElementType::i64)), + testing::Values(emptyFusingSpec)); + const auto params_Int32 = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::ValuesIn(keepDims()), - testing::ValuesIn(reductionTypesInt32()), - testing::Values(ElementType::i32), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_Int32_dyn)), + testing::ValuesIn(axes()), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::ValuesIn(keepDims()), + testing::ValuesIn(reductionTypesInt32()), + testing::ValuesIn({ElementType::i32, ElementType::i64}), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_Int32_dyn), + testing::Values(empty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); @@ -178,6 +255,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_OneAxis_CPU_I64, + ReduceCPULayerTest, + params_OneAxis_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_4D_CPU, ReduceCPULayerTest, @@ -185,6 +269,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_MultiAxis_4D_CPU_I64, + ReduceCPULayerTest, + params_MultiAxis_4D_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_Int32_CPU, ReduceCPULayerTest, @@ -201,36 +292,81 @@ const auto params_MultiAxis_5D = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_5D_dyn)), + testing::ValuesIn(inputShapes_5D_dyn), + testing::Values(empty_config)), 
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), testing::Values(emptyFusingSpec)); +const auto params_MultiAxis_5D_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axes5D), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(true), + testing::ValuesIn(reductionTypes()), + testing::Values(ElementType::i64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_5D), + testing::Values(config_i64)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_I64, ElementType::i64)), + testing::Values(emptyFusingSpec)); + const auto params_MultiAxis_4D_Hybrid = testing::Combine( testing::Combine( - testing::ValuesIn(axesND()), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::Values(false), - testing::ValuesIn(reductionTypes()), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axesND()), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn(reductionTypes()), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_4D)), testing::Values(emptyFusingSpec)); +const auto params_MultiAxis_4D_Hybrid_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND()), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn(reductionTypes()), + testing::Values(ElementType::i64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(config_i64)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_4D_I64, ElementType::i64)), + 
testing::Values(emptyFusingSpec)); + const auto params_MultiAxis_5D_Hybrid = testing::Combine( testing::Combine( - testing::ValuesIn(axes5D), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::Values(false), - testing::ValuesIn(reductionTypes()), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_5D_dyn)), + testing::ValuesIn(axes5D), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn(reductionTypes()), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_5D_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_5D)), testing::Values(emptyFusingSpec)); +const auto params_MultiAxis_5D_Hybrid_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axes5D), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn(reductionTypes()), + testing::Values(ElementType::i64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_5D), + testing::Values(config_i64)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_5D_I64, ElementType::i64)), + testing::Values(emptyFusingSpec)); + const auto params_MultiAxis_6D = testing::Combine( testing::Combine( testing::ValuesIn(axes6D), @@ -240,7 +376,8 @@ const auto params_MultiAxis_6D = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_6D_dyn)), + testing::ValuesIn(inputShapes_6D_dyn), + testing::Values(empty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); @@ -253,7 +390,8 @@ const auto params_NHWC_SmallChannel = testing::Combine( testing::ValuesIn(inpOutPrc()), 
testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_SmallChannel_dyn)), + testing::ValuesIn(inputShapes_SmallChannel_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)), testing::Values(emptyFusingSpec)); @@ -266,7 +404,8 @@ const auto params_SingleBatch = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_SingleBatch_dyn)), + testing::ValuesIn(inputShapes_SingleBatch_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)), testing::Values(emptyFusingSpec)); @@ -277,6 +416,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_MultiAxis_5D_CPU_I64, + ReduceCPULayerTest, + params_MultiAxis_5D_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_4D_Hybrid_CPU, ReduceCPULayerTest, @@ -284,6 +430,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_MultiAxis_4D_Hybrid_CPU_I64, + ReduceCPULayerTest, + params_MultiAxis_4D_Hybrid_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_5D_Hybrid_CPU, ReduceCPULayerTest, @@ -291,6 +444,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_MultiAxis_5D_Hybrid_CPU_I64, + ReduceCPULayerTest, + params_MultiAxis_5D_Hybrid_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_6D_CPU, ReduceCPULayerTest, @@ -315,14 +475,15 @@ INSTANTIATE_TEST_SUITE_P( /* ================================ 1.2 No fusion - Logical ================================ */ const auto params_OneAxis_Logical = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - 
testing::ValuesIn(opTypes()), - testing::ValuesIn(keepDims()), - testing::ValuesIn((reductionLogicalTypes)), - testing::Values(ElementType::boolean), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::ValuesIn(keepDims()), + testing::ValuesIn((reductionLogicalTypes)), + testing::Values(ElementType::boolean), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); @@ -335,7 +496,8 @@ const auto params_MultiAxis_4D_Logical = testing::Combine( testing::Values(ElementType::boolean), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), testing::Values(emptyFusingSpec)); @@ -348,33 +510,36 @@ const auto params_MultiAxis_5D_Logical = testing::Combine( testing::Values(ElementType::boolean), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_5D_dyn)), + testing::ValuesIn(inputShapes_5D_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), testing::Values(emptyFusingSpec)); const auto params_MultiAxis_4D_Hybrid_Logical = testing::Combine( testing::Combine( - testing::ValuesIn(axesND()), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::Values(false), - testing::ValuesIn((reductionLogicalTypes)), - testing::Values(ElementType::boolean), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axesND()), + testing::Values(CommonTestUtils::OpType::VECTOR), + 
testing::Values(false), + testing::ValuesIn((reductionLogicalTypes)), + testing::Values(ElementType::boolean), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_4D)), testing::Values(emptyFusingSpec)); const auto params_MultiAxis_5D_Hybrid_Logical = testing::Combine( testing::Combine( - testing::ValuesIn(axes5D), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::Values(false), - testing::ValuesIn((reductionLogicalTypes)), - testing::Values(ElementType::boolean), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_5D_dyn)), + testing::ValuesIn(axes5D), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn((reductionLogicalTypes)), + testing::Values(ElementType::boolean), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_5D_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_5D)), testing::Values(emptyFusingSpec)); @@ -387,7 +552,8 @@ const auto params_MultiAxis_6D_Logical = testing::Combine( testing::Values(ElementType::boolean), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_6D_dyn)), + testing::ValuesIn(inputShapes_6D_dyn), + testing::Values(empty_config)), testing::Values(emptyCPUSpec), testing::Values(emptyFusingSpec)); @@ -436,17 +602,32 @@ INSTANTIATE_TEST_SUITE_P( /* ================================ 2.1 Fusion - KeepDims ================================ */ const auto params_OneAxis_fusing = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - testing::ValuesIn(opTypes()), - testing::Values(true), - testing::ValuesIn(reductionTypesFusing), - 
testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::Values(true), + testing::ValuesIn(reductionTypesFusing), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::Values(emptyCPUSpec), testing::ValuesIn(fusingParamsSet)); +const auto params_OneAxis_fusing_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::Values(true), + testing::ValuesIn(reductionTypesFusing), + testing::Values(ElementType::i64, ElementType::u64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(config_i64)), + testing::Values(emptyCPUSpec), + testing::ValuesIn(fusingParamsSet_I64)); + const auto params_MultiAxis_4D_fusing = testing::Combine( testing::Combine( testing::ValuesIn(axesND()), @@ -456,10 +637,25 @@ const auto params_MultiAxis_4D_fusing = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), testing::ValuesIn(fusingParamsSet)); +const auto params_MultiAxis_4D_fusing_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND()), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(true), + testing::ValuesIn(reductionTypesFusing), + testing::Values(ElementType::i64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(config_i64)), + 
testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_I64, ElementType::i64)), + testing::ValuesIn(fusingParamsSet_I64)); + const auto params_MultiAxis_5D_fusing = testing::Combine( testing::Combine( testing::ValuesIn(axes5D), @@ -469,7 +665,8 @@ const auto params_MultiAxis_5D_fusing = testing::Combine( testing::ValuesIn(inpOutPrc()), testing::Values(ElementType::undefined), testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_5D_dyn)), + testing::ValuesIn(inputShapes_5D_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), testing::ValuesIn(fusingParamsSet)); @@ -480,6 +677,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_OneAxis_fusing_CPU_I64, + ReduceCPULayerTest, + params_OneAxis_fusing_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_4D_fusing_CPU, ReduceCPULayerTest, @@ -487,6 +691,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_MultiAxis_4D_fusing_CPU_I64, + ReduceCPULayerTest, + params_MultiAxis_4D_fusing_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_5D_fusing_CPU, ReduceCPULayerTest, @@ -497,40 +708,57 @@ INSTANTIATE_TEST_SUITE_P( /* ================================ 2.2 Fusion - KeepNoDims ================================ */ const auto params_OneAxis_fusing_KeepNoDims = testing::Combine( testing::Combine( - testing::ValuesIn(axes()), - testing::ValuesIn(opTypes()), - testing::Values(false), - testing::ValuesIn(reductionTypesFusing), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::Values(false), + testing::ValuesIn(reductionTypesFusing), + testing::ValuesIn(inpOutPrc()), + 
testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::Values(emptyCPUSpec), testing::ValuesIn(fusingParamsSet_KeepNoDims)); +const auto params_OneAxis_fusing_KeepNoDims_I64 = testing::Combine( + testing::Combine( + testing::ValuesIn(axes()), + testing::ValuesIn(opTypes()), + testing::Values(false), + testing::ValuesIn(reductionTypesFusing), + testing::Values(ElementType::i64), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes), + testing::Values(config_i64)), + testing::Values(emptyCPUSpec), + testing::ValuesIn(fusingParamsSet_KeepNoDims_I64)); + const auto params_MultiAxis_4D_Hybrid_fusing_KeepNoDims = testing::Combine( testing::Combine( - testing::ValuesIn(axesNDFusing), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::Values(false), - testing::ValuesIn(reductionTypesFusing), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - testing::ValuesIn(inputShapes_dyn)), + testing::ValuesIn(axesNDFusing), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn(reductionTypesFusing), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_4D)), testing::ValuesIn(fusingParamsSet_KeepNoDims)); const auto params_MultiAxis_5D_Hybrid_fusing_KeepNoDims = testing::Combine( testing::Combine( - testing::ValuesIn(axes5DFusing), - testing::Values(CommonTestUtils::OpType::VECTOR), - testing::Values(false), - testing::ValuesIn(reductionTypesFusing), - testing::ValuesIn(inpOutPrc()), - testing::Values(ElementType::undefined), - testing::Values(ElementType::undefined), - 
testing::ValuesIn(inputShapes_5D_dyn)), + testing::ValuesIn(axes5DFusing), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(false), + testing::ValuesIn(reductionTypesFusing), + testing::ValuesIn(inpOutPrc()), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_5D_dyn), + testing::Values(empty_config)), testing::ValuesIn(filterCPUSpecificParams(cpuParams_HybridLayout_5D)), testing::ValuesIn(fusingParamsSet_KeepNoDims)); @@ -541,6 +769,13 @@ INSTANTIATE_TEST_SUITE_P( ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_OneAxis_fusing_KeepNoDims_CPU_I64, + ReduceCPULayerTest, + params_OneAxis_fusing_KeepNoDims_I64, + ReduceCPULayerTest::getTestCaseName +); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_MultiAxis_4D_Hybrid_fusing_KeepNoDims_CPU, ReduceCPULayerTest, @@ -557,4 +792,4 @@ INSTANTIATE_TEST_SUITE_P( } // namespace } // namespace Reduce -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/transpose.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/transpose.cpp index b6dbb7657007e2..e16f9197052a00 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/x64/transpose.cpp @@ -5,17 +5,18 @@ #include "single_layer_tests/classes/transpose.hpp" #include "shared_test_classes/single_layer/transpose.hpp" #include "test_utils/cpu_test_utils.hpp" +#include -using namespace InferenceEngine; using namespace CPUTestUtils; -using namespace ngraph::helpers; using namespace ov::test; - namespace CPULayerTestsDefinitions { namespace Transpose { namespace { -std::map additional_config; +ov::AnyMap empty_config = {}; +ov::AnyMap config_i64 = 
{{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES}}; + +const auto cpuParams_nchw = CPUSpecificParams {{nchw}, {}, {}, {}}; const auto cpuParams_ndhwc = CPUSpecificParams {{ndhwc}, {}, {}, {}}; const auto cpuParams_ncdhw = CPUSpecificParams {{ncdhw}, {}, {}, {}}; @@ -26,10 +27,15 @@ const auto cpuParams_nCdhw16c = CPUSpecificParams {{nCdhw16c}, {}, {}, {}}; const auto cpuParams_nChw8c = CPUSpecificParams {{nChw8c}, {}, {}, {}}; const auto cpuParams_nCdhw8c = CPUSpecificParams {{nCdhw8c}, {}, {}, {}}; -const std::vector netPrecisions = { - Precision::I8, - Precision::BF16, - Precision::FP32 +const std::vector staticInputShapes4DC32 = {InputShape{// dynamic + {-1, 32, -1, -1}, + // target + {{4, 32, 16, 14}, {16, 32, 5, 16}, {4, 32, 16, 14}}}}; + +const std::vector netPrecisions = { + ElementType::i8, + ElementType::bf16, + ElementType::f32 }; const std::vector CPUParams4D_blocked = { @@ -37,13 +43,23 @@ const std::vector CPUParams4D_blocked = { cpuParams_nChw8c, }; +const std::vector CPUParams4D = { + cpuParams_nChw16c, + cpuParams_nChw8c, + cpuParams_nchw, +}; + +const std::vector staticInputShapes4DC16 = {InputShape{// dynamic + {-1, 16, -1, -1}, + // target + {{2, 16, 21, 10}, {3, 16, 11, 12}, {2, 16, 21, 10}}}}; + INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC16_TransposeBlocked, TransposeLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(dynamicInputShapes4DC16()), ::testing::ValuesIn(inputOrder4D()), ::testing::ValuesIn(netPrecisions), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::ValuesIn(CPUParams4D_blocked)), TransposeLayerCPUTest::getTestCaseName); @@ -52,8 +68,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC32_TransposeBlocked, TransposeLaye ::testing::ValuesIn(dynamicInputShapes4DC32()), ::testing::ValuesIn(inputOrder4D()), ::testing::ValuesIn(netPrecisions), - 
::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::ValuesIn(CPUParams4D_blocked)), TransposeLayerCPUTest::getTestCaseName); @@ -61,9 +76,35 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamicShapes4D_Transpose, TransposeLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(dynamicInputShapes4D()), ::testing::ValuesIn(inputOrder4D()), - ::testing::Values(Precision::BF16), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(ElementType::bf16), + ::testing::Values(empty_config), + ::testing::Values(CPUSpecificParams{})), + TransposeLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC16_Transpose_I64, TransposeLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(staticInputShapes4DC16), + ::testing::ValuesIn(inputOrder4D()), + ::testing::Values(ElementType::i64), + ::testing::Values(config_i64), + ::testing::ValuesIn(CPUParams4D)), + TransposeLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_staticShapes4DC32_Transpose_i64, TransposeLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(staticInputShapes4DC32), + ::testing::ValuesIn(inputOrder4D()), + ::testing::Values(ElementType::i64), + ::testing::Values(config_i64), + ::testing::ValuesIn(CPUParams4D)), + TransposeLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_dynamicShapes4D_Transpose_I64, TransposeLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(dynamicInputShapes4D()), + ::testing::ValuesIn(inputOrder4D()), + ::testing::Values(ElementType::i64), + ::testing::Values(config_i64), ::testing::Values(CPUSpecificParams{})), TransposeLayerCPUTest::getTestCaseName); @@ -122,8 +163,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes5DC16_Transpose, TransposeLayerCPUTes ::testing::ValuesIn(staticInputShapes5DC16), ::testing::ValuesIn(inputOrder5D), ::testing::ValuesIn(netPrecisions), - 
::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::ValuesIn(CPUParams5D)), TransposeLayerCPUTest::getTestCaseName); @@ -132,8 +172,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes5DC32_Transpose, TransposeLayerCPUTes ::testing::ValuesIn(staticInputShapes5DC32), ::testing::ValuesIn(inputOrder5D), ::testing::ValuesIn(netPrecisions), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::ValuesIn(CPUParams5D)), TransposeLayerCPUTest::getTestCaseName); @@ -141,9 +180,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamicShapes5D_Transpose, TransposeLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(dynamicInputShapes5D), ::testing::ValuesIn(inputOrder5D), - ::testing::Values(Precision::BF16), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(ElementType::bf16), + ::testing::Values(empty_config), ::testing::Values(CPUSpecificParams{})), TransposeLayerCPUTest::getTestCaseName); @@ -152,8 +190,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes5DC16_PermutePerChannels, TransposeLa ::testing::ValuesIn(staticInputShapes5DC16), ::testing::ValuesIn(inputOrderPerChannels5D), ::testing::ValuesIn(netPrecisionsPerChannels()), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::Values(cpuParams_ndhwc)), TransposeLayerCPUTest::getTestCaseName); @@ -162,8 +199,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_staticShapes5DC32_PermutePerChannels, TransposeLa ::testing::ValuesIn(staticInputShapes5DC32), ::testing::ValuesIn(inputOrderPerChannels5D), ::testing::ValuesIn(netPrecisionsPerChannels()), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::Values(cpuParams_ndhwc)), TransposeLayerCPUTest::getTestCaseName); @@ -172,10 
+208,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_dynamicShapes5D_PermutePerChannels, TransposeLaye ::testing::ValuesIn(dynamicInputShapes5D), ::testing::ValuesIn(inputOrderPerChannels5D), ::testing::ValuesIn(netPrecisionsPerChannels()), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), + ::testing::Values(empty_config), ::testing::Values(CPUSpecificParams{})), TransposeLayerCPUTest::getTestCaseName); } // namespace } // namespace Transpose -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/minimum_maximum.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/minimum_maximum.cpp new file mode 100644 index 00000000000000..ced17bab6b306c --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/minimum_maximum.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" +#include +#include "test_utils/fusing_test_utils.hpp" +#include + + +using namespace CPUTestUtils; +using namespace ov::test; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + std::vector, // Input shapes + ngraph::helpers::MinMaxOpType, // Operation type + ElementType, // Net precision + ngraph::helpers::InputLayerType, // Second input type: Parameter or Constant + ov::AnyMap // Additional network configuration +> basicMinMaxParams; + +typedef std::tuple< + basicMinMaxParams, + CPUSpecificParams> MinMaxLayerCPUTestParamSet; + +class MinMaxCPULayerTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, public CpuTestWithFusing { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + basicMinMaxParams basicParams; + CPUSpecificParams cpuParams; + 
std::tie(basicParams, cpuParams) = obj.param; + + std::vector inputShapes; + ngraph::helpers::MinMaxOpType opType; + ElementType netPrecision; + ngraph::helpers::InputLayerType layerType; + ov::AnyMap config; + + std::tie(inputShapes, opType, netPrecision, layerType, config) = basicParams; + + std::ostringstream result; + result << "IS=("; + for (const auto& shape : inputShapes) { + result << CommonTestUtils::partialShape2str({shape.first}) << "_"; + } + result << ")_TS=("; + for (const auto& shape : inputShapes) { + for (const auto& item : shape.second) { + result << CommonTestUtils::vec2str(item) << "_"; + } + } + if (opType == ngraph::helpers::MinMaxOpType::MINIMUM) { + result << "opType=MIN_"; + } else { + result << "opType=MAX_"; + } + result << "netPRC=" << netPrecision << "_"; + result << "type=" << layerType; + for (auto const& configItem : config) { + result << "_configItem=" << configItem.first << "_"; + configItem.second.print(result); + } + + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } + +protected: + void SetUp() override { + targetDevice = CommonTestUtils::DEVICE_CPU; + + basicMinMaxParams basicParams; + CPUSpecificParams cpuParams; + std::tie(basicParams, cpuParams) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + std::vector inputShapes; + ngraph::helpers::MinMaxOpType opType; + ElementType netPrecision; + ngraph::helpers::InputLayerType layerType; + + std::tie(inputShapes, opType, netPrecision, layerType, configuration) = basicParams; + + init_input_shapes(inputShapes); + + auto params = ngraph::builder::makeDynamicParams(netPrecision, inputDynamicShapes); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + + auto maxMinNode = ngraph::builder::makeMinMax(paramOuts[0], paramOuts[1], opType); + + if (netPrecision == ElementType::i64 || netPrecision == ElementType::u64) { + auto i64It = 
configuration.find(InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64It == configuration.end() || i64It->second == InferenceEngine::PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i64); + } + } else if (netPrecision == ElementType::boolean) { + selectedType = makeSelectedTypeStr(getPrimitiveType(), ElementType::i8); + } else { + selectedType = makeSelectedTypeStr(getPrimitiveType(), netPrecision); + } + + function = makeNgraphFunction(netPrecision, params, maxMinNode, "MinMax"); + } +}; + +TEST_P(MinMaxCPULayerTest, CompareWithRefs) { + run(); +} + +namespace { + +const std::vector netPrecisions = { ElementType::f32, ElementType::i32 }; + +std::vector> inShapesStatic = { + { {{}, {{2}}}, {{}, {{1}}} }, + { {{}, {{1, 1, 1, 3}}}, {{}, {{1}}} }, + { {{}, {{1, 2, 4}}}, {{}, {{1}}} }, + { {{}, {{1, 4, 4}}}, {{}, {{1}}} }, + { {{}, {{1, 4, 4, 1}}}, {{}, {{1}}} }, + { {{}, {{256, 56}}}, {{}, {{256, 56}}} }, + { {{}, {{8, 1, 6, 1}}}, {{}, {{7, 1, 5}}} } +}; + +const std::vector opType = { + ngraph::helpers::MinMaxOpType::MINIMUM, + ngraph::helpers::MinMaxOpType::MAXIMUM, +}; + +const std::vector inputType = { + ngraph::helpers::InputLayerType::CONSTANT, + ngraph::helpers::InputLayerType::PARAMETER, +}; + +ov::AnyMap config = {}; +ov::AnyMap config_i64 = {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES}}; + +INSTANTIATE_TEST_SUITE_P(smoke_MinMax, MinMaxCPULayerTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic), + ::testing::ValuesIn(opType), + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(inputType), + ::testing::Values(config)), + testing::Values(emptyCPUSpec)), + MinMaxCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MinMax_I64, MinMaxCPULayerTest, + ::testing::Combine( + ::testing::Combine( 
+ ::testing::ValuesIn(inShapesStatic), + ::testing::ValuesIn(opType), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputType), + ::testing::Values(config_i64)), + testing::Values(emptyCPUSpec)), + MinMaxCPULayerTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp index 482c9e3cff39ea..e675852f48e491 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp @@ -10,6 +10,7 @@ #include #include "test_utils/cpu_test_utils.hpp" #include "shared_test_classes/base/utils/ranges.hpp" +#include using namespace ov::test; using namespace ngraph; @@ -43,9 +44,9 @@ using NmsParams = std::tuple; // Device name class NmsLayerCPUTest : public testing::WithParamInterface, virtual public SubgraphBaseTest, public CPUTestsBase { @@ -129,41 +130,49 @@ class NmsLayerCPUTest : public testing::WithParamInterface, virtual p std::tie(bounds, targetInDims) = inShapeParams; if (!bounds.empty()) { - inputDynamicShapes = std::vector{{bounds[BATCHES], bounds[BOXES], 4}, {bounds[BATCHES], bounds[CLASSES], bounds[BOXES]}}; + inputDynamicShapes = std::vector{{bounds[BATCHES], bounds[BOXES], 4}, {bounds[BATCHES], bounds[CLASSES], bounds[BOXES]}}; } else { size_t batches, boxes, classes; std::tie(batches, boxes, classes) = targetInDims.front(); ov::Dimension numBatches(batches), numBoxes(boxes), numClasses(classes); - inputDynamicShapes = std::vector{{numBatches, numBoxes, 4}, {numBatches, numClasses, numBoxes}}; + inputDynamicShapes = std::vector{{numBatches, numBoxes, 4}, {numBatches, numClasses, numBoxes}}; } for (const auto &ts : targetInDims) { size_t numBatches, numBoxes, numClasses; std::tie(numBatches, numBoxes, numClasses) = ts; - 
targetStaticShapes.push_back(std::vector{{numBatches, numBoxes, 4}, {numBatches, numClasses, numBoxes}}); + targetStaticShapes.push_back(std::vector{{numBatches, numBoxes, 4}, {numBatches, numClasses, numBoxes}}); if (maxOutBoxesType == ngraph::helpers::InputLayerType::PARAMETER) { - targetStaticShapes.back().push_back(ngraph::Shape{1}); + targetStaticShapes.back().push_back(ov::Shape{1}); } } - std::shared_ptr maxOutBoxesPerClassNode; + std::shared_ptr maxOutBoxesPerClassNode; auto params = ngraph::builder::makeDynamicParams(paramsPrec, inputDynamicShapes); params[0]->set_friendly_name("param_1"); params[1]->set_friendly_name("param_2"); if (maxOutBoxesType == ngraph::helpers::InputLayerType::PARAMETER) { - inputDynamicShapes.push_back(ngraph::PartialShape{1}); - params.push_back(std::make_shared(element::Type_t::i32, inputDynamicShapes.back())); + inputDynamicShapes.push_back(ov::PartialShape{1}); + if (maxBoxPrec == ElementType::i64) { + params.push_back(std::make_shared(element::Type_t::i64, inputDynamicShapes.back())); + } else { + params.push_back(std::make_shared(element::Type_t::i32, inputDynamicShapes.back())); + } params[1]->set_friendly_name("param_3"); maxOutBoxesPerClassNode = params.back(); } else { - maxOutBoxesPerClassNode = builder::makeConstant(maxBoxPrec, ngraph::Shape{}, std::vector{maxOutBoxesPerClass}); + if (maxBoxPrec == ElementType::i64) { + maxOutBoxesPerClassNode = builder::makeConstant(maxBoxPrec, ov::Shape{}, std::vector{maxOutBoxesPerClass}); + } else { + maxOutBoxesPerClassNode = builder::makeConstant(maxBoxPrec, ov::Shape{}, std::vector{maxOutBoxesPerClass}); + } } - auto iouThrNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector{iouThr})->output(0); - auto scoreThrNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector{scoreThr})->output(0); - auto softNmsSigmaNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector{softNmsSigma})->output(0); - auto nms = std::make_shared(params[0], params[1], 
maxOutBoxesPerClassNode, iouThrNode, scoreThrNode, + auto iouThrNode = builder::makeConstant(thrPrec, ov::Shape{}, std::vector{iouThr})->output(0); + auto scoreThrNode = builder::makeConstant(thrPrec, ov::Shape{}, std::vector{scoreThr})->output(0); + auto softNmsSigmaNode = builder::makeConstant(thrPrec, ov::Shape{}, std::vector{softNmsSigma})->output(0); + auto nms = std::make_shared(params[0], params[1], maxOutBoxesPerClassNode, iouThrNode, scoreThrNode, softNmsSigmaNode, boxEncoding, sortResDescend, outType); function = makeNgraphFunction(paramsPrec, params, nms, "NMS"); @@ -433,6 +442,22 @@ const auto nmsParams = ::testing::Combine(::testing::ValuesIn(inShapeParams), ::testing::Values(CommonTestUtils::DEVICE_CPU) ); +const auto nmsParams_i64 = ::testing::Combine(::testing::ValuesIn(inShapeParams), + ::testing::Combine(::testing::Values(ElementType::f32), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::f32)), + ::testing::ValuesIn(maxOutBoxPerClass), + ::testing::Combine(::testing::ValuesIn(threshold), + ::testing::ValuesIn(threshold), + ::testing::ValuesIn(sigmaThreshold)), + ::testing::ValuesIn(maxBoxInputTypes), + ::testing::ValuesIn(encodType), + ::testing::ValuesIn(sortResDesc), + ::testing::ValuesIn(outType), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + INSTANTIATE_TEST_SUITE_P(smoke_NmsLayerCPUTest, NmsLayerCPUTest, nmsParams, NmsLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_NmsLayerCPUTest_i64, NmsLayerCPUTest, nmsParams_i64, NmsLayerCPUTest::getTestCaseName); } // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/one_hot.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/one_hot.cpp index 3d761c34917275..9e5d53b93353bd 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/one_hot.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/one_hot.cpp @@ -6,6 +6,7 @@ #include #include 
"test_utils/cpu_test_utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" +#include using namespace InferenceEngine; using namespace CPUTestUtils; @@ -20,7 +21,9 @@ using oneHotCPUTestParams = std::tuple< size_t, // depth float, // on_value float, // off_value - InferenceEngine::Precision, // Output precision + ElementType, // Input precision + ElementType, // Output precision + ov::AnyMap, // Additional network configuration CPUSpecificParams>; class OneHotLayerCPUTest : public testing::WithParamInterface, @@ -32,9 +35,10 @@ class OneHotLayerCPUTest : public testing::WithParamInterface inputType; size_t depth; float onValue, offValue; - InferenceEngine::Precision outPrc; + ElementType inPrc, outPrc; + ov::AnyMap additionalConfig; CPUSpecificParams cpuParams; - std::tie(inputShape, axis, inputType, depth, onValue, offValue, outPrc, cpuParams) = obj.param; + std::tie(inputShape, axis, inputType, depth, onValue, offValue, inPrc, outPrc, additionalConfig, cpuParams) = obj.param; std::ostringstream result; if (inputShape.first.size() != 0) { @@ -54,11 +58,21 @@ class OneHotLayerCPUTest : public testing::WithParamInterfacesecond == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr(selectedType, inType); + } init_input_shapes({inputShape}); if (inputType.second) { @@ -102,6 +126,7 @@ class OneHotLayerCPUTest : public testing::WithParamInterface &funcRef, const std::vector& targetInputStaticShapes) override { if (function->get_parameters().size() == 2) { generateDepth(); @@ -109,6 +134,7 @@ class OneHotLayerCPUTest : public testing::WithParamInterfaceget_parameters().size() == 2) { @@ -128,24 +154,26 @@ class OneHotLayerCPUTest : public testing::WithParamInterface createFunction(bool depthConst) { - auto params = ngraph::builder::makeDynamicParams(ngraph::element::i32, 
{inputDynamicShapes.front()}); + + std::shared_ptr createFunction(bool depthConst) { + auto params = ngraph::builder::makeDynamicParams(inType, {inputDynamicShapes.front()}); params.front()->set_friendly_name("ParamsIndices"); std::shared_ptr depth; if (depthConst) { - depth = ngraph::op::Constant::create(ngraph::element::i32, ngraph::Shape{ }, {Depth}); + depth = ov::op::v0::Constant::create(inType, ov::Shape{ }, {Depth}); } else { - auto depthParam = std::make_shared(ngraph::element::i32, ngraph::Shape{ }); + auto depthParam = std::make_shared(inType, ov::Shape{ }); depthParam->set_friendly_name("ParamDepth"); params.push_back(depthParam); depth = depthParam; } - auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); - auto on_value_const = std::make_shared(outType, ngraph::Shape{ }, OnValue); - auto off_value_const = std::make_shared(outType, ngraph::Shape{ }, OffValue); - auto oneHot = std::make_shared(paramOuts[0], depth, on_value_const, off_value_const, Axis); - return makeNgraphFunction(ngraph::element::i32, params, oneHot, "OneHot"); + auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); + auto on_value_const = std::make_shared(outType, ov::Shape{ }, OnValue); + auto off_value_const = std::make_shared(outType, ov::Shape{ }, OffValue); + auto oneHot = std::make_shared(paramOuts[0], depth, on_value_const, off_value_const, Axis); + return makeNgraphFunction(inType, params, oneHot, "OneHot"); } + void generateDepth() { testing::internal::Random random(time(nullptr)); random.Generate(10); @@ -163,13 +191,19 @@ TEST_P(OneHotLayerCPUTest, CompareWithRefs) { } namespace { -const std::vector outPrc = { - Precision::FP32, - Precision::BF16, - Precision::I8 - // Precision::U8 // Precision cannot be wrapped to constant one hot +const std::vector inPrc = { + ElementType::i32, }; +const std::vector outPrc = { + ElementType::f32, + ElementType::bf16, + ElementType::i8 + // ElementType::u8 
// Precision cannot be wrapped to constant one hot +}; + +const CPUSpecificParams cpuParamsRef{{}, {}, {"ref_any"}, "ref_any"}; + std::vector> secondaryInputTypesStaticCase = { {ngraph::helpers::InputLayerType::CONSTANT, true}, {ngraph::helpers::InputLayerType::CONSTANT, false} @@ -184,6 +218,11 @@ const std::vector staticInputShapes0D = { { } }; +const ov::AnyMap i64Config = { + {PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES} +}; +const ov::AnyMap emptyConfig = {}; + // 0d -> 1d, depth const auto testCase_1d = ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(staticInputShapes0D)), @@ -192,11 +231,27 @@ const auto testCase_1d = ::testing::Combine( ::testing::Values(3), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_1D, OneHotLayerCPUTest, testCase_1d, OneHotLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_1D_I64, OneHotLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(staticInputShapes0D)), + ::testing::Values(-1, 0), + ::testing::ValuesIn(secondaryInputTypesStaticCase), + ::testing::Values(3), + ::testing::Values(1.f), + ::testing::Values(0.f), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i64), + ::testing::Values(i64Config), + ::testing::Values(cpuParamsRef)), + OneHotLayerCPUTest::getTestCaseName); + const std::vector staticInputShapes1D = { { 3 } }; @@ -208,11 +263,27 @@ const auto testCase_2d_static = ::testing::Combine( ::testing::Values(6), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_2D_Static, 
OneHotLayerCPUTest, testCase_2d_static, OneHotLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_2D_I64_Static, OneHotLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(staticInputShapes1D)), + ::testing::Values(-1, 0, 1), + ::testing::ValuesIn(secondaryInputTypesStaticCase), + ::testing::Values(6), + ::testing::Values(1.f), + ::testing::Values(0.f), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i64), + ::testing::Values(i64Config), + ::testing::Values(cpuParamsRef)), + OneHotLayerCPUTest::getTestCaseName); + const std::vector dynamicInputShapes1D = { {{-1}, {{3}, {4}, {5}}}, {{{1, 5}}, {{1}, {3}, {5}}}, @@ -225,8 +296,10 @@ const auto testCase_2d_dynamic = ::testing::Combine( ::testing::Values(6), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_2D_Dynamic, OneHotLayerCPUTest, testCase_2d_dynamic, OneHotLayerCPUTest::getTestCaseName); @@ -241,8 +314,10 @@ const auto testCase_3d_static = ::testing::Combine( ::testing::Values(4), ::testing::Values(2.f), ::testing::Values(-1.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_3D_Static, OneHotLayerCPUTest, testCase_3d_static, OneHotLayerCPUTest::getTestCaseName); @@ -259,8 +334,10 @@ const auto testCase_3d_dynamic = ::testing::Combine( ::testing::Values(4), ::testing::Values(2.f), ::testing::Values(-1.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_3D_Dynamic, OneHotLayerCPUTest, testCase_3d_dynamic, 
OneHotLayerCPUTest::getTestCaseName); @@ -275,8 +352,10 @@ const auto testCase_4d_static = ::testing::Combine( ::testing::Values(4), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_4D_Static, OneHotLayerCPUTest, testCase_4d_static, OneHotLayerCPUTest::getTestCaseName); @@ -293,11 +372,27 @@ const auto testCase_4d_dynamic = ::testing::Combine( ::testing::Values(4), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_4D_Dynamic, OneHotLayerCPUTest, testCase_4d_dynamic, OneHotLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_4D_I64_Dynamic, OneHotLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(dynamicInputShapes3D), + ::testing::Values(-1, 0, 1, 2), + ::testing::ValuesIn(secondaryInputTypesDynamicCase), + ::testing::Values(4), + ::testing::Values(1.f), + ::testing::Values(0.f), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::i64), + ::testing::Values(i64Config), + ::testing::Values(cpuParamsRef)), + OneHotLayerCPUTest::getTestCaseName); + const std::vector staticInputShapes4D = { { 1, 3, 2, 3 } }; @@ -309,8 +404,10 @@ const auto testCase_5d_static = ::testing::Combine( ::testing::Values(4), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_5D_Static, OneHotLayerCPUTest, testCase_5d_static, OneHotLayerCPUTest::getTestCaseName); @@ -327,8 +424,10 @@ const auto testCase_5d_dynamic = ::testing::Combine( 
::testing::Values(4), ::testing::Values(1.f), ::testing::Values(0.f), + ::testing::ValuesIn(inPrc), ::testing::ValuesIn(outPrc), - ::testing::Values(emptyCPUSpec) + ::testing::Values(emptyConfig), + ::testing::Values(cpuParamsRef) ); INSTANTIATE_TEST_SUITE_P(smoke_OneHotCPU_5D_Dynamic, OneHotLayerCPUTest, testCase_5d_dynamic, OneHotLayerCPUTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_ND_update.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_ND_update.cpp index 019c40e390cbd8..a282a46e0ccf8e 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_ND_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_ND_update.cpp @@ -165,6 +165,7 @@ const std::vector scatterParams = { const std::vector inputPrecisions = { ElementType::f32, ElementType::i32, + ElementType::i64 }; const std::vector constantPrecisions = { diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_elements_update.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_elements_update.cpp index bc6c5b33692077..9e908d375b01f8 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_elements_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_elements_update.cpp @@ -156,6 +156,7 @@ const std::vector scatterParams = { const std::vector inputPrecisions = { ElementType::f32, ElementType::i32, + ElementType::i64 }; const std::vector constantPrecisions = { diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_update.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_update.cpp index bc65fd172874d5..ec749f870aea92 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/scatter_update.cpp @@ -5,8 +5,8 @@ #include "test_utils/cpu_test_utils.hpp" 
#include "shared_test_classes/base/ov_subgraph.hpp" #include "ngraph_functions/builders.hpp" +#include -using namespace ngraph; using namespace InferenceEngine; using namespace CPUTestUtils; using namespace ov::test; @@ -25,7 +25,8 @@ struct ScatterUpdateLayerParams { using scatterUpdateParams = std::tuple< ScatterUpdateLayerParams, ElementType, // input precision - ElementType>; // indices precision + ElementType, // indices precision + ov::AnyMap>; // Additional network configuration class ScatterUpdateLayerCPUTest : public testing::WithParamInterface, public SubgraphBaseTest, public CPUTestsBase { public: @@ -33,7 +34,8 @@ class ScatterUpdateLayerCPUTest : public testing::WithParamInterfaceGetParam(); + std::tie(scatterParams, inputPrecision, idxPrecision, configuration) = this->GetParam(); const auto inputShapes = scatterParams.inputShapes; const auto indicesDescr = scatterParams.indicesDescriprion; const auto axis = scatterParams.axis; init_input_shapes(inputShapes); - selectedType = makeSelectedTypeStr("unknown", inputPrecision); + + if (inputPrecision == ElementType::i64 || inputPrecision == ElementType::u64) { + auto i64It = configuration.find(PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64It == configuration.end() || i64It->second == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr("unknown", ElementType::i32); + } else { + selectedType = makeSelectedTypeStr("unknown", ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr("unknown", inputPrecision); + } auto params = ngraph::builder::makeDynamicParams(inputPrecision, inputDynamicShapes); - auto indicesNode = ngraph::opset1::Constant::create(idxPrecision, indicesDescr.first, indicesDescr.second); - auto axis_node = ngraph::opset1::Constant::create(idxPrecision, {}, { axis }); - auto scatter = std::make_shared(params[0], indicesNode, params[1], axis_node); + auto indicesNode = ov::op::v0::Constant::create(idxPrecision, indicesDescr.first, indicesDescr.second); + auto 
axis_node = ov::op::v0::Constant::create(idxPrecision, {}, { axis }); + auto scatter = std::make_shared(params[0], indicesNode, params[1], axis_node); function = makeNgraphFunction(inputPrecision, params, scatter, "ScatterUpdateLayerCPUTest"); } @@ -127,9 +145,12 @@ const std::vector scatterParams = { }, }; +ov::AnyMap config = {}; +ov::AnyMap config_i64 = {{PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES}}; + const std::vector inputPrecisions = { ElementType::f32, - ElementType::i32, + ElementType::i32 }; const std::vector constantPrecisions = { @@ -138,9 +159,19 @@ const std::vector constantPrecisions = { }; INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs, ScatterUpdateLayerCPUTest, - ::testing::Combine( - ::testing::ValuesIn(scatterParams), - ::testing::ValuesIn(inputPrecisions), - ::testing::ValuesIn(constantPrecisions)), - ScatterUpdateLayerCPUTest::getTestCaseName); + ::testing::Combine( + ::testing::ValuesIn(scatterParams), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(constantPrecisions), + ::testing::Values(config)), + ScatterUpdateLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_I64, ScatterUpdateLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(scatterParams), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(constantPrecisions), + ::testing::Values(config_i64)), + ScatterUpdateLayerCPUTest::getTestCaseName); + } // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/split.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/split.cpp index e7e9b86aa68088..b369ca19ad649b 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/split.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/split.cpp @@ -5,7 +5,9 @@ #include "shared_test_classes/base/ov_subgraph.hpp" #include "ngraph_functions/builders.hpp" #include "test_utils/cpu_test_utils.hpp" +#include +using namespace 
InferenceEngine; using namespace ov::test; using namespace CPUTestUtils; @@ -17,6 +19,7 @@ typedef std::tuple< ElementType, // Net precision InputShape, // Input shapes std::vector, // Used outputs indices + ov::AnyMap, // Additional network configuration CPUSpecificParams > splitCPUTestParams; @@ -28,9 +31,10 @@ class SplitLayerCPUTest : public testing::WithParamInterface int64_t axis; ElementType netPrecision; InputShape inputShapes; - InferenceEngine::SizeVector outIndices; + SizeVector outIndices; + ov::AnyMap config; CPUSpecificParams cpuParams; - std::tie(numSplits, axis, netPrecision, inputShapes, outIndices, cpuParams) = obj.param; + std::tie(numSplits, axis, netPrecision, inputShapes, outIndices, config, cpuParams) = obj.param; std::ostringstream result; result << "IS="; @@ -45,7 +49,12 @@ class SplitLayerCPUTest : public testing::WithParamInterface result << "outIndices" << CommonTestUtils::vec2str(outIndices) << "_"; } result << "netPRC=" << netPrecision << "_"; + for (auto const& configItem : config) { + result << "_configItem=" << configItem.first << "_"; + configItem.second.print(result); + } result << CPUTestsBase::getTestCaseName(cpuParams); + return result.str(); } @@ -56,9 +65,9 @@ class SplitLayerCPUTest : public testing::WithParamInterface size_t axis, numSplits; ElementType netPrecision; InputShape inputShapes; - InferenceEngine::SizeVector outIndices; + SizeVector outIndices; CPUSpecificParams cpuParams; - std::tie(numSplits, axis, netPrecision, inputShapes, outIndices, cpuParams) = this->GetParam(); + std::tie(numSplits, axis, netPrecision, inputShapes, outIndices, configuration, cpuParams) = this->GetParam(); if (outIndices.empty()) { for (size_t i = 0; i < numSplits; ++i) { outIndices.push_back(i); @@ -66,27 +75,37 @@ class SplitLayerCPUTest : public testing::WithParamInterface } std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; - selectedType += std::string("_") + 
InferenceEngine::details::convertPrecision(netPrecision).name(); + + if (netPrecision == ElementType::i64) { + auto i64Flag = configuration.find(PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64Flag == configuration.end() || i64Flag->second == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr(selectedType, netPrecision); + } init_input_shapes({inputShapes}); auto params = ngraph::builder::makeDynamicParams(netPrecision, inputDynamicShapes); auto paramOuts = ngraph::helpers::convert2OutputVector( - ngraph::helpers::castOps2Nodes(params)); - auto split = std::dynamic_pointer_cast(ngraph::builder::makeSplit(paramOuts[0], + ngraph::helpers::castOps2Nodes(params)); + auto split = std::dynamic_pointer_cast(ngraph::builder::makeSplit(paramOuts[0], netPrecision, numSplits, axis)); - ngraph::ResultVector results; + ov::ResultVector results; for (size_t i = 0; i < outIndices.size(); i++) { // This WA is necessary because result nodes connected to the same output of the split node (or any node) are deduplicated - // on the CNNNetwork level. It might not be needed when the CPU plugin moves completely to nGraph. + // on the CNNNetwork level. It might not be needed when the CPU plugin moves completely to Core model. // This is still a single layer test since the Relu nodes are added only as a WA. 
- auto fakeEltwise = std::make_shared(split->output(outIndices[i])); - results.push_back(std::make_shared(fakeEltwise)); + auto fakeEltwise = std::make_shared(split->output(outIndices[i])); + results.push_back(std::make_shared(fakeEltwise)); } split->get_rt_info() = getCPUInfo(); - function = std::make_shared(results, params, "split"); + function = std::make_shared(results, params, "split"); } }; @@ -120,6 +139,9 @@ const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"}; const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"}; +ov::AnyMap additional_config = {}; +ov::AnyMap i64Config = {{PluginConfigInternalParams::KEY_CPU_NATIVE_I64, PluginConfigParams::YES}}; + // List of precisions natively supported by onednn. const std::vector netPrecisions = { ElementType::i8, @@ -172,6 +194,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes4D_Nspc2NcspSpecial), ::testing::ValuesIn(outIndices4), + ::testing::Values(additional_config), + ::testing::Values(perChannelsToPlanar_4D)), + SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_Nspc2NcspSpecial_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(4), + ::testing::Values(1), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes4D_Nspc2NcspSpecial), + ::testing::ValuesIn(outIndices4), + ::testing::Values(i64Config), ::testing::Values(perChannelsToPlanar_4D)), SplitLayerCPUTest::getTestCaseName); @@ -206,6 +240,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes5D_Nspc2NcspSpecial), ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), + ::testing::Values(perChannelsToPlanar_5D)), + 
SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_Nspc2NcspSpecial_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(3), + ::testing::Values(1), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes5D_Nspc2NcspSpecial), + ::testing::ValuesIn(outIndices3), + ::testing::Values(i64Config), ::testing::Values(perChannelsToPlanar_5D)), SplitLayerCPUTest::getTestCaseName); @@ -249,6 +295,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_planar, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes4D_planar), ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), + ::testing::Values(planar_4D_ref, perChannels_4D)), + SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_planar_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(3), + ::testing::Values(2, 3), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes4D_planar), + ::testing::ValuesIn(outIndices3), + ::testing::Values(i64Config), ::testing::Values(planar_4D_ref, perChannels_4D)), SplitLayerCPUTest::getTestCaseName); @@ -292,6 +350,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_Block8, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes4D_block), ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), + ::testing::Values(blocked8_4D_ref)), + SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_Block8_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(3), + ::testing::Values(2, 3), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes4D_block), + ::testing::ValuesIn(outIndices3), + ::testing::Values(i64Config), ::testing::Values(blocked8_4D_ref)), SplitLayerCPUTest::getTestCaseName); @@ -302,6 +372,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_Block16, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), 
::testing::ValuesIn(inputShapes4D_block), ::testing::ValuesIn(outIndices4), + ::testing::Values(additional_config), ::testing::Values(blocked16_4D_ref)), SplitLayerCPUTest::getTestCaseName); @@ -336,6 +407,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_planar, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes5D_planar), ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), + ::testing::Values(planar_5D_ref, perChannels_5D)), + SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_planar_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(3), + ::testing::Values(2, 3, 4), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes5D_planar), + ::testing::ValuesIn(outIndices3), + ::testing::Values(i64Config), ::testing::Values(planar_5D_ref, perChannels_5D)), SplitLayerCPUTest::getTestCaseName); @@ -370,6 +453,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_Block8, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes5D_block), ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), + ::testing::Values(blocked8_5D_ref)), + SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_Block8_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(3), + ::testing::Values(2, 3, 4), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes5D_block), + ::testing::ValuesIn(outIndices3), + ::testing::Values(i64Config), ::testing::Values(blocked8_5D_ref)), SplitLayerCPUTest::getTestCaseName); @@ -380,6 +475,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_Block16, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes5D_block), ::testing::ValuesIn(outIndices4), + ::testing::Values(additional_config), ::testing::Values(blocked16_5D_ref)), SplitLayerCPUTest::getTestCaseName); @@ -414,6 +510,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split3D, 
SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes3D), ::testing::Values(std::vector({})), + ::testing::Values(additional_config), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + SplitLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Split3D_I64, SplitLayerCPUTest, + ::testing::Combine( + ::testing::Values(7), + ::testing::Values(1, 2), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(inputShapes3D), + ::testing::Values(std::vector({})), + ::testing::Values(i64Config), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), SplitLayerCPUTest::getTestCaseName); @@ -448,6 +556,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split2D, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes2D), ::testing::Values(std::vector({})), + ::testing::Values(additional_config), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), SplitLayerCPUTest::getTestCaseName); @@ -458,8 +567,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split1D_static, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::Values(InputShape{ {}, {{10}} }), ::testing::Values(std::vector({})), + ::testing::Values(additional_config), ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), - SplitLayerCPUTest::getTestCaseName); + SplitLayerCPUTest::getTestCaseName); const std::vector inputShapes1D = { { @@ -491,6 +601,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split1D, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes1D), ::testing::Values(std::vector({})), + ::testing::Values(additional_config), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), SplitLayerCPUTest::getTestCaseName); @@ -513,6 +624,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_by_batch, SplitLayerCPUTest, ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(inputShapes4D_dynBatch), ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), ::testing::Values(planar_4D_ref, 
perChannels_4D)), SplitLayerCPUTest::getTestCaseName); @@ -557,6 +669,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split_CPU_planar_inPlace_0, SplitLayerCPUTest, ::testing::Values(ElementType::f32), ::testing::ValuesIn(inputShapes4D_inPlace_0), ::testing::Values(std::vector{}), + ::testing::Values(additional_config), ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), SplitLayerCPUTest::getTestCaseName); @@ -573,24 +686,26 @@ INSTANTIATE_TEST_SUITE_P(smoke_Split4D_CPU_Block8inPlace_1, SplitLayerCPUTest, {1, 32, 5, 8} } }), ::testing::ValuesIn(outIndices4), + ::testing::Values(additional_config), ::testing::Values(planar_4D, blocked8_4D)), SplitLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Split5D_CPU_Block16inPlace_1, SplitLayerCPUTest, - ::testing::Combine( - ::testing::Values(3), - ::testing::Values(1), - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InputShape{ {}, {{1, 48, 5, 6, 3}} }, - InputShape{ {1, 48, -1, -1, 3}, - { - {1, 48, 5, 6, 3}, - {1, 48, 5, 2, 3}, - {1, 48, 5, 8, 3} - } }), - ::testing::ValuesIn(outIndices3), - ::testing::Values(planar_5D, blocked16_5D)), - SplitLayerCPUTest::getTestCaseName); + ::testing::Combine( + ::testing::Values(3), + ::testing::Values(1), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InputShape{ {}, {{1, 48, 5, 6, 3}} }, + InputShape{ {1, 48, -1, -1, 3}, + { + {1, 48, 5, 6, 3}, + {1, 48, 5, 2, 3}, + {1, 48, 5, 8, 3} + } }), + ::testing::ValuesIn(outIndices3), + ::testing::Values(additional_config), + ::testing::Values(planar_5D, blocked16_5D)), + SplitLayerCPUTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/strided_slice.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/strided_slice.cpp index 310980153133d7..92f33df8353878 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/strided_slice.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/strided_slice.cpp @@ -7,6 +7,7 @@ 
#include "test_utils/cpu_test_utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include +#include using namespace InferenceEngine; using namespace CPUTestUtils; @@ -32,6 +33,7 @@ typedef std::tuple< StridedSliceParams, ngraph::helpers::InputLayerType, // Secondary input types ElementType, // Element type + ov::AnyMap, // Additional network configuration CPUSpecificParams> StridedSliceLayerCPUTestParamSet; class StridedSliceLayerCPUTest : public testing::WithParamInterface, @@ -43,7 +45,8 @@ class StridedSliceLayerCPUTest : public testing::WithParamInterfacesecond == PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr(selectedType, dataType); + } + targetDevice = CommonTestUtils::DEVICE_CPU; std::vector input_shapes = {shapes}; @@ -104,17 +127,17 @@ class StridedSliceLayerCPUTest : public testing::WithParamInterface ss; + std::shared_ptr ss; if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { ov::Shape inShape = {ssParams.begin.size()}; - auto beginNode = std::make_shared(ov::element::i64, inShape); - auto endNode = std::make_shared(ov::element::i64, inShape); - auto strideNode = std::make_shared(ov::element::i64, inShape); + auto beginNode = std::make_shared(ov::element::i64, inShape); + auto endNode = std::make_shared(ov::element::i64, inShape); + auto strideNode = std::make_shared(ov::element::i64, inShape); - params.push_back(std::dynamic_pointer_cast(beginNode)); - params.push_back(std::dynamic_pointer_cast(endNode)); - params.push_back(std::dynamic_pointer_cast(strideNode)); + params.push_back(beginNode); + params.push_back(std::dynamic_pointer_cast(endNode)); + params.push_back(std::dynamic_pointer_cast(strideNode)); ss = ngraph::builder::makeStridedSlice(params[0], beginNode, endNode, strideNode, inType, ssParams.beginMask, ssParams.endMask, 
ssParams.newAxisMask, ssParams.shrinkAxisMask, ssParams.ellipsisAxisMask); @@ -150,7 +173,8 @@ const auto cpuParams_ncdhw = CPUSpecificParams {{ncdhw}, {ncdhw}, {}, {}}; const std::vector inputPrecisions = { ElementType::f32, ElementType::bf16, - ElementType::i8 + ElementType::i8, + ElementType::i64 }; const std::vector inputLayerTypes = { @@ -178,12 +202,29 @@ const std::vector paramsPlain2D = { StridedSliceParams{ { 2 }, { 22 }, { 2 }, { 0 }, { 0 }, { }, { }, { } }, }; +const ov::AnyMap additionalConfig; +const std::vector i64Config = { + {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES}}, + {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::NO}} +}; + INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Plain_Static_2D, StridedSliceLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation({{32, 20}})), ::testing::ValuesIn(paramsPlain2D), ::testing::ValuesIn(inputLayerTypes), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), + ::testing::Values(emptyCPUSpec)), + StridedSliceLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Plain_Static_2D_I64, StridedSliceLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation({{32, 20}})), + ::testing::ValuesIn(paramsPlain2D), + ::testing::ValuesIn(inputLayerTypes), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), ::testing::Values(emptyCPUSpec)), StridedSliceLayerCPUTest::getTestCaseName); @@ -193,6 +234,17 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Plain_Dynamic_2D, StridedSliceLay ::testing::ValuesIn(paramsPlain2D), ::testing::ValuesIn(inputLayerTypes), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), + ::testing::Values(emptyCPUSpec)), + StridedSliceLayerCPUTest::getTestCaseName); + 
+INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Plain_Dynamic_2D_I64, StridedSliceLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapesDynamic2D), + ::testing::ValuesIn(paramsPlain2D), + ::testing::ValuesIn(inputLayerTypes), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), ::testing::Values(emptyCPUSpec)), StridedSliceLayerCPUTest::getTestCaseName); @@ -234,6 +286,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_4D, StridedSliceLay ::testing::ValuesIn(testCasesCommon4D), ::testing::ValuesIn(inputLayerTypes), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsCommon4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -243,6 +296,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Dynamic_4D, StridedSliceLa ::testing::ValuesIn(testCasesCommon4D), ::testing::ValuesIn(inputLayerTypes), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsCommon4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -306,6 +360,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_4D_Subset1, Strided ::testing::ValuesIn(testCasesBlocked4DSubset1), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -315,6 +370,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Dynamic_4D_Subset1, Stride ::testing::ValuesIn(testCasesBlocked4DSubset1), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -324,6 +380,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_4D_Subset2, Strided ::testing::ValuesIn(testCasesBlocked4DSubset2), ::testing::ValuesIn(inputLayerTypesBlocked), 
::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -333,6 +390,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Dynamic_4D_Subset2, Stride ::testing::ValuesIn(testCasesBlocked4DSubset2), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -373,6 +431,17 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_5D, StridedSliceLay ::testing::ValuesIn(testCasesCommon5D), ::testing::ValuesIn(inputLayerTypes), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), + ::testing::ValuesIn(CPUParamsCommon5D)), + StridedSliceLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_5D_I64, StridedSliceLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inputShapesStatic5D)), + ::testing::ValuesIn(testCasesCommon5D), + ::testing::ValuesIn(inputLayerTypes), + ::testing::Values(ElementType::i64), + ::testing::ValuesIn(i64Config), ::testing::ValuesIn(CPUParamsCommon5D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -382,6 +451,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Dynamic_5D, StridedSliceLa ::testing::ValuesIn(testCasesCommon5D), ::testing::ValuesIn(inputLayerTypes), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsCommon5D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -444,6 +514,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_5D_Subset1, Strided ::testing::ValuesIn(testCasesBlocked5DSubset1), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked5D)), 
StridedSliceLayerCPUTest::getTestCaseName); @@ -453,6 +524,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Dynamic_5D_Subset1, Stride ::testing::ValuesIn(testCasesBlocked5DSubset1), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked5D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -462,6 +534,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Static_5D_Subset2, Strided ::testing::ValuesIn(testCasesBlocked4DSubset2), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked4D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -471,6 +544,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Common_Dynamic_5D_Subset2, Stride ::testing::ValuesIn(testCasesBlocked5DSubset2), ::testing::ValuesIn(inputLayerTypesBlocked), ::testing::ValuesIn(inputPrecisions), + ::testing::Values(additionalConfig), ::testing::ValuesIn(CPUParamsBlocked5D)), StridedSliceLayerCPUTest::getTestCaseName); @@ -502,6 +576,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_StridedSliceLayerDescriptorCPUTest, StridedSliceL ::testing::ValuesIn(testCasesDescriptors), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), ::testing::Values(ElementType::f32), + ::testing::Values(additionalConfig), ::testing::Values(cpuParams_nChw8c)), StridedSliceLayerDescriptorCPUTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/tile.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/tile.cpp index db6c874ba660a0..de4c741df543b3 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/tile.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/tile.cpp @@ -6,36 +6,38 @@ #include "ngraph_functions/builders.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include +#include using namespace CPUTestUtils; +using 
namespace ov::test; namespace CPULayerTestsDefinitions { using TileLayerTestParamsSet = typename std::tuple< - std::vector, // Input shapes - std::vector, // Repeats - ov::element::Type_t, // Network precision - bool, // Is Repeats input constant - std::string>; // Device name + std::vector, // Input shapes + std::vector, // Repeats + ElementType, // Network precision + bool, // Is Repeats input constant + ov::AnyMap>; // Additional network configuration typedef std::tuple< TileLayerTestParamsSet, CPUSpecificParams> TileLayerCPUTestParamsSet; class TileLayerCPUTest : public testing::WithParamInterface, - virtual public ov::test::SubgraphBaseTest, public CPUTestsBase { + virtual public SubgraphBaseTest, public CPUTestsBase { public: static std::string getTestCaseName(testing::TestParamInfo obj) { TileLayerTestParamsSet basicParamsSet; CPUSpecificParams cpuParams; std::tie(basicParamsSet, cpuParams) = obj.param; - std::vector inputShapes; + std::vector inputShapes; std::vector repeats; - ov::element::Type_t netPrecision; + ElementType netPrecision; bool isRepeatsConst; - std::string deviceName; - std::tie(inputShapes, repeats, netPrecision, isRepeatsConst, deviceName) = basicParamsSet; + ov::AnyMap config; + std::tie(inputShapes, repeats, netPrecision, isRepeatsConst, config) = basicParamsSet; std::ostringstream result; result << "IS=("; @@ -51,7 +53,11 @@ class TileLayerCPUTest : public testing::WithParamInterfaceGetParam(); std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; - std::vector inputShapes; - ov::element::Type_t netPrecision; + std::vector inputShapes; + ElementType netPrecision; bool isRepeatsConst; - std::tie(inputShapes, repeatsData, netPrecision, isRepeatsConst, targetDevice) = basicParamsSet; + std::tie(inputShapes, repeatsData, netPrecision, isRepeatsConst, configuration) = basicParamsSet; - selectedType += std::string("_") + InferenceEngine::details::convertPrecision(netPrecision).name(); + if (netPrecision == ElementType::i64 || 
netPrecision == ElementType::u64) { + auto i64Flag = configuration.find(InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64Flag == configuration.end() || i64Flag->second == InferenceEngine::PluginConfigParams::NO) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i64); + } + } else { + selectedType = makeSelectedTypeStr(selectedType, netPrecision); + } if (inputShapes.front().first.rank() != 0) { inputDynamicShapes.push_back(inputShapes.front().first); @@ -124,10 +141,10 @@ class TileLayerCPUTest : public testing::WithParamInterface netPrecisions = { +const std::vector netPrecisions = { ov::element::f32, ov::element::bf16, ov::element::i32, - ov::element::i8 + ov::element::i8, + ov::element::i64 }; -const std::vector> staticInputShapes4D = { +const std::vector> staticInputShapes4D = { { {{}, { // Static shapes @@ -182,7 +200,7 @@ const std::vector> staticInputShapes4D = { } } }; -const std::vector> dynamicInputShapes4D = { +const std::vector> dynamicInputShapes4D = { { { // Origin dynamic shapes {ov::Dimension(1, 20), ov::Dimension(10, 20), ov::Dimension(1, 20), ov::Dimension(1, 20)}, @@ -204,7 +222,7 @@ const std::vector> dynamicInputShapes4D = { } }; -const std::vector> staticInputShapes5D = { +const std::vector> staticInputShapes5D = { { {{}, { // Static shapes @@ -213,7 +231,7 @@ const std::vector> staticInputShapes5D = { } } }; -const std::vector> dynamicInputShapes5D = { +const std::vector> dynamicInputShapes5D = { { { // Origin dynamic shapes {ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 20), ov::Dimension(1, 70)}, @@ -267,6 +285,12 @@ const std::vector CPUParams5D = { cpuParams_nCdhw8c, cpuParams_ndhwc, }; + +const ov::AnyMap additionalConfig = {}; +const std::vector i64Config = { + {{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::YES}}, + 
{{InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64, InferenceEngine::PluginConfigParams::NO}} +}; /* ============= */ /* INSTANCES */ @@ -277,7 +301,18 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape4D, TileLayerCPUTest, ::testing::ValuesIn(repeats4D), ::testing::ValuesIn(netPrecisions), ::testing::Values(true), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(additionalConfig)), + ::testing::ValuesIn(CPUParams4D)), + TileLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_StaticShape4D_I64, TileLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(staticInputShapes4D), + ::testing::ValuesIn(repeats4D), + ::testing::Values(ElementType::i64), + ::testing::Values(true), + ::testing::ValuesIn(i64Config)), ::testing::ValuesIn(CPUParams4D)), TileLayerCPUTest::getTestCaseName); @@ -288,11 +323,22 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4D, TileLayerCPUTest, ::testing::ValuesIn(repeats4D), ::testing::ValuesIn(netPrecisions), ::testing::Values(true, false), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(additionalConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), TileLayerCPUTest::getTestCaseName); -const std::vector> dynBatchInputShapes4D = { +INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4D_I64, TileLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(dynamicInputShapes4D), + ::testing::ValuesIn(repeats4D), + ::testing::Values(ElementType::i64), + ::testing::Values(true, false), + ::testing::ValuesIn(i64Config)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + TileLayerCPUTest::getTestCaseName); + +const std::vector> dynBatchInputShapes4D = { { // Origin dynamic shapes { {{1, 20}, 16, 3, 4}, @@ -312,7 +358,7 @@ INSTANTIATE_TEST_CASE_P(smoke_DynBatch4D, TileLayerCPUTest, ::testing::Values(std::vector{1, 2, 1, 3}), ::testing::ValuesIn(netPrecisions), ::testing::Values(true), - 
::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(additionalConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), TileLayerCPUTest::getTestCaseName); @@ -323,7 +369,18 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape5D, TileLayerCPUTest, ::testing::ValuesIn(repeats5D), ::testing::ValuesIn(netPrecisions), ::testing::Values(true), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(additionalConfig)), + ::testing::ValuesIn(CPUParams5D)), + TileLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_StaticShape5D_I64, TileLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(staticInputShapes5D), + ::testing::ValuesIn(repeats5D), + ::testing::Values(ElementType::i64), + ::testing::Values(true), + ::testing::ValuesIn(i64Config)), ::testing::ValuesIn(CPUParams5D)), TileLayerCPUTest::getTestCaseName); @@ -334,7 +391,7 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape5D, TileLayerCPUTest, ::testing::ValuesIn(repeats5D), ::testing::ValuesIn(netPrecisions), ::testing::Values(true, false), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(additionalConfig)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), TileLayerCPUTest::getTestCaseName); /* ========= */ diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/topk.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/topk.cpp index 5026bd76becead..a2606a7165356c 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/topk.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/topk.cpp @@ -6,6 +6,8 @@ #include "test_utils/cpu_test_utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "ngraph_functions/builders.hpp" +#include +#include using namespace InferenceEngine; using namespace CPUTestUtils; @@ -88,6 +90,7 @@ class TopKLayerCPUTest : public testing::WithParamInterfaceGetParam(); std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; 
+ selectedType = getPrimitiveType(); int64_t keepK; SortMode mode; @@ -98,14 +101,16 @@ class TopKLayerCPUTest : public testing::WithParamInterface(sortTypeStable); stable = std::get<1>(sortTypeStable); - if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) - inPrc = outPrc = netPrecision = ElementType::bf16; - else - inPrc = outPrc = netPrecision; + if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) { + netPrecision = ElementType::bf16; // TODO: KEY_ENFORCE_BF16 does not work? + selectedType = makeSelectedTypeStr(selectedType, ElementType::bf16); + } else if (netPrecision == ElementType::i64) { + selectedType = makeSelectedTypeStr(selectedType, ElementType::i32); + } else { + selectedType = makeSelectedTypeStr(selectedType, netPrecision); + } configuration.insert(additionalConfig.begin(), additionalConfig.end()); - selectedType = getPrimitiveType() + "_" + InferenceEngine::details::convertPrecision(netPrecision).name(); - staticShape = inputShape.first.rank() == 0; if (staticShape) { init_input_shapes({inputShape}); @@ -133,12 +138,12 @@ class TopKLayerCPUTest : public testing::WithParamInterfaceget_rt_info() = getCPUInfo(); - ngraph::ResultVector results; + ov::ResultVector results; for (size_t i = 0; i < topk->get_output_size(); i++) { results.push_back(std::make_shared(topk->output(i))); } - function = std::make_shared(results, params, "TopK"); + function = std::make_shared(results, params, "TopK"); } void generate_inputs(const std::vector& targetInputStaticShapes) override { @@ -156,7 +161,7 @@ class TopKLayerCPUTest : public testing::WithParamInterface data(size); // For int32, deliberately set big numbers which are not accurately representable in fp32 @@ -170,14 +175,17 @@ class TopKLayerCPUTest : public testing::WithParamInterface(tensor.data()); + auto rawBlobDataPtr = tensor.data(); for (size_t i = 0; i < size; ++i) { rawBlobDataPtr[i] = static_cast(data[i]); } - } else { - auto 
*rawBlobDataPtr = static_cast(tensor.data()); + } else if (netPrecision == ElementType::i32) { + auto rawBlobDataPtr = static_cast(tensor.data()); + std::copy(data.begin(), data.end(), rawBlobDataPtr); + } else if (netPrecision == ElementType::i64) { + auto *rawBlobDataPtr = tensor.data(); for (size_t i = 0; i < size; ++i) { - rawBlobDataPtr[i] = static_cast(data[i]); + rawBlobDataPtr[i] = static_cast(data[i]); } } } else if (netPrecision == ElementType::bf16) { @@ -351,6 +359,21 @@ INSTANTIATE_TEST_CASE_P(smoke_TopK_int32_dynamic, TopKLayerCPUTest, ::testing::Values(additionalConfig[0])), TopKLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_TopK_i64, TopKLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(k_int32), + ::testing::ValuesIn(axes), + ::testing::ValuesIn(modes), + ::testing::ValuesIn(sortTypeStable), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapes_int32)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams)), + ::testing::Values(additionalConfig[0])), + TopKLayerCPUTest::getTestCaseName); + std::vector inputShapes_bubble_BLK_on_channel_horiz = { {{}, {{2, 2, 2, 2}}}, }; diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp index acfac9a31278a6..e602d9ee6e32d0 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp @@ -6,6 +6,7 @@ #include "ngraph_functions/builders.hpp" #include "test_utils/cpu_test_utils.hpp" #include +#include using namespace CPUTestUtils; using namespace ov::test; @@ -18,7 +19,7 @@ typedef std::tuple< bool, // Sorted ElementType, // Data precision CPUSpecificParams, // CPU specific params - std::map // Additional config + ov::AnyMap // Additional config > 
UniqueLayerTestCPUParams; class UniqueLayerTestCPU : public testing::WithParamInterface, @@ -30,7 +31,7 @@ class UniqueLayerTestCPU : public testing::WithParamInterface additionalConfig; + ov::AnyMap additionalConfig; std::tie(inputShapes, flatOrAxis, sorted, dataPrecision, cpuParams, additionalConfig) = obj.param; @@ -59,9 +60,9 @@ class UniqueLayerTestCPU : public testing::WithParamInterface +#include +#include +#include +#include +#include +#include + +template +bool evaluate(const std::shared_ptr& op, const ov::HostTensorVector& outputs, const ov::HostTensorVector& inputs) { + using T = typename ov::element_type_traits::value_type; + + const auto axes_vector = host_tensor_2_vector(inputs[1]); + const auto normalized_axes = ov::normalize_axes(op->get_friendly_name(), axes_vector, inputs[0]->get_partial_shape().rank()); + const auto reduction_axes = ov::AxisSet{normalized_axes}; + + ngraph::runtime::reference::reduce_l1(inputs[0]->get_data_ptr(), + outputs[0]->get_data_ptr(), + inputs[0]->get_shape(), + reduction_axes); + return true; +} + +template +bool evaluate(const std::shared_ptr& op, const ov::HostTensorVector& outputs, const ov::HostTensorVector& inputs) { + using T = typename ov::element_type_traits::value_type; + + const auto axes_vector = host_tensor_2_vector(inputs[1]); + const auto normalized_axes = ov::normalize_axes(op->get_friendly_name(), axes_vector, inputs[0]->get_partial_shape().rank()); + const auto reduction_axes = ov::AxisSet{normalized_axes}; + + ngraph::runtime::reference::reduce_l2(inputs[0]->get_data_ptr(), + outputs[0]->get_data_ptr(), + inputs[0]->get_shape(), + reduction_axes); + return true; +} + +template <> +bool evaluate_node(std::shared_ptr node, + const ov::HostTensorVector& outputs, + const ov::HostTensorVector& inputs) { + const ov::element::Type_t element_type = node->get_output_element_type(0); + auto reduce_node = ov::as_type_ptr(node); + + switch (element_type) { + case ov::element::Type_t::i64: + return 
evaluate(reduce_node, outputs, inputs); + default: + OPENVINO_THROW(std::string("Unhandled data type ") + node->get_element_type().get_type_name() + + std::string("in evaluate_node()")); + } +} + +template <> +bool evaluate_node(std::shared_ptr node, + const ov::HostTensorVector& outputs, + const ov::HostTensorVector& inputs) { + const ov::element::Type_t element_type = node->get_output_element_type(0); + auto reduce_node = ov::as_type_ptr(node); + + switch (element_type) { + case ov::element::Type_t::i64: + return evaluate(reduce_node, outputs, inputs); + default: + OPENVINO_THROW(std::string("Unhandled data type ") + node->get_element_type().get_type_name() + + std::string("in evaluate_node()")); + } +} diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp index f65bf093a800cb..cefd1602fc4947 100644 --- a/src/plugins/template/backend/opset_int_tbl.hpp +++ b/src/plugins/template/backend/opset_int_tbl.hpp @@ -82,6 +82,8 @@ _OPENVINO_OP_REG(CTCLoss, op::v4) _OPENVINO_OP_REG(LSTMCell, op::v4) _OPENVINO_OP_REG(NonMaxSuppression, op::v4) _OPENVINO_OP_REG(Proposal, op::v4) +_OPENVINO_OP_REG(ReduceL1, op::v4) +_OPENVINO_OP_REG(ReduceL2, op::v4) _OPENVINO_OP_REG(BatchNormInference, op::v5) _OPENVINO_OP_REG(GatherND, op::v5) diff --git a/src/tests/ngraph_helpers/ngraph_functions/src/non_max_suppression.cpp b/src/tests/ngraph_helpers/ngraph_functions/src/non_max_suppression.cpp index 94d29d88869bfa..b8416fd68a8683 100644 --- a/src/tests/ngraph_helpers/ngraph_functions/src/non_max_suppression.cpp +++ b/src/tests/ngraph_helpers/ngraph_functions/src/non_max_suppression.cpp @@ -19,8 +19,12 @@ std::shared_ptr makeNms(const ngraph::Output& boxes, const bool& isCenter, const bool& sortResDescend, const ngraph::element::Type& outType) { - auto maxOutBoxesPerClassNode = - makeConstant(maxBoxesPrec, ngraph::Shape{}, std::vector{maxOutBoxesPerClass})->output(0); + std::shared_ptr maxOutBoxesPerClassNode; + if (maxBoxesPrec == 
element::i64) { + maxOutBoxesPerClassNode = makeConstant(maxBoxesPrec, ngraph::Shape{}, std::vector{maxOutBoxesPerClass}); + } else { + maxOutBoxesPerClassNode = makeConstant(maxBoxesPrec, ngraph::Shape{}, std::vector{maxOutBoxesPerClass}); + } auto iouThrNode = makeConstant(thrPrec, ngraph::Shape{}, std::vector{iouThr})->output(0); auto scoreThrNode = makeConstant(thrPrec, ngraph::Shape{}, std::vector{scoreThr})->output(0); auto softNmsSigmaNode = makeConstant(thrPrec, ngraph::Shape{}, std::vector{softNmsSigma})->output(0); @@ -30,7 +34,7 @@ std::shared_ptr makeNms(const ngraph::Output& boxes, return std::make_shared(boxes, scores, - maxOutBoxesPerClassNode, + maxOutBoxesPerClassNode->output(0), iouThrNode, scoreThrNode, softNmsSigmaNode,