From 5fc6d012fc791e47d83c8cfa482d5aab8f3c17b7 Mon Sep 17 00:00:00 2001 From: Luwei Zhou Date: Wed, 27 Sep 2023 05:39:52 +0200 Subject: [PATCH 1/4] RNN not specify weight layout when creating primitive descriptor. RNN weight expose planar layout to cpu graph. --- src/plugins/intel_cpu/src/nodes/rnn.cpp | 43 ++++++++----------------- src/plugins/intel_cpu/src/nodes/rnn.h | 3 -- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp index 9992f0f392b8..3f1f93d50bfa 100644 --- a/src/plugins/intel_cpu/src/nodes/rnn.cpp +++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp @@ -610,9 +610,11 @@ void RNN::fillCellDesc() { inCandidate.emplace_back(std::make_shared(shapeS, inDataTypes[cIdx], memory::format_tag::nc)); outCandidate.emplace_back(std::make_shared(shapeS, outDataTypes[coIdx], memory::format_tag::nc)); } - + // The weight and weights_iter would expose nc layout to avoid unnecessary reorder. + // The onednn would determine the final layout when prepareParams. inCandidate.emplace_back(std::make_shared(WShape, inDataTypes[wIdx], memory::format_tag::nc)); inCandidate.emplace_back(std::make_shared(RShape, inDataTypes[rIdx], memory::format_tag::nc)); + inCandidate.emplace_back(std::make_shared(BShape, inDataTypes[bIdx], memory::format_tag::x)); if (haveAttention(cell_type)) { @@ -715,8 +717,11 @@ void RNN::fillSequenceDesc() { } inCandidate.emplace_back(std::make_shared(TShape, inDataTypes[sIdx], memory::format_tag::x)); // sequence lengths - inCandidate.emplace_back(std::make_shared(WShape, inDataTypes[wIdx], memory::format_tag::ntc)); // W - inCandidate.emplace_back(std::make_shared(RShape, inDataTypes[rIdx], memory::format_tag::ntc)); // R + // The weight and weights_iter would expose tnc layout to avoid unnecessary reorder. + // The onednn would determine the final layout when prepareParams. 
+ inCandidate.emplace_back(std::make_shared(WShape, inDataTypes[wIdx], memory::format_tag::tnc)); // W + inCandidate.emplace_back(std::make_shared(RShape, inDataTypes[rIdx], memory::format_tag::tnc)); // R + inCandidate.emplace_back(std::make_shared(BShape, inDataTypes[bIdx], memory::format_tag::nc)); // B if (haveAttention(cell_type)) { @@ -885,9 +890,6 @@ void RNN::copyWeightsData() { if (dataType == memory::data_type::bf16) { fillWeights(gate_map, wIdx, rIdx); } else if (dataType == memory::data_type::f32) { - // WA To avoid different weights layer and iter formats in FP32 case - if (T.minVal > 1 || N.maxVal < optimalBatchSize) - wFormat = dnnl::memory::format_tag::ldigo; fillWeights(gate_map, wIdx, rIdx); } else if (dataType == memory::data_type::u8 || dataType == memory::data_type::s8) { fillWeights(gate_map, wIdx, rIdx); @@ -1026,9 +1028,11 @@ void RNN::createDescriptor(const std::vector &inputDesc, since internalBlobs are used for the execution, not the initial weights */ const auto& targetWeightDataType = weightsByinputDataType.at(inDataTypes[xIdx]); auto weightsDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, DC, G, SC }); - wDescs[0] = dnnl::memory::desc(weightsDims, targetWeightDataType, wFormat); + //onednn determines the preferred weight layout. + wDescs[0] = dnnl::memory::desc(weightsDims, targetWeightDataType, memory::format_tag::any); auto statesDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, SC, G, SC }); - wDescs[1] = dnnl::memory::desc(statesDims, targetWeightDataType, wFormat); + //onednn determines the preferred weights_iter layout. 
+ wDescs[1] = dnnl::memory::desc(statesDims, targetWeightDataType, memory::format_tag::any); auto biasDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, Gb, SC }); wDescs[2] = dnnl::memory::desc(biasDims, inDataTypes[bIdx], memory::format_tag::ldgo); @@ -1053,7 +1057,7 @@ void RNN::createDescriptor(const std::vector &inputDesc, config.outConfs.push_back(dataConfig); } - supportedPrimitiveDescriptors.emplace_back(config, ref_any); + supportedPrimitiveDescriptors.emplace_back(config, parse_impl_name(descs[0].impl_info_str())); } Node::AttrPtr RNN::initPrimitiveAttr() { @@ -1103,27 +1107,6 @@ void RNN::prepareParams() { inDataDescs[2] = std::make_shared(Shape{SL, B, 1}, inDataTypes[aIdx], memory::format_tag::tnc); } - bool wFormatWasChanged = false; - // WA To avoid different weights layer and iter formats in FP32 case. - if (one_of(inDataTypes[xIdx], memory::data_type::f32) && - (SL != 1 || B < optimalBatchSize)) { - if (wFormat != dnnl::memory::format_tag::ldigo) { - wFormat = dnnl::memory::format_tag::ldigo; - wFormatWasChanged = true; - } - } else if (wFormat != dnnl::memory::format_tag::any) { - wFormat = dnnl::memory::format_tag::any; - wFormatWasChanged = true; - } - - if (wFormatWasChanged) { - auto weightsDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, DC, G, SC }); - const auto& targetWeightDataType = weightsByinputDataType.at(inDataTypes[xIdx]); - wDescs[0] = dnnl::memory::desc(weightsDims, targetWeightDataType, wFormat); - auto statesDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, SC, G, SC }); - wDescs[1] = dnnl::memory::desc(statesDims, targetWeightDataType, wFormat); - } - const auto attr = initPrimitiveAttr(); RNNKey key = { inDataDescs, outDataDescs, wDescs, cell_type, cell_act, direction, *attr }; diff --git a/src/plugins/intel_cpu/src/nodes/rnn.h b/src/plugins/intel_cpu/src/nodes/rnn.h index d16bcd10c507..692b52108ad6 100644 --- a/src/plugins/intel_cpu/src/nodes/rnn.h +++ 
b/src/plugins/intel_cpu/src/nodes/rnn.h @@ -105,9 +105,6 @@ class RNN : public Node { /** activation type for vanilla RNN cell */ dnnl::algorithm cell_act = dnnl::algorithm::undef; - /** Weights data and state memory format: ldigo or any */ - dnnl::memory::format_tag wFormat = dnnl::memory::format_tag::any; - struct Interval { Interval() = default; From db6cff68729e0c2f7975bb753126720a4d6ff0f4 Mon Sep 17 00:00:00 2001 From: Luwei Zhou Date: Wed, 27 Sep 2023 08:59:14 +0200 Subject: [PATCH 2/4] WA to restore ref_any to pass tests. --- src/plugins/intel_cpu/src/nodes/rnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/nodes/rnn.cpp b/src/plugins/intel_cpu/src/nodes/rnn.cpp index 3f1f93d50bfa..7f305cbacb5a 100644 --- a/src/plugins/intel_cpu/src/nodes/rnn.cpp +++ b/src/plugins/intel_cpu/src/nodes/rnn.cpp @@ -1057,7 +1057,7 @@ void RNN::createDescriptor(const std::vector &inputDesc, config.outConfs.push_back(dataConfig); } - supportedPrimitiveDescriptors.emplace_back(config, parse_impl_name(descs[0].impl_info_str())); + supportedPrimitiveDescriptors.emplace_back(config, ref_any); } Node::AttrPtr RNN::initPrimitiveAttr() { From 4356738184c7b63b96bdc765018d85be62e8f4ad Mon Sep 17 00:00:00 2001 From: Luwei Zhou Date: Wed, 27 Sep 2023 10:57:08 +0200 Subject: [PATCH 3/4] Remove the fusing Reshape + FC transformation in CPU plugin. 
--- .../common/pass/reshape_fc_fusion.cpp | 76 ------------------- .../common/pass/reshape_fc_fusion.hpp | 19 ----- .../convert_to_cpu_specific_opset.hpp | 4 - 3 files changed, 99 deletions(-) delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp deleted file mode 100644 index bacc0bbef6e7..000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "reshape_fc_fusion.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" -#include -#include -#include -#include -#include - -#include "itt.hpp" - -ov::intel_cpu::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() { - MATCHER_SCOPE(ReshapeFullyConnectedFusion); - auto m_reshape = ngraph::pattern::wrap_type({ngraph::pattern::any_input(ov::pass::pattern::has_static_shape()), - ngraph::pattern::any_input()}, - ngraph::pattern::has_static_shape()); - ngraph::OutputVector fcInputs = {m_reshape, ngraph::pattern::any_input()}; - auto fc = ngraph::pattern::wrap_type(fcInputs, ngraph::pattern::has_static_shape()); - - ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher &m) { - auto fc = std::dynamic_pointer_cast(m.get_match_root()); - if (!fc) - return false; - auto reshape = std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(0)); - if (!reshape) - return false; - - // Check that Reshape reshapes 4D tensor to 2D or input shape = output shape - auto shape_in = reshape->input_value(0).get_shape(); - auto shape_out = reshape->get_shape(); - if (!((shape_in.size() == 4 && 
reshape->get_shape().size() == 2) || (shape_in == shape_out && !shape_in.empty()))) { - return false; - } - - // Check that Weights[O, C*H*W] consistent with Input[N, C, H, W] - auto shape_w = fc->input_value(1).get_shape(); - if (shape_in[0] != shape_out[0] || std::accumulate(shape_in.begin() + 1, shape_in.end(), size_t{1}, std::multiplies()) != shape_w[1]) { - return false; - } - - ngraph::NodeVector new_ops; - auto weightInput = fc->input(1).get_source_output(); - ngraph::Shape newWeightsShape; - const auto outShape = fc->get_shape(); - if (shape_in.size() == 3) { - newWeightsShape = ngraph::Shape({outShape[2], shape_in[2]}); - } else { - newWeightsShape.push_back(outShape[1]); - for (size_t i = 1; i < shape_in.size(); i++) - newWeightsShape.push_back(shape_in[i]); - } - - if (newWeightsShape != weightInput.get_shape()) { - auto newShape = std::make_shared(ngraph::element::i64, ngraph::Shape{newWeightsShape.size()}, newWeightsShape); - weightInput = std::make_shared(weightInput, newShape, true); - new_ops.push_back(weightInput.get_node_shared_ptr()); - } - - std::shared_ptr new_fc = std::make_shared( - reshape->input_value(0), - weightInput, - ngraph::Rank(outShape.size()), - fc->output(0).get_element_type()); - new_ops.push_back(new_fc); - new_fc->set_friendly_name(fc->get_friendly_name()); - ngraph::copy_runtime_info({reshape, fc}, new_ops); - ngraph::replace_node(fc, new_fc); - return true; - }; - - auto m = std::make_shared(fc, matcher_name); - register_matcher(m, callback); -} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp deleted file mode 100644 index 61c163533561..000000000000 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - 
-namespace ov { -namespace intel_cpu { - -class ReshapeFullyConnectedFusion : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("ReshapeFullyConnectedFusion", "0"); - ReshapeFullyConnectedFusion(); -}; - -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 3e2158e7a383..c3e052cbda14 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -5,7 +5,6 @@ #include #include "ngraph/op/fake_quantize.hpp" #include "ngraph/pass/manager.hpp" -#include "common/pass/reshape_fc_fusion.hpp" #include "common/pass/align_matmul_input_ranks.hpp" #include "transformations/common_optimizations/reshape_prelu.hpp" #include "common/pass/convert_broadcast_to_tiles.hpp" @@ -42,9 +41,6 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphF CPU_REGISTER_PASS_COMMON(manager, ConvertToLeakyRelu); CPU_REGISTER_PASS_COMMON(manager, ConvertToSwishCPU); CPU_REGISTER_PASS_COMMON(manager, OptimizeSequenceTransposes); - if (!ov::op::util::has_op_with_type(nGraphFunc)) { - CPU_REGISTER_PASS_COMMON(manager, ReshapeFullyConnectedFusion); - } // after transformation "MoveEltwiseUpThroughDataMov" there can be reshaped sequences that should be eliminated or fused CPU_REGISTER_PASS_COMMON(manager, ov::pass::ReshapeSequenceFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding); From 88f33b7d9644a56e3759c67d07d41f4a766ce95e Mon Sep 17 00:00:00 2001 From: Luwei Zhou Date: Mon, 4 Dec 2023 03:41:31 +0100 Subject: [PATCH 4/4] Restore reshape+fc fusion. 
--- .../common/pass/reshape_fc_fusion.cpp | 76 +++++++++++++++++++ .../common/pass/reshape_fc_fusion.hpp | 19 +++++ .../convert_to_cpu_specific_opset.hpp | 4 + 3 files changed, 99 insertions(+) create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp new file mode 100644 index 000000000000..2606a6f53987 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.cpp @@ -0,0 +1,76 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "reshape_fc_fusion.hpp" +#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include +#include +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" + +#include "itt.hpp" + +ov::intel_cpu::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() { + MATCHER_SCOPE(ReshapeFullyConnectedFusion); + auto m_reshape = ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape()), + ov::pass::pattern::any_input()}, + ov::pass::pattern::has_static_shape()); + ov::OutputVector fcInputs = {m_reshape, ov::pass::pattern::any_input()}; + auto fc = ov::pass::pattern::wrap_type(fcInputs, ov::pass::pattern::has_static_shape()); + + ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher &m) { + auto fc = std::dynamic_pointer_cast(m.get_match_root()); + if (!fc) + return false; + auto reshape = std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(0)); + if (!reshape) + return false; + + // Check that Reshape reshapes 4D tensor to 2D or input shape = output shape + auto shape_in = 
reshape->input_value(0).get_shape(); + auto shape_out = reshape->get_shape(); + if (!((shape_in.size() == 4 && reshape->get_shape().size() == 2) || (shape_in == shape_out && !shape_in.empty()))) { + return false; + } + + // Check that Weights[O, C*H*W] consistent with Input[N, C, H, W] + auto shape_w = fc->input_value(1).get_shape(); + if (shape_in[0] != shape_out[0] || std::accumulate(shape_in.begin() + 1, shape_in.end(), size_t{1}, std::multiplies()) != shape_w[1]) { + return false; + } + + ov::NodeVector new_ops; + auto weightInput = fc->input(1).get_source_output(); + ov::Shape newWeightsShape; + const auto outShape = fc->get_shape(); + if (shape_in.size() == 3) { + newWeightsShape = ov::Shape({outShape[2], shape_in[2]}); + } else { + newWeightsShape.push_back(outShape[1]); + for (size_t i = 1; i < shape_in.size(); i++) + newWeightsShape.push_back(shape_in[i]); + } + + if (newWeightsShape != weightInput.get_shape()) { + auto newShape = std::make_shared(ov::element::i64, ov::Shape{newWeightsShape.size()}, newWeightsShape); + weightInput = std::make_shared(weightInput, newShape, true); + new_ops.push_back(weightInput.get_node_shared_ptr()); + } + + std::shared_ptr new_fc = std::make_shared( + reshape->input_value(0), + weightInput, + ov::Rank(outShape.size()), + fc->output(0).get_element_type()); + new_ops.push_back(new_fc); + new_fc->set_friendly_name(fc->get_friendly_name()); + ov::copy_runtime_info({reshape, fc}, new_ops); + ov::replace_node(fc, new_fc); + return true; + }; + + auto m = std::make_shared(fc, matcher_name); + register_matcher(m, callback); +} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp new file mode 100644 index 000000000000..8bf7026ab198 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/reshape_fc_fusion.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2023 Intel Corporation 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_cpu { + +class ReshapeFullyConnectedFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ReshapeFullyConnectedFusion", "0"); + ReshapeFullyConnectedFusion(); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index e334d11babe0..f0f5c3e44d0d 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -5,6 +5,7 @@ #include "openvino/pass/constant_folding.hpp" #include "openvino/op/fake_quantize.hpp" #include "openvino/pass/manager.hpp" +#include "common/pass/reshape_fc_fusion.hpp" #include "common/pass/align_matmul_input_ranks.hpp" #include "transformations/common_optimizations/reshape_prelu.hpp" #include "common/pass/convert_broadcast_to_tiles.hpp" @@ -41,6 +42,9 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphFunc) { CPU_REGISTER_PASS_COMMON(manager, ConvertToLeakyRelu); CPU_REGISTER_PASS_COMMON(manager, ConvertToSwishCPU); CPU_REGISTER_PASS_COMMON(manager, OptimizeSequenceTransposes); + if (!ov::op::util::has_op_with_type(nGraphFunc)) { + CPU_REGISTER_PASS_COMMON(manager, ReshapeFullyConnectedFusion); + } // after transformation "MoveEltwiseUpThroughDataMov" there can be reshaped sequences that should be eliminated or fused CPU_REGISTER_PASS_COMMON(manager, ov::pass::ReshapeSequenceFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding);