From 68e056d6b219e4c1a98ae1518e575744c5935a1b Mon Sep 17 00:00:00 2001
From: alexey-varyzgin <alexey.varyzgin@intel.com>
Date: Fri, 1 Apr 2022 12:58:42 +0300
Subject: [PATCH] [CPU][BF16] Functional failure fixes for 2022 R2

---
 src/plugins/intel_cpu/src/graph.cpp           |  2 +-
 src/plugins/intel_cpu/src/graph_optimizer.cpp | 35 +++++++++++--------
 src/plugins/intel_cpu/src/nodes/deconv.cpp    |  3 ++
 src/plugins/intel_cpu/thirdparty/mkl-dnn      |  2 +-
 4 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index 4ad70ceaae30f8..5d463f478e7e07 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -1361,7 +1361,7 @@ void Graph::EnforceBF16() {
         if (nodesToSkip.count(node) && !node->enforceBF16evenForGraphTail)
             continue;
 
-        if (node->getType() != Type::Input && node->getType() != Type::Output) {
+        if (!ov::intel_cpu::one_of(node->getType(), Type::Input, Type::Output, Type::MemoryInput, Type::MemoryOutput)) {
             for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
                 const auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
                 /* Skip BF16 enforcement for nodes after Constant Inputs for maintaining precision for fusing.
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index ab32a7272e74db..3c2ad3fd50cebf 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -289,6 +289,20 @@ void GraphOptimizer::FuseConvolutionMatMulAndBias(Graph &graph) {
     }
 }
 
+/**
+ * @todo FQ fusing was disabled for BF16 output since oneDNN primitives lack support
+ *       for bf16 depthwise postops.
+ *       This is not the case anymore, because after migration to oneDNN 2.3 FQ will be fused as
+ *       multiple binary post ops.
+ *       This check can already be removed for FC fusing, but should be kept for Convolution,
+ *       which still uses legacy depthwise postops for performance reasons.
+ */
+static bool BF16QuantizeNodeFusing(const NodePtr& parentNode, const NodePtr& childNode) {
+    return childNode->getType() == Type::FakeQuantize &&
+        one_of(Precision::BF16,
+            parentNode->getOriginalOutputPrecisionAtPort(0),
+            childNode->getOriginalOutputPrecisionAtPort(0));
+}
 void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) {
     auto& graphNodes = graph.GetNodes();
 
@@ -328,6 +342,12 @@ void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) {
             continue;
         }
 
+        // Do not fuse a FakeQuantize child when the parent or child output precision is BF16
+        if (BF16QuantizeNodeFusing(parentNode, childNode)) {
+            parent++;
+            continue;
+        }
+
         childNode->fuseInto(parentNode);
 
         auto parentEdges = childNode->parentEdges;
@@ -715,21 +735,6 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) {
     }
 }
 
-/**
- * @todo FQ fusing was disabled for BF16 output since oneDNN primitives lack support
- *       for bf16 depthwise postops.
- *       This is not the case anymore, because after migration to oneDNN 2.3 FQ will be fused as
- *       multiple binary post ops.
- *       This check can already be removed for FC fusing, but should be kept for Convolution,
- *       which still uses legacy depthwise postops for performance reasons.
- */
-static bool BF16QuantizeNodeFusing(const NodePtr& parentNode, const NodePtr& childNode) {
-    return childNode->getType() == Type::FakeQuantize &&
-        one_of(Precision::BF16,
-            parentNode->getOriginalOutputPrecisionAtPort(0),
-            childNode->getOriginalOutputPrecisionAtPort(0));
-}
-
 void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) {
     auto& graphNodes = graph.GetNodes();
 
diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp
index a6fc40bc33ad28..717137a9506a36 100644
--- a/src/plugins/intel_cpu/src/nodes/deconv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp
@@ -295,6 +295,9 @@ void Deconvolution::getSupportedDescriptors() {
        inputDataType = outputDataType = memory::data_type::bf16;
     if (!fusedWith.empty()) {
         outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0));
+        // TODO: We have to extend jit_avx512_core_x8s8s32x_deconv_fwd_kernel from oneDNN to support BF16 output data type
+        if (isInt8 && outputDataType == memory::data_type::bf16)
+            outputDataType = memory::data_type::f32;
     }
 
     if (getParentEdges().size() != 2 && getParentEdges().size() != 3)
diff --git a/src/plugins/intel_cpu/thirdparty/mkl-dnn b/src/plugins/intel_cpu/thirdparty/mkl-dnn
index 82ca2f931c1d58..ede2dfc7f8df8c 160000
--- a/src/plugins/intel_cpu/thirdparty/mkl-dnn
+++ b/src/plugins/intel_cpu/thirdparty/mkl-dnn
@@ -1 +1 @@
-Subproject commit 82ca2f931c1d588b67d154d873136d4af1ffb3a8
+Subproject commit ede2dfc7f8df8c0eb0e0945851dbd8ec4666ac5c