openvinotoolkit · luweizhou2016 · Sep 15, 2023 · Sep 18, 2023 · Sep 27, 2023 · Sep 27, 2023
@@ -616,9 +616,11 @@ void RNN::fillCellDesc() {
         inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(shapeS, inDataTypes[cIdx], memory::format_tag::nc));
         outCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(shapeS, outDataTypes[coIdx], memory::format_tag::nc));
     }
-
+    // Expose weights and weights_iter in the nc layout to avoid an unnecessary reorder.
+    // oneDNN determines the final layout during prepareParams.
     inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(WShape, inDataTypes[wIdx], memory::format_tag::nc));
     inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(RShape, inDataTypes[rIdx], memory::format_tag::nc));
+
     inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(BShape, inDataTypes[bIdx], memory::format_tag::x));
 
     if (haveAttention(cell_type)) {
@@ -721,8 +723,11 @@ void RNN::fillSequenceDesc() {
     }
 
     inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(TShape, inDataTypes[sIdx], memory::format_tag::x)); // sequence lengths
-    inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(WShape, inDataTypes[wIdx], memory::format_tag::ntc)); // W
-    inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(RShape, inDataTypes[rIdx], memory::format_tag::ntc)); // R
+    // Expose weights and weights_iter in the tnc layout to avoid an unnecessary reorder.
+    // oneDNN determines the final layout during prepareParams.
+    inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(WShape, inDataTypes[wIdx], memory::format_tag::tnc)); // W
+    inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(RShape, inDataTypes[rIdx], memory::format_tag::tnc)); // R
+
     inCandidate.emplace_back(std::make_shared<DnnlBlockedMemoryDesc>(BShape, inDataTypes[bIdx], memory::format_tag::nc)); // B
 
     if (haveAttention(cell_type)) {
@@ -891,9 +896,6 @@ void RNN::copyWeightsData() {
     if (one_of(dataType, memory::data_type::bf16, memory::data_type::f16)) {
         fillWeights<uint16_t>(gate_map, wIdx, rIdx);
     } else if (dataType == memory::data_type::f32) {
-        // WA To avoid different weights layer and iter formats in FP32 case
-        if (T.minVal > 1 || N.maxVal < optimalBatchSize)
-            wFormat = dnnl::memory::format_tag::ldigo;
         fillWeights<float>(gate_map, wIdx, rIdx);
     } else if (dataType == memory::data_type::u8 || dataType == memory::data_type::s8) {
         fillWeights<int8_t>(gate_map, wIdx, rIdx);
@@ -1032,9 +1034,11 @@ void RNN::createDescriptor(const std::vector<MemoryDescPtr> &inputDesc,
            since internalBlobs are used for the execution, not the initial weights */
         const auto& targetWeightDataType = weightsByinputDataType.at(inDataTypes[xIdx]);
         auto weightsDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, DC, G, SC });
-        wDescs[0] = dnnl::memory::desc(weightsDims, targetWeightDataType, wFormat);
+        // oneDNN determines the preferred weights layout.
+        wDescs[0] = dnnl::memory::desc(weightsDims, targetWeightDataType, memory::format_tag::any);
         auto statesDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, SC, G, SC });
-        wDescs[1] = dnnl::memory::desc(statesDims, targetWeightDataType, wFormat);
+        // oneDNN determines the preferred weights_iter layout.
+        wDescs[1] = dnnl::memory::desc(statesDims, targetWeightDataType, memory::format_tag::any);
         auto biasDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, Gb, SC });
         wDescs[2] = dnnl::memory::desc(biasDims, inDataTypes[bIdx], memory::format_tag::ldgo);
 
@@ -1109,27 +1113,6 @@ void RNN::prepareParams() {
         inDataDescs[2] = std::make_shared<DnnlBlockedMemoryDesc>(Shape{SL, B, 1}, inDataTypes[aIdx], memory::format_tag::tnc);
     }
 
-    bool wFormatWasChanged = false;
-    // WA To avoid different weights layer and iter formats in FP32 case.
-    if (one_of(inDataTypes[xIdx], memory::data_type::f32) &&
-        (SL != 1 || B < optimalBatchSize)) {
-        if (wFormat != dnnl::memory::format_tag::ldigo) {
-            wFormat = dnnl::memory::format_tag::ldigo;
-            wFormatWasChanged = true;
-        }
-    } else if (wFormat != dnnl::memory::format_tag::any) {
-        wFormat = dnnl::memory::format_tag::any;
-        wFormatWasChanged = true;
-    }
-
-    if (wFormatWasChanged) {
-        auto weightsDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, DC, G, SC });
-        const auto& targetWeightDataType = weightsByinputDataType.at(inDataTypes[xIdx]);
-        wDescs[0] = dnnl::memory::desc(weightsDims, targetWeightDataType, wFormat);
-        auto statesDims = DnnlExtensionUtils::convertToDnnlDims(VectorDims{ L, D, SC, G, SC });
-        wDescs[1] = dnnl::memory::desc(statesDims, targetWeightDataType, wFormat);
-    }
-
     const auto attr = initPrimitiveAttr();
     RNNKey key = { inDataDescs, outDataDescs, wDescs, cell_type, cell_act, direction, *attr };
 

@@ -105,9 +105,6 @@ class RNN : public Node {
     /** activation type for vanilla RNN cell */
     dnnl::algorithm cell_act = dnnl::algorithm::undef;
 
-    /** Weights data and state memory format: ldigo or any */
-    dnnl::memory::format_tag wFormat = dnnl::memory::format_tag::any;
-
     struct Interval {
         Interval() = default;
 

@@ -5,7 +5,6 @@
 #include "openvino/pass/constant_folding.hpp"
 #include "openvino/op/fake_quantize.hpp"
 #include "openvino/pass/manager.hpp"
-#include "common/pass/reshape_fc_fusion.hpp"
 #include "common/pass/align_matmul_input_ranks.hpp"
 #include "transformations/common_optimizations/reshape_prelu.hpp"
 #include "common/pass/convert_broadcast_to_tiles.hpp"
@@ -42,9 +41,6 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ov::Model> &nGraphFunc) {
     CPU_REGISTER_PASS_COMMON(manager, ConvertToLeakyRelu);
     CPU_REGISTER_PASS_COMMON(manager, ConvertToSwishCPU);
     CPU_REGISTER_PASS_COMMON(manager, OptimizeSequenceTransposes);
-    if (!ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(nGraphFunc)) {
-        CPU_REGISTER_PASS_COMMON(manager, ReshapeFullyConnectedFusion);
-    }
     // after transformation "MoveEltwiseUpThroughDataMov" there can be reshaped sequences that should be eliminated or fused
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::ReshapeSequenceFusion);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding);

@@ -342,6 +342,16 @@ std::vector<std::string> disabledTestPatterns() {
         retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16.*)");
         retVector.emplace_back(R"(.*smoke_Snippets_MHAEnforceBF16.*)");
     }
-
+    // RNN/LSTM/GRU/AUGRU BF16 tests fail on the avx512_core ISA when gemm_avx512 falls back to gemm_avx2
+    if (InferenceEngine::with_cpu_x86_avx512_core()&& !InferenceEngine::with_cpu_x86_avx512_core_amx_bf16()) {
+        retVector.emplace_back(R"(smoke.*(AUGRUCellCPUTest|GRUCellCPUTest|LSTMCellLayerCPUTest).CompareWithRefs.*ENFORCE_BF16=YES.*)");
+        retVector.emplace_back(R"(nightly.*bf16.*/(AUGRUSequenceCPUTest|GRUSequenceCPUTest|LSTMSequenceCPUTest|RNNSequenceCPUTest).CompareWithRefs.*ENFORCE_BF16=YES.*)");
+    }
+    if (InferenceEngine::with_cpu_x86_avx512_core_amx_bf16()) {
+        // GRUCell and GRUSequence BF16 tests fail on SPR when gemm_avx512 falls back to gemm_avx2
+        retVector.emplace_back(R"(.*/GRU.*ENFORCE_BF16=YES.*)");
+        // GroupDeconv 3D fails on BF16 when gemm_avx512 falls back to gemm_avx2
+        retVector.emplace_back(R"(nightly_GroupDeconv_3D_Planar_BF16/GroupDeconvolutionLayerCPUTest\.CompareWithRefs/IS=\[\?.12.\?.\?.\?\]_TS=\(\(2.12.7.7.7\)\)_\(\(2.12.5.7.7\)\)_\(\(1.12.9.4.9\)\)_\(\(2.12.5.7.7\)\)_PRC=f32.*S=\(2.2.2\)_PB=\(0.0.0\)_PE=\(0.0.0\)_D=\(1.1.1\)_OP=\(\)_O=6_G.*primitive=jit_gemm.*_ENFORCE_BF16=YES)");
+    }
     return retVector;
 }
@@ -32,6 +32,9 @@ function(ov_add_onednn)
     set(DNNL_CPU_RUNTIME "${THREADING}" CACHE STRING "" FORCE)
     set(DNNL_GPU_RUNTIME "NONE" CACHE STRING "" FORCE)
     set(DNNL_BLAS_VENDOR "NONE" CACHE STRING "" FORCE)
+    # Only build the gemm driver for AVX2 and lower ISAs
+    set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "AVX2" CACHE STRING "" FORCE)
+
     # plugin does not use onednn graph
     set(ONEDNN_BUILD_GRAPH OFF CACHE BOOL "" FORCE)
     # select needed primitives
+16 −1		cmake/configuring_primitive_list.cmake
+2 −0		cmake/dnnl_compat.cmake
+10 −0		cmake/options.cmake
+12 −0		doc/build/build_options.md
+6 −0		include/oneapi/dnnl/dnnl_config.h.in
+327 −0		src/cpu/gemm/bf16/ref_gemm_bf16.cpp
+37 −0		src/cpu/gemm/bf16/ref_gemm_bf16.hpp
+10 −1		src/cpu/gemm/f32/gemm_utils_f32.hpp
+32 −20		src/cpu/gemm/gemm.cpp
+31 −3		src/cpu/gemm/gemm.hpp
+16 −15		src/cpu/gemm/gemm_pack.cpp
+1 −1		src/cpu/rnn/rnn_utils.hpp
+29 −0		src/cpu/x64/CMakeLists.txt
+19 −17		src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.cpp
+64 −63		src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.hpp
+36 −21		src/cpu/x64/gemm/gemm_driver.cpp
+324 −318		src/cpu/x64/gemm/gemm_info.cpp
+2 −2		src/cpu/x64/gemm/gemm_pack.cpp
+1 −1		src/cpu/x64/gemm/gemv_driver.cpp
+8 −1		tests/gtests/in/gemm_in.h