diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index e358d24e5e06..65d4473cbebf 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -166,12 +166,24 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize, return false; } - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) && - !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { - return false; - } - - if (srcDesc->getPrecision() != ov::element::f32) { + // BF16 dynamic-quant path requires native x86 BF16 HW support (AVX512_BF16) AND an + // AVX512-VNNI impl in oneDNN (only avx512_core_vnni instance is registered for bf16 src). + const auto srcPrecision = srcDesc->getPrecision(); + if (srcPrecision == ov::element::bf16) { + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) { + return false; + } + // On AMX-capable HW, AMX BF16 TMUL outperforms VNNI int8 dyn-quant for bf16 src. + // Disable the bf16 dyn-quant entry here so the AMX BF16 path stays in use. + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + return false; + } + } else if (srcPrecision == ov::element::f32) { + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) && + !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { + return false; + } + } else { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index aa317305d77b..6d9d2d75319e 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -94,9 +94,14 @@ ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() return {Type_t::f32, Type_t::f16}; } #if defined(OPENVINO_ARCH_X86_64) - // @todo enable for bf16 as well - // after EnforceInferencePrecision is replaced with ConvertPrecision - return {Type_t::f32}; + // BF16 compressed-activations path is intended for SIMD (avx512_vnni) + // dynamic-quant kernels. On AMX-capable HW, AMX BF16 TMUL outperforms + // VNNI int8 on prefill, so keep f32 here and let the existing AMX BF16 + // path handle bf16 inference precision. + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + return {Type_t::f32}; + } + return {Type_t::f32, Type_t::bf16}; #elif defined(OV_CPU_WITH_KLEIDIAI) return {Type_t::f32}; #else diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp index aa3dbe620052..596303c5ba44 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp @@ -3,6 +3,7 @@ // #include "matmul_weights_decompression.hpp" + #include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" @@ -11,10 +12,19 @@ using namespace CPUTestUtils; namespace ov { namespace test { -std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParamInfo& obj) { - const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose, - decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config, - fusing_params, should_fuse] = obj.param; +std::string MatmulWeightsDecompression::getTestCaseName( + const testing::TestParamInfo& obj) { + const auto& [shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose, + decompression_multiply_type, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse] = obj.param; std::ostringstream result; result << shape_params << "_"; result << "weights_precision=" << weights_precision << "_"; @@ -36,53 +46,67 @@ std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParam return result.str(); } -std::shared_ptr MatmulWeightsDecompression::initSubgraph(const ov::PartialShape& data_shape, - const ov::Shape& weights_shape, - const int group_size, - const ov::element::Type data_precision, - const ov::element::Type weights_precision, - const ov::element::Type decompression_precision, - const ov::element::Type scale_precision, - const bool transpose_weights, - const ov::test::utils::DecompressionType decompression_multiply_type, - const ov::test::utils::DecompressionType decompression_subtract_type, - const bool reshape_on_decompression) { +std::shared_ptr MatmulWeightsDecompression::initSubgraph( + const ov::PartialShape& data_shape, + const ov::Shape& weights_shape, + const int group_size, + const ov::element::Type data_precision, + const ov::element::Type weights_precision, + const ov::element::Type decompression_precision, + const ov::element::Type scale_precision, + const bool transpose_weights, + const ov::test::utils::DecompressionType decompression_multiply_type, + const ov::test::utils::DecompressionType decompression_subtract_type, + const bool reshape_on_decompression) { ov::ParameterVector params{std::make_shared(data_precision, data_shape)}; const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape, - group_size, - data_precision, - weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_multiply_type, - decompression_subtract_type, - reshape_on_decompression); + group_size, + data_precision, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_multiply_type, + decompression_subtract_type, + reshape_on_decompression); auto matMul = std::make_shared(params[0], weights_subgraph); return create_ov_model(data_precision, params, matMul, "MatmulWeightsDecompression"); } void MatmulWeightsDecompression::SetUp() { targetDevice = ov::test::utils::DEVICE_CPU; - const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose_weights, - decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config, - fusing_params, should_fuse] = GetParam(); + const auto& [shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_multiply_type, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse] = GetParam(); configuration.insert(additional_config.begin(), additional_config.end()); std::tie(postOpMgrPtr, fusedOps) = fusing_params; init_input_shapes({shape_params.data_shape}); + const bool dyn_quant_enabled = configuration.count(ov::hint::dynamic_quantization_group_size.name()) && + configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0; + const auto inference_precision_hint = + configuration.count(ov::hint::inference_precision.name()) + ? configuration.at(ov::hint::inference_precision.name()).as() + : ov::element::dynamic; + if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) { abs_threshold = 5e-3; } // if dynamic quantization is enabled - if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && - configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { - abs_threshold = 0.1; + if (dyn_quant_enabled) { + abs_threshold = inference_precision_hint == ov::element::bf16 ? 0.2 : 0.1; } - if (configuration.count(ov::hint::inference_precision.name()) && - configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) { + if (inference_precision_hint == ov::element::f16) { abs_threshold = 0.2; } @@ -118,9 +142,8 @@ void MatmulWeightsDecompression::check_results() { type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); EXPECT_EQ(type, "FullyConnected"); - const auto& expected_weights_precision = use_matmul_decompression_impl - ? compressed_weights_precision - : fc->get_input_element_type(0); + const auto& expected_weights_precision = + use_matmul_decompression_impl ? compressed_weights_precision : fc->get_input_element_type(0); EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision); } diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp index 114c3ab8ebe7..0fc457682c74 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp @@ -3,9 +3,9 @@ // #include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "shared_test_classes/subgraph/weights_decompression_params.hpp" -#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp" #include "utils/cpu_test_utils.hpp" #include "utils/fusing_test_utils.hpp" @@ -42,14 +42,14 @@ namespace test { * Bias */ typedef std::tuple // should use decompression implementation MatmulWeightsDecompressionParams; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp index 9903d149efba..0f870a141eeb 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp @@ -302,6 +302,22 @@ std::vector filter_additional_config_dyn_quant() { return additional_config; } +std::vector filter_additional_config_dyn_quant_bf16() { + // Drive the BF16 dynamic-quant compressed-FC path through the inference_precision + // hint on top of an f32 IR. The ConvertPrecision pipeline is responsible for + // adjusting the decompression chain to bf16; the test should not pre-bake bf16 + // into the IR. + std::vector additional_config = {}; + if (ov::with_cpu_x86_bfloat16() && !ov::with_cpu_x86_avx512_core_amx()) { + additional_config = { + {ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}, + {ov::hint::dynamic_quantization_group_size(16), ov::hint::inference_precision(ov::element::bf16)}, + {ov::hint::dynamic_quantization_group_size(128), ov::hint::inference_precision(ov::element::bf16)}, + }; + } + return additional_config; +} + INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes, MatmulWeightsDecompression, ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant), @@ -317,6 +333,21 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_gro ::testing::Values(true)), MatmulWeightsDecompression::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_bf16, + MatmulWeightsDecompression, + ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant), + ::testing::ValuesIn(weights_precisions_dyn_quant), + ::testing::ValuesIn(decompression_precisions), + ::testing::Values(ov::element::dynamic), + ::testing::Values(true), + ::testing::Values(DecompressionType::full), + ::testing::ValuesIn(decompression_subtract_type), + ::testing::Values(false), + ::testing::ValuesIn(filter_additional_config_dyn_quant_bf16()), + ::testing::ValuesIn(fusing_params_dyn_quant), + ::testing::Values(true)), + MatmulWeightsDecompression::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_u2, MatmulWeightsDecompression, ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant_u2), diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index 587b6d7ae4d7..f82d833de6f1 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit 587b6d7ae4d77b0a2b438c23e757ccfd352e5bed +Subproject commit f82d833de6f13fac4bb1926d521ca8fec4f4ae01