From 89db8484efd5b07c28994e1fe4a8f6bfd07e84b7 Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Mon, 23 Mar 2026 03:59:45 -0400 Subject: [PATCH 1/5] Enable BF16 dynamic quantization path for compressed FC --- .../dnnl/dnnl_fullyconnected_primitive.cpp | 21 ++-- .../intel_cpu/src/nodes/fullyconnected.cpp | 4 +- .../classes/matmul_weights_decompression.cpp | 98 ++++++++++++------- .../classes/matmul_weights_decompression.hpp | 7 ++ .../src/x64/matmul_weights_decompression.cpp | 28 ++++++ src/plugins/intel_cpu/thirdparty/onednn | 2 +- 6 files changed, 117 insertions(+), 43 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index e358d24e5e0626..eb34c835951842 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -41,6 +41,7 @@ #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" +#include "utils/precision_support.h" namespace ov::intel_cpu { @@ -166,12 +167,20 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize, return false; } - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) && - !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { - return false; - } - - if (srcDesc->getPrecision() != ov::element::f32) { + // BF16 dynamic-quant path requires native BF16 HW support AND an AVX512-VNNI + // impl in oneDNN (only avx512_core_vnni instance is registered for bf16 src). + const auto srcPrecision = srcDesc->getPrecision(); + if (srcPrecision == ov::element::bf16) { + if (!hasHardwareSupport(ov::element::bf16) || + !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { + return false; + } + } else if (srcPrecision == ov::element::f32) { + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) && + !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { + return false; + } + } else { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index aa317305d77b2c..e40ad95c68f5a6 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -94,9 +94,7 @@ ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() return {Type_t::f32, Type_t::f16}; } #if defined(OPENVINO_ARCH_X86_64) - // @todo enable for bf16 as well - // after EnforceInferencePrecision is replaced with ConvertPrecision - return {Type_t::f32}; + return {Type_t::f32, Type_t::bf16}; #elif defined(OV_CPU_WITH_KLEIDIAI) return {Type_t::f32}; #else diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp index aa3dbe620052b2..34a79a7303c078 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp @@ -3,6 +3,7 @@ // #include "matmul_weights_decompression.hpp" + #include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" @@ -11,10 +12,19 @@ using namespace CPUTestUtils; namespace ov { namespace test { -std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParamInfo& obj) { - const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose, - decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config, - fusing_params, should_fuse] = obj.param; +std::string MatmulWeightsDecompression::getTestCaseName( + const testing::TestParamInfo& obj) { + const auto& [shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose, + decompression_multiply_type, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse] = obj.param; std::ostringstream result; result << shape_params << "_"; result << "weights_precision=" << weights_precision << "_"; @@ -36,37 +46,46 @@ std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParam return result.str(); } -std::shared_ptr MatmulWeightsDecompression::initSubgraph(const ov::PartialShape& data_shape, - const ov::Shape& weights_shape, - const int group_size, - const ov::element::Type data_precision, - const ov::element::Type weights_precision, - const ov::element::Type decompression_precision, - const ov::element::Type scale_precision, - const bool transpose_weights, - const ov::test::utils::DecompressionType decompression_multiply_type, - const ov::test::utils::DecompressionType decompression_subtract_type, - const bool reshape_on_decompression) { +std::shared_ptr MatmulWeightsDecompression::initSubgraph( + const ov::PartialShape& data_shape, + const ov::Shape& weights_shape, + const int group_size, + const ov::element::Type data_precision, + const ov::element::Type weights_precision, + const ov::element::Type decompression_precision, + const ov::element::Type scale_precision, + const bool transpose_weights, + const ov::test::utils::DecompressionType decompression_multiply_type, + const ov::test::utils::DecompressionType decompression_subtract_type, + const bool reshape_on_decompression) { ov::ParameterVector params{std::make_shared(data_precision, data_shape)}; const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape, - group_size, - data_precision, - weights_precision, - decompression_precision, - scale_precision, - transpose_weights, - decompression_multiply_type, - decompression_subtract_type, - reshape_on_decompression); + group_size, + data_precision, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_multiply_type, + decompression_subtract_type, + reshape_on_decompression); auto matMul = std::make_shared(params[0], weights_subgraph); return create_ov_model(data_precision, params, matMul, "MatmulWeightsDecompression"); } -void MatmulWeightsDecompression::SetUp() { +void MatmulWeightsDecompression::setUpWithDataPrecision(const ov::element::Type data_precision) { targetDevice = ov::test::utils::DEVICE_CPU; - const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose_weights, - decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config, - fusing_params, should_fuse] = GetParam(); + const auto& [shape_params, + weights_precision, + decompression_precision, + scale_precision, + transpose_weights, + decompression_multiply_type, + decompression_subtract_type, + reshape_on_decompression, + additional_config, + fusing_params, + should_fuse] = GetParam(); configuration.insert(additional_config.begin(), additional_config.end()); std::tie(postOpMgrPtr, fusedOps) = fusing_params; init_input_shapes({shape_params.data_shape}); @@ -78,7 +97,7 @@ void MatmulWeightsDecompression::SetUp() { // if dynamic quantization is enabled if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { - abs_threshold = 0.1; + abs_threshold = data_precision == ov::element::bf16 ? 0.2 : 0.1; } if (configuration.count(ov::hint::inference_precision.name()) && @@ -86,7 +105,7 @@ void MatmulWeightsDecompression::SetUp() { abs_threshold = 0.2; } - ElementType netType = ov::element::f32; + const ElementType netType = data_precision; inType = outType = netType; function = initSubgraph(inputDynamicShapes[0], @@ -102,6 +121,14 @@ void MatmulWeightsDecompression::SetUp() { reshape_on_decompression); } +void MatmulWeightsDecompression::SetUp() { + setUpWithDataPrecision(ov::element::f32); +} + +void MatmulWeightsDecompressionBF16::SetUp() { + setUpWithDataPrecision(ov::element::bf16); +} + void MatmulWeightsDecompression::check_results() { const auto& test_param = GetParam(); const ov::element::Type compressed_weights_precision = std::get<1>(test_param); @@ -118,9 +145,8 @@ void MatmulWeightsDecompression::check_results() { type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); EXPECT_EQ(type, "FullyConnected"); - const auto& expected_weights_precision = use_matmul_decompression_impl - ? compressed_weights_precision - : fc->get_input_element_type(0); + const auto& expected_weights_precision = + use_matmul_decompression_impl ? compressed_weights_precision : fc->get_input_element_type(0); EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision); } @@ -130,5 +156,11 @@ TEST_P(MatmulWeightsDecompression, CompareWithRefs) { check_results(); } +TEST_P(MatmulWeightsDecompressionBF16, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); + check_results(); +} + } // namespace test } // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp index 114c3ab8ebe7ff..cf55ba68e2f399 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp @@ -76,6 +76,13 @@ class MatmulWeightsDecompression : public testing::WithParamInterface filter_additional_config_amx() { } const std::vector decompression_precisions = {ov::element::f32}; +const std::vector decompression_precisions_bf16 = {ov::element::bf16}; const std::vector weights_precisions = {ov::element::u8, ov::element::u4, ov::element::i4, @@ -302,6 +303,18 @@ std::vector filter_additional_config_dyn_quant() { return additional_config; } +std::vector filter_additional_config_dyn_quant_bf16() { + std::vector additional_config = {}; + if (ov::with_cpu_x86_bfloat16() && ov::with_cpu_x86_avx512_core_vnni() && !ov::with_cpu_x86_avx512_core_amx()) { + additional_config = { + {{ov::hint::dynamic_quantization_group_size(0)}}, + {{ov::hint::dynamic_quantization_group_size(16)}}, + {{ov::hint::dynamic_quantization_group_size(128)}}, + }; + } + return additional_config; +} + INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes, MatmulWeightsDecompression, ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant), @@ -317,6 +330,21 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_gro ::testing::Values(true)), MatmulWeightsDecompression::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_bf16, + MatmulWeightsDecompressionBF16, + ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant), + ::testing::ValuesIn(weights_precisions_dyn_quant), + ::testing::ValuesIn(decompression_precisions_bf16), + ::testing::Values(ov::element::dynamic), + ::testing::Values(true), + ::testing::Values(DecompressionType::full), + ::testing::ValuesIn(decompression_subtract_type), + ::testing::Values(false), + ::testing::ValuesIn(filter_additional_config_dyn_quant_bf16()), + ::testing::ValuesIn(fusing_params_dyn_quant), + ::testing::Values(true)), + MatmulWeightsDecompression::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_u2, MatmulWeightsDecompression, ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant_u2), diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index 587b6d7ae4d77b..fc147eb569285f 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit 587b6d7ae4d77b0a2b438c23e757ccfd352e5bed +Subproject commit fc147eb569285f7a1479aebcdc327df2bd39aa2e From 0e3f79a04d1575ba6b79f7380d0619b0c5bb291e Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Mon, 11 May 2026 02:08:35 -0400 Subject: [PATCH 2/5] Apply suggestions from code review --- .../executors/dnnl/dnnl_fullyconnected_primitive.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index eb34c835951842..c81acde8ba3d61 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -41,7 +41,6 @@ #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" -#include "utils/precision_support.h" namespace ov::intel_cpu { @@ -167,12 +166,11 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize, return false; } - // BF16 dynamic-quant path requires native BF16 HW support AND an AVX512-VNNI - // impl in oneDNN (only avx512_core_vnni instance is registered for bf16 src). + // BF16 dynamic-quant path requires native x86 BF16 HW support (AVX512_BF16) AND an + // AVX512-VNNI impl in oneDNN (only avx512_core_vnni instance is registered for bf16 src). const auto srcPrecision = srcDesc->getPrecision(); if (srcPrecision == ov::element::bf16) { - if (!hasHardwareSupport(ov::element::bf16) || - !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) { return false; } } else if (srcPrecision == ov::element::f32) { From ebdc7177b57c64fe7fb376a4b7d82b6709fcdbb5 Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Tue, 19 May 2026 03:59:03 -0400 Subject: [PATCH 3/5] [CPU] Apply suggestions from code review * Gate BF16 dyn-quant entry on AMX-capable HW (two layers: node-level getSupportedCompressedActivationsTypes + primitive-level useDynamicQuantizationImpl), since AMX BF16 TMUL handles long prompts (prefill) more efficiently than VNNI int8 dyn-quant. * Drive the BF16 dyn-quant test through the inference_precision hint on an f32 IR; remove the MatmulWeightsDecompressionBF16 subclass and decompression_precisions_bf16. --- .../dnnl/dnnl_fullyconnected_primitive.cpp | 5 +++ .../intel_cpu/src/nodes/fullyconnected.cpp | 7 ++++ .../classes/matmul_weights_decompression.cpp | 33 +++++++------------ .../classes/matmul_weights_decompression.hpp | 25 +++++--------- .../src/x64/matmul_weights_decompression.cpp | 15 +++++---- 5 files changed, 42 insertions(+), 43 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index c81acde8ba3d61..65d4473cbebf0f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -173,6 +173,11 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize, if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) { return false; } + // On AMX-capable HW, AMX BF16 TMUL outperforms VNNI int8 dyn-quant for bf16 src. + // Disable the bf16 dyn-quant entry here so the AMX BF16 path stays in use. + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + return false; + } } else if (srcPrecision == ov::element::f32) { if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) && !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) { diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index e40ad95c68f5a6..6d9d2d75319e3f 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -94,6 +94,13 @@ ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() return {Type_t::f32, Type_t::f16}; } #if defined(OPENVINO_ARCH_X86_64) + // BF16 compressed-activations path is intended for SIMD (avx512_vnni) + // dynamic-quant kernels. On AMX-capable HW, AMX BF16 TMUL outperforms + // VNNI int8 on prefill, so keep f32 here and let the existing AMX BF16 + // path handle bf16 inference precision. + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + return {Type_t::f32}; + } return {Type_t::f32, Type_t::bf16}; #elif defined(OV_CPU_WITH_KLEIDIAI) return {Type_t::f32}; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp index 34a79a7303c078..596303c5ba444d 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp @@ -73,7 +73,7 @@ std::shared_ptr MatmulWeightsDecompression::initSubgraph( return create_ov_model(data_precision, params, matMul, "MatmulWeightsDecompression"); } -void MatmulWeightsDecompression::setUpWithDataPrecision(const ov::element::Type data_precision) { +void MatmulWeightsDecompression::SetUp() { targetDevice = ov::test::utils::DEVICE_CPU; const auto& [shape_params, weights_precision, @@ -90,22 +90,27 @@ void MatmulWeightsDecompression::setUpWithDataPrecision(const ov::element::Type std::tie(postOpMgrPtr, fusedOps) = fusing_params; init_input_shapes({shape_params.data_shape}); + const bool dyn_quant_enabled = configuration.count(ov::hint::dynamic_quantization_group_size.name()) && + configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0; + const auto inference_precision_hint = + configuration.count(ov::hint::inference_precision.name()) + ? configuration.at(ov::hint::inference_precision.name()).as() + : ov::element::dynamic; + if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) { abs_threshold = 5e-3; } // if dynamic quantization is enabled - if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) && - configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) { - abs_threshold = data_precision == ov::element::bf16 ? 0.2 : 0.1; + if (dyn_quant_enabled) { + abs_threshold = inference_precision_hint == ov::element::bf16 ? 0.2 : 0.1; } - if (configuration.count(ov::hint::inference_precision.name()) && - configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) { + if (inference_precision_hint == ov::element::f16) { abs_threshold = 0.2; } - const ElementType netType = data_precision; + ElementType netType = ov::element::f32; inType = outType = netType; function = initSubgraph(inputDynamicShapes[0], @@ -121,14 +126,6 @@ void MatmulWeightsDecompression::setUpWithDataPrecision(const ov::element::Type reshape_on_decompression); } -void MatmulWeightsDecompression::SetUp() { - setUpWithDataPrecision(ov::element::f32); -} - -void MatmulWeightsDecompressionBF16::SetUp() { - setUpWithDataPrecision(ov::element::bf16); -} - void MatmulWeightsDecompression::check_results() { const auto& test_param = GetParam(); const ov::element::Type compressed_weights_precision = std::get<1>(test_param); @@ -156,11 +153,5 @@ TEST_P(MatmulWeightsDecompression, CompareWithRefs) { check_results(); } -TEST_P(MatmulWeightsDecompressionBF16, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED() - run(); - check_results(); -} - } // namespace test } // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp index cf55ba68e2f399..0fc457682c74c1 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp @@ -3,9 +3,9 @@ // #include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "shared_test_classes/subgraph/weights_decompression_params.hpp" -#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp" #include "utils/cpu_test_utils.hpp" #include "utils/fusing_test_utils.hpp" @@ -42,14 +42,14 @@ namespace test { * Bias */ typedef std::tuple // should use decompression implementation MatmulWeightsDecompressionParams; @@ -76,13 +76,6 @@ class MatmulWeightsDecompression : public testing::WithParamInterface filter_additional_config_amx() { } const std::vector decompression_precisions = {ov::element::f32}; -const std::vector decompression_precisions_bf16 = {ov::element::bf16}; const std::vector weights_precisions = {ov::element::u8, ov::element::u4, ov::element::i4, @@ -304,12 +303,16 @@ std::vector filter_additional_config_dyn_quant() { } std::vector filter_additional_config_dyn_quant_bf16() { + // Drive the BF16 dynamic-quant compressed-FC path through the inference_precision + // hint on top of an f32 IR. The ConvertPrecision pipeline is responsible for + // adjusting the decompression chain to bf16; the test should not pre-bake bf16 + // into the IR. std::vector additional_config = {}; if (ov::with_cpu_x86_bfloat16() && ov::with_cpu_x86_avx512_core_vnni() && !ov::with_cpu_x86_avx512_core_amx()) { additional_config = { - {{ov::hint::dynamic_quantization_group_size(0)}}, - {{ov::hint::dynamic_quantization_group_size(16)}}, - {{ov::hint::dynamic_quantization_group_size(128)}}, + {ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}, + {ov::hint::dynamic_quantization_group_size(16), ov::hint::inference_precision(ov::element::bf16)}, + {ov::hint::dynamic_quantization_group_size(128), ov::hint::inference_precision(ov::element::bf16)}, }; } return additional_config; @@ -331,10 +334,10 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_gro MatmulWeightsDecompression::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_bf16, - MatmulWeightsDecompressionBF16, + MatmulWeightsDecompression, ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant), ::testing::ValuesIn(weights_precisions_dyn_quant), - ::testing::ValuesIn(decompression_precisions_bf16), + ::testing::ValuesIn(decompression_precisions), ::testing::Values(ov::element::dynamic), ::testing::Values(true), ::testing::Values(DecompressionType::full), From b2ce2d11abdba012e3ab0c5224a59c01232210a1 Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Sat, 23 May 2026 10:24:25 -0400 Subject: [PATCH 4/5] Apply suggestions from code review --- .../subgraph_tests/src/x64/matmul_weights_decompression.cpp | 2 +- src/plugins/intel_cpu/thirdparty/onednn | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp index 44f08d39f745d1..0f870a141eebb9 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_weights_decompression.cpp @@ -308,7 +308,7 @@ std::vector filter_additional_config_dyn_quant_bf16() { // adjusting the decompression chain to bf16; the test should not pre-bake bf16 // into the IR. std::vector additional_config = {}; - if (ov::with_cpu_x86_bfloat16() && ov::with_cpu_x86_avx512_core_vnni() && !ov::with_cpu_x86_avx512_core_amx()) { + if (ov::with_cpu_x86_bfloat16() && !ov::with_cpu_x86_avx512_core_amx()) { additional_config = { {ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)}, {ov::hint::dynamic_quantization_group_size(16), ov::hint::inference_precision(ov::element::bf16)}, diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index fc147eb569285f..6b31ed2b862ab3 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit fc147eb569285f7a1479aebcdc327df2bd39aa2e +Subproject commit 6b31ed2b862ab3e216d81433db0c7a8b3a0792d0 From 88e17364c1221df1589cc75ee6611381da865724 Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Wed, 27 May 2026 06:15:06 -0400 Subject: [PATCH 5/5] onednn commit message reword --- src/plugins/intel_cpu/thirdparty/onednn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index 6b31ed2b862ab3..f82d833de6f13f 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit 6b31ed2b862ab3e216d81433db0c7a8b3a0792d0 +Subproject commit f82d833de6f13fac4bb1926d521ca8fec4f4ae01