openvinotoolkit · maxnick · May 27, 2026 · Mar 23, 2026 · May 11, 2026 · May 19, 2026
@@ -166,12 +166,24 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize,
         return false;
     }
 
-    if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) &&
-        !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) {
-        return false;
-    }
-
-    if (srcDesc->getPrecision() != ov::element::f32) {
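+    // BF16 dynamic-quant path requires native x86 BF16 HW support (AVX512_BF16) AND an
+    // AVX512-VNNI impl in oneDNN (only the avx512_core_vnni instance is registered for bf16 src).
+    // A single avx512_core_bf16 check covers both: in oneDNN's ISA hierarchy it includes avx512_core_vnni.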
+    const auto srcPrecision = srcDesc->getPrecision();
+    if (srcPrecision == ov::element::bf16) {
+        if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) {
+            return false;
+        }
+        // On AMX-capable HW, AMX BF16 TMUL outperforms VNNI int8 dyn-quant for bf16 src.
+        // Disable the bf16 dyn-quant entry here so the AMX BF16 path stays in use.
+        if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
+            return false;
+        }
+    } else if (srcPrecision == ov::element::f32) {
+        if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) &&
+            !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) {
+            return false;
+        }
+    } else {
         return false;
     }
 

@@ -94,9 +94,14 @@ ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes()
         return {Type_t::f32, Type_t::f16};
     }
 #if defined(OPENVINO_ARCH_X86_64)
-    // @todo enable for bf16 as well
-    // after EnforceInferencePrecision is replaced with ConvertPrecision
-    return {Type_t::f32};
+    // BF16 compressed-activations path is intended for SIMD (avx512_vnni)
+    // dynamic-quant kernels. On AMX-capable HW, AMX BF16 TMUL outperforms
+    // VNNI int8 on prefill, so keep f32 here and let the existing AMX BF16
+    // path handle bf16 inference precision.
+    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
+        return {Type_t::f32};
+    }
+    return {Type_t::f32, Type_t::bf16};
 #elif defined(OV_CPU_WITH_KLEIDIAI)
     return {Type_t::f32};
 #else

@@ -3,6 +3,7 @@
 //
 
 #include "matmul_weights_decompression.hpp"
+
 #include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp"
 #include "openvino/runtime/intel_cpu/properties.hpp"
 
@@ -11,10 +12,19 @@ using namespace CPUTestUtils;
 namespace ov {
 namespace test {
 
-std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParamInfo<MatmulWeightsDecompressionParams>& obj) {
-    const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose,
-                 decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config,
-                 fusing_params, should_fuse] = obj.param;
+std::string MatmulWeightsDecompression::getTestCaseName(
+    const testing::TestParamInfo<MatmulWeightsDecompressionParams>& obj) {
+    const auto& [shape_params,
+                 weights_precision,
+                 decompression_precision,
+                 scale_precision,
+                 transpose,
+                 decompression_multiply_type,
+                 decompression_subtract_type,
+                 reshape_on_decompression,
+                 additional_config,
+                 fusing_params,
+                 should_fuse] = obj.param;
     std::ostringstream result;
     result << shape_params << "_";
     result << "weights_precision=" << weights_precision << "_";
@@ -36,53 +46,67 @@ std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParam
     return result.str();
 }
 
-std::shared_ptr<ov::Model> MatmulWeightsDecompression::initSubgraph(const ov::PartialShape& data_shape,
-                                                                    const ov::Shape& weights_shape,
-                                                                    const int group_size,
-                                                                    const ov::element::Type data_precision,
-                                                                    const ov::element::Type weights_precision,
-                                                                    const ov::element::Type decompression_precision,
-                                                                    const ov::element::Type scale_precision,
-                                                                    const bool transpose_weights,
-                                                                    const ov::test::utils::DecompressionType decompression_multiply_type,
-                                                                    const ov::test::utils::DecompressionType decompression_subtract_type,
-                                                                    const bool reshape_on_decompression) {
+// Builds the test model: a single Parameter (data_precision, data_shape) is fed to a MatMul whose
+// second input is the subgraph returned by initMatMulDecompressionSubgraph; the result is wrapped
+// by create_ov_model under the name "MatmulWeightsDecompression".
+std::shared_ptr<ov::Model> MatmulWeightsDecompression::initSubgraph(
+    const ov::PartialShape& data_shape,
+    const ov::Shape& weights_shape,
+    const int group_size,
+    const ov::element::Type data_precision,
+    const ov::element::Type weights_precision,
+    const ov::element::Type decompression_precision,
+    const ov::element::Type scale_precision,
+    const bool transpose_weights,
+    const ov::test::utils::DecompressionType decompression_multiply_type,
+    const ov::test::utils::DecompressionType decompression_subtract_type,
+    const bool reshape_on_decompression) {
     ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, data_shape)};
     const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape,
-                                                                    group_size,
-                                                                    data_precision,
-                                                                    weights_precision,
-                                                                    decompression_precision,
-                                                                    scale_precision,
-                                                                    transpose_weights,
-                                                                    decompression_multiply_type,
-                                                                    decompression_subtract_type,
-                                                                    reshape_on_decompression);
+                                                                  group_size,
+                                                                  data_precision,
+                                                                  weights_precision,
+                                                                  decompression_precision,
+                                                                  scale_precision,
+                                                                  transpose_weights,
+                                                                  decompression_multiply_type,
+                                                                  decompression_subtract_type,
+                                                                  reshape_on_decompression);
     auto matMul = std::make_shared<ov::op::v0::MatMul>(params[0], weights_subgraph);
     return create_ov_model(data_precision, params, matMul, "MatmulWeightsDecompression");
 }
 
 void MatmulWeightsDecompression::SetUp() {
     targetDevice = ov::test::utils::DEVICE_CPU;
-    const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose_weights,
-                 decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config,
-                 fusing_params, should_fuse] = GetParam();
+    const auto& [shape_params,
+                 weights_precision,
+                 decompression_precision,
+                 scale_precision,
+                 transpose_weights,
+                 decompression_multiply_type,
+                 decompression_subtract_type,
+                 reshape_on_decompression,
+                 additional_config,
+                 fusing_params,
+                 should_fuse] = GetParam();
     configuration.insert(additional_config.begin(), additional_config.end());
     std::tie(postOpMgrPtr, fusedOps) = fusing_params;
     init_input_shapes({shape_params.data_shape});
 
+    const bool dyn_quant_enabled = configuration.count(ov::hint::dynamic_quantization_group_size.name()) &&
+                                   configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0;
+    const auto inference_precision_hint =
+        configuration.count(ov::hint::inference_precision.name())
+            ? configuration.at(ov::hint::inference_precision.name()).as<ov::element::Type>()
+            : ov::element::dynamic;
+
     if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) {
         abs_threshold = 5e-3;
     }
 
     // if dynamic quantization is enabled
-    if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) &&
-        configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) {
-        abs_threshold = 0.1;
+    if (dyn_quant_enabled) {
+        abs_threshold = inference_precision_hint == ov::element::bf16 ? 0.2 : 0.1;
     }
 
-    if (configuration.count(ov::hint::inference_precision.name()) &&
-        configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) {
+    if (inference_precision_hint == ov::element::f16) {
         abs_threshold = 0.2;
     }
 
@@ -118,9 +142,8 @@ void MatmulWeightsDecompression::check_results() {
     type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
     EXPECT_EQ(type, "FullyConnected");
 
-    const auto& expected_weights_precision = use_matmul_decompression_impl
-                                                    ? compressed_weights_precision
-                                                    : fc->get_input_element_type(0);
+    const auto& expected_weights_precision =
+        use_matmul_decompression_impl ? compressed_weights_precision : fc->get_input_element_type(0);
     EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision);
 }
 

@@ -3,9 +3,9 @@
 //
 
 #include "common_test_utils/ov_tensor_utils.hpp"
+#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp"
 #include "shared_test_classes/base/ov_subgraph.hpp"
 #include "shared_test_classes/subgraph/weights_decompression_params.hpp"
-#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp"
 #include "utils/cpu_test_utils.hpp"
 #include "utils/fusing_test_utils.hpp"
 
@@ -42,14 +42,14 @@ namespace test {
  *              Bias
  */
 typedef std::tuple<MatMulDecompressionShapeParams,
-                   ov::test::ElementType,      // weights precision
-                   ov::test::ElementType,      // decompression precision
-                   ov::test::ElementType,      // scale precision
-                   bool,                       // transpose on weights
-                   ov::test::utils::DecompressionType,          // decompression multiply type
-                   ov::test::utils::DecompressionType,          // decompression subtract type
-                   bool,                       // reshape on decompression constants
-                   ov::AnyMap,                 // additional config
+                   ov::test::ElementType,               // weights precision
+                   ov::test::ElementType,               // decompression precision
+                   ov::test::ElementType,               // scale precision
+                   bool,                                // transpose on weights
+                   ov::test::utils::DecompressionType,  // decompression multiply type
+                   ov::test::utils::DecompressionType,  // decompression subtract type
+                   bool,                                // reshape on decompression constants
+                   ov::AnyMap,                          // additional config
                    fusingSpecificParams,
                    bool>  // should use decompression implementation
     MatmulWeightsDecompressionParams;

@@ -302,6 +302,22 @@ std::vector<ov::AnyMap> filter_additional_config_dyn_quant() {
     return additional_config;
 }
 
+// Returns the plugin configs used to exercise the BF16 dynamic-quant compressed-FC path.
+//
+// The path is selected through the inference_precision hint on top of an f32 IR: the
+// ConvertPrecision pipeline is expected to adjust the decompression chain to bf16, so the
+// test must not pre-bake bf16 into the IR.
+//
+// The list is empty unless ov::with_cpu_x86_bfloat16() is true and
+// ov::with_cpu_x86_avx512_core_amx() is false.
+std::vector<ov::AnyMap> filter_additional_config_dyn_quant_bf16() {
+    const bool bf16_dyn_quant_testable = ov::with_cpu_x86_bfloat16() && !ov::with_cpu_x86_avx512_core_amx();
+    if (!bf16_dyn_quant_testable) {
+        return {};
+    }
+
+    // Every config carries the same bf16 precision hint; only the group size varies.
+    const auto bf16_hint = ov::hint::inference_precision(ov::element::bf16);
+    return std::vector<ov::AnyMap>{
+        {ov::hint::dynamic_quantization_group_size(0), bf16_hint},
+        {ov::hint::dynamic_quantization_group_size(16), bf16_hint},
+        {ov::hint::dynamic_quantization_group_size(128), bf16_hint},
+    };
+}
+
 INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes,
                          MatmulWeightsDecompression,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant),
@@ -317,6 +333,21 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_gro
                                             ::testing::Values(true)),
                          MatmulWeightsDecompression::getTestCaseName);
 
+// Dynamic-quantization group-size cases run with the bf16 inference-precision hint.
+// The additional-config axis comes from filter_additional_config_dyn_quant_bf16(); when that
+// helper returns an empty list, Combine() yields no parameter sets and no tests are generated.
+INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_bf16,
+                         MatmulWeightsDecompression,
+                         ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant),
+                                            ::testing::ValuesIn(weights_precisions_dyn_quant),
+                                            ::testing::ValuesIn(decompression_precisions),
+                                            ::testing::Values(ov::element::dynamic),
+                                            ::testing::Values(true),
+                                            ::testing::Values(DecompressionType::full),
+                                            ::testing::ValuesIn(decompression_subtract_type),
+                                            ::testing::Values(false),
+                                            ::testing::ValuesIn(filter_additional_config_dyn_quant_bf16()),
+                                            ::testing::ValuesIn(fusing_params_dyn_quant),
+                                            ::testing::Values(true)),
+                         MatmulWeightsDecompression::getTestCaseName);
+
 INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_u2,
                          MatmulWeightsDecompression,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant_u2),
+11 −0		src/cpu/cpu_inner_product_list.cpp
+295 −127		src/cpu/x64/jit_brgemm_inner_product.cpp
+29 −20		src/cpu/x64/jit_brgemm_src_quantization_kernel.cpp