Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,24 @@ static bool useDynamicQuantizationImpl(size_t dqGroupSize,
return false;
}

if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) &&
!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) {
return false;
}

if (srcDesc->getPrecision() != ov::element::f32) {
// BF16 dynamic-quant path requires native x86 BF16 HW support (AVX512_BF16) AND an
// AVX512-VNNI impl in oneDNN (only avx512_core_vnni instance is registered for bf16 src).
const auto srcPrecision = srcDesc->getPrecision();
if (srcPrecision == ov::element::bf16) {
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) {
return false;
}
// On AMX-capable HW, AMX BF16 TMUL outperforms VNNI int8 dyn-quant for bf16 src.
// Disable the bf16 dyn-quant entry here so the AMX BF16 path stays in use.
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
return false;
}
} else if (srcPrecision == ov::element::f32) {
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni) &&
!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni)) {
return false;
}
} else {
return false;
}

Expand Down
11 changes: 8 additions & 3 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,14 @@ ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes()
return {Type_t::f32, Type_t::f16};
}
#if defined(OPENVINO_ARCH_X86_64)
// @todo enable for bf16 as well
// after EnforceInferencePrecision is replaced with ConvertPrecision
return {Type_t::f32};
// BF16 compressed-activations path is intended for SIMD (avx512_vnni)
// dynamic-quant kernels. On AMX-capable HW, AMX BF16 TMUL outperforms
// VNNI int8 on prefill, so keep f32 here and let the existing AMX BF16
// path handle bf16 inference precision.
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
return {Type_t::f32};
}
return {Type_t::f32, Type_t::bf16};
Comment thread
liubo-intel marked this conversation as resolved.
Comment thread
maxnick marked this conversation as resolved.
#elif defined(OV_CPU_WITH_KLEIDIAI)
return {Type_t::f32};
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//

#include "matmul_weights_decompression.hpp"

#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp"
#include "openvino/runtime/intel_cpu/properties.hpp"

Expand All @@ -11,10 +12,19 @@ using namespace CPUTestUtils;
namespace ov {
namespace test {

std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParamInfo<MatmulWeightsDecompressionParams>& obj) {
const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose,
decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config,
fusing_params, should_fuse] = obj.param;
std::string MatmulWeightsDecompression::getTestCaseName(
const testing::TestParamInfo<MatmulWeightsDecompressionParams>& obj) {
const auto& [shape_params,
weights_precision,
decompression_precision,
scale_precision,
transpose,
decompression_multiply_type,
decompression_subtract_type,
reshape_on_decompression,
additional_config,
fusing_params,
should_fuse] = obj.param;
std::ostringstream result;
result << shape_params << "_";
result << "weights_precision=" << weights_precision << "_";
Expand All @@ -36,53 +46,67 @@ std::string MatmulWeightsDecompression::getTestCaseName(const testing::TestParam
return result.str();
}

std::shared_ptr<ov::Model> MatmulWeightsDecompression::initSubgraph(const ov::PartialShape& data_shape,
const ov::Shape& weights_shape,
const int group_size,
const ov::element::Type data_precision,
const ov::element::Type weights_precision,
const ov::element::Type decompression_precision,
const ov::element::Type scale_precision,
const bool transpose_weights,
const ov::test::utils::DecompressionType decompression_multiply_type,
const ov::test::utils::DecompressionType decompression_subtract_type,
const bool reshape_on_decompression) {
std::shared_ptr<ov::Model> MatmulWeightsDecompression::initSubgraph(
const ov::PartialShape& data_shape,
const ov::Shape& weights_shape,
const int group_size,
const ov::element::Type data_precision,
const ov::element::Type weights_precision,
const ov::element::Type decompression_precision,
const ov::element::Type scale_precision,
const bool transpose_weights,
const ov::test::utils::DecompressionType decompression_multiply_type,
const ov::test::utils::DecompressionType decompression_subtract_type,
const bool reshape_on_decompression) {
ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, data_shape)};
const auto weights_subgraph = initMatMulDecompressionSubgraph(weights_shape,
group_size,
data_precision,
weights_precision,
decompression_precision,
scale_precision,
transpose_weights,
decompression_multiply_type,
decompression_subtract_type,
reshape_on_decompression);
group_size,
data_precision,
weights_precision,
decompression_precision,
scale_precision,
transpose_weights,
decompression_multiply_type,
decompression_subtract_type,
reshape_on_decompression);
auto matMul = std::make_shared<ov::op::v0::MatMul>(params[0], weights_subgraph);
return create_ov_model(data_precision, params, matMul, "MatmulWeightsDecompression");
}

void MatmulWeightsDecompression::SetUp() {
targetDevice = ov::test::utils::DEVICE_CPU;
const auto& [shape_params, weights_precision, decompression_precision, scale_precision, transpose_weights,
decompression_multiply_type, decompression_subtract_type, reshape_on_decompression, additional_config,
fusing_params, should_fuse] = GetParam();
const auto& [shape_params,
weights_precision,
decompression_precision,
scale_precision,
transpose_weights,
decompression_multiply_type,
decompression_subtract_type,
reshape_on_decompression,
additional_config,
fusing_params,
should_fuse] = GetParam();
configuration.insert(additional_config.begin(), additional_config.end());
std::tie(postOpMgrPtr, fusedOps) = fusing_params;
init_input_shapes({shape_params.data_shape});

const bool dyn_quant_enabled = configuration.count(ov::hint::dynamic_quantization_group_size.name()) &&
configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0;
const auto inference_precision_hint =
configuration.count(ov::hint::inference_precision.name())
? configuration.at(ov::hint::inference_precision.name()).as<ov::element::Type>()
: ov::element::dynamic;

if (!configuration.count(ov::hint::dynamic_quantization_group_size.name())) {
abs_threshold = 5e-3;
}

// if dynamic quantization is enabled
if (configuration.count(ov::hint::dynamic_quantization_group_size.name()) &&
configuration.at(ov::hint::dynamic_quantization_group_size.name()) != 0) {
abs_threshold = 0.1;
if (dyn_quant_enabled) {
abs_threshold = inference_precision_hint == ov::element::bf16 ? 0.2 : 0.1;
}

if (configuration.count(ov::hint::inference_precision.name()) &&
configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) {
if (inference_precision_hint == ov::element::f16) {
abs_threshold = 0.2;
}

Expand Down Expand Up @@ -118,9 +142,8 @@ void MatmulWeightsDecompression::check_results() {
type = fc->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
EXPECT_EQ(type, "FullyConnected");

const auto& expected_weights_precision = use_matmul_decompression_impl
? compressed_weights_precision
: fc->get_input_element_type(0);
const auto& expected_weights_precision =
use_matmul_decompression_impl ? compressed_weights_precision : fc->get_input_element_type(0);
EXPECT_EQ(fc->get_input_element_type(1), expected_weights_precision);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
//

#include "common_test_utils/ov_tensor_utils.hpp"
#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "shared_test_classes/subgraph/weights_decompression_params.hpp"
#include "common_test_utils/subgraph_builders/weights_decompression_builders.hpp"
#include "utils/cpu_test_utils.hpp"
#include "utils/fusing_test_utils.hpp"

Expand Down Expand Up @@ -42,14 +42,14 @@ namespace test {
* Bias
*/
typedef std::tuple<MatMulDecompressionShapeParams,
ov::test::ElementType, // weights precision
ov::test::ElementType, // decompression precision
ov::test::ElementType, // scale precision
bool, // transpose on weights
ov::test::utils::DecompressionType, // decompression multiply type
ov::test::utils::DecompressionType, // decompression subtract type
bool, // reshape on decompression constants
ov::AnyMap, // additional config
ov::test::ElementType, // weights precision
ov::test::ElementType, // decompression precision
ov::test::ElementType, // scale precision
bool, // transpose on weights
ov::test::utils::DecompressionType, // decompression multiply type
ov::test::utils::DecompressionType, // decompression subtract type
bool, // reshape on decompression constants
ov::AnyMap, // additional config
fusingSpecificParams,
bool> // should use decompression implementation
MatmulWeightsDecompressionParams;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,22 @@ std::vector<ov::AnyMap> filter_additional_config_dyn_quant() {
return additional_config;
}

std::vector<ov::AnyMap> filter_additional_config_dyn_quant_bf16() {
// Drive the BF16 dynamic-quant compressed-FC path through the inference_precision
// hint on top of an f32 IR. The ConvertPrecision pipeline is responsible for
// adjusting the decompression chain to bf16; the test should not pre-bake bf16
// into the IR.
std::vector<ov::AnyMap> additional_config = {};
if (ov::with_cpu_x86_bfloat16() && !ov::with_cpu_x86_avx512_core_amx()) {
additional_config = {
{ov::hint::dynamic_quantization_group_size(0), ov::hint::inference_precision(ov::element::bf16)},
{ov::hint::dynamic_quantization_group_size(16), ov::hint::inference_precision(ov::element::bf16)},
{ov::hint::dynamic_quantization_group_size(128), ov::hint::inference_precision(ov::element::bf16)},
};
}
return additional_config;
}

INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes,
MatmulWeightsDecompression,
::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant),
Expand All @@ -317,6 +333,21 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_gro
::testing::Values(true)),
MatmulWeightsDecompression::getTestCaseName);

// Instantiates MatmulWeightsDecompression for the bf16 dynamic-quantization case.
// The additional configs come from filter_additional_config_dyn_quant_bf16(), i.e. the
// bf16 inference_precision hint combined with dynamic-quantization group sizes 0, 16 and 128.
// That list is empty when the CPU lacks x86 bf16 support or reports AMX, so on such
// machines this instantiation is expected to contribute no test cases.
// The per-argument notes below follow the field order of MatmulWeightsDecompressionParams.
INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_bf16,
MatmulWeightsDecompression,
::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant),  // shape params
::testing::ValuesIn(weights_precisions_dyn_quant),  // weights precision
::testing::ValuesIn(decompression_precisions),  // decompression precision
::testing::Values(ov::element::dynamic),  // scale precision
::testing::Values(true),  // transpose on weights
::testing::Values(DecompressionType::full),  // decompression multiply type
::testing::ValuesIn(decompression_subtract_type),  // decompression subtract type
::testing::Values(false),  // reshape on decompression constants
::testing::ValuesIn(filter_additional_config_dyn_quant_bf16()),  // additional config
::testing::ValuesIn(fusing_params_dyn_quant),  // fusing params
::testing::Values(true)),  // should use decompression implementation
MatmulWeightsDecompression::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_non_default_dyn_quant_group_sizes_u2,
MatmulWeightsDecompression,
::testing::Combine(::testing::ValuesIn(input_shapes_basic_dyn_quant_u2),
Expand Down
Loading