Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -233,23 +233,6 @@ ov::pass::ConvertWeightCompressedConv1x1ToMatmul::ConvertWeightCompressedConv1x1
}
}

// If the activation has a static leading dimension of 1, squeeze it.
// This is done to allow pre-selection of OCL implementations for non-IMMAD devices, reducing memory pressure.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Judging by the comment, this change is only useful for non-IMMAD devices. Does it make sense in the supports_immad case?
If not, transformations_pipeline.cpp has access to the device information, so the transformation can be disabled in that case.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the check.

bool squeeze_activation = false;
auto act_pshape = activation->get_output_partial_shape(0);
if (act_pshape.rank().is_static() && act_pshape.rank().get_length() >= 4 && act_pshape[0].is_static() &&
act_pshape[0] == 1) {
squeeze_activation = true;
auto squeeze_const =
std::make_shared<ov::op::v0::Constant>(ov::element::i64,
ov::Shape{3},
std::vector<int64_t>{1, -1, act_pshape[-1].get_length()});
auto squeeze = std::make_shared<ov::op::v1::Reshape>(activation, squeeze_const, false);
ov::copy_runtime_info(activation, squeeze);
squeeze->set_friendly_name(activation->get_friendly_name() + "_squeeze");
activation = squeeze;
}

auto matmul = std::make_shared<ov::op::v0::MatMul>(activation, scaled_weight, false, true);
ov::copy_runtime_info(conv1x1, matmul);
std::shared_ptr<Node> matmul_out;
Expand All @@ -275,18 +258,6 @@ ov::pass::ConvertWeightCompressedConv1x1ToMatmul::ConvertWeightCompressedConv1x1
matmul_out = matmul;
}

if (squeeze_activation) {
auto shape_out = matmul_out->get_output_partial_shape(0);
auto unsqueeze_const =
std::make_shared<ov::op::v0::Constant>(ov::element::i64,
ov::Shape{4},
std::vector<int64_t>{1, 1, -1, shape_out[-1].get_length()});
auto unsqueeze = std::make_shared<ov::op::v1::Reshape>(matmul_out, unsqueeze_const, false);
ov::copy_runtime_info(matmul_out, unsqueeze);
unsqueeze->set_friendly_name(matmul_out->get_friendly_name() + "_unsqueeze");
matmul_out = unsqueeze;
}

if (reshape_out) {
if (convert_out) {
auto convert_final = convert_out->clone_with_new_inputs({matmul_out});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,21 +178,13 @@ std::shared_ptr<ov::Model> gen_model_ref(const Conv1x1ToMatmulTestParams& p) {
auto reshape_const = ov::opset1::Constant::create(ov::element::i64, ov::Shape{4}, {1, 1, input_batch, 10});
act_node = std::make_shared<ov::opset1::Reshape>(input, reshape_const, false);
}
if (input_batch == 1 || (p.activation_op_type == "Reshape" && p.with_act_new_reshape)) {
auto squeeze_const = ov::opset1::Constant::create(ov::element::i64, ov::Shape{3}, {1, input_batch, 10});
act_node = std::make_shared<ov::opset1::Reshape>(act_node, squeeze_const, false);
}
auto matmul = std::make_shared<ov::op::v0::MatMul>(act_node, mul, false, true);
current_node = matmul;

if (p.with_bias) {
auto bias_const = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 1, 1, 15}, {1});
current_node = std::make_shared<ov::opset1::Add>(current_node, bias_const);
}
if (input_batch == 1 || (p.activation_op_type == "Reshape" && p.with_act_new_reshape)) {
auto unsqueeze_const = ov::opset1::Constant::create(ov::element::i64, ov::Shape{4}, {1, 1, input_batch, 15});
current_node = std::make_shared<ov::opset1::Reshape>(current_node, unsqueeze_const, false);
}
if (p.with_convert) {
current_node = std::make_shared<ov::op::v0::Convert>(current_node, ov::element::f32);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "reduce_fc_dimensions.hpp"
#include "intel_gpu/op/fully_connected.hpp"
#include "intel_gpu/op/placeholder.hpp"
#include "openvino/core/graph_util.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/pass/pattern/op/any.hpp"
#include "openvino/pass/pattern/op/pattern.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"

namespace ov::intel_gpu {

// Registers a matcher that rewrites
//     FullyConnected(activations[1, 1, ?, ?], weights[?, ?], Placeholder)
// into
//     Reshape[1, 1, -1, N]( FullyConnected( Reshape[1, -1, K](activations), weights, Placeholder ) )
// where N and K are the static dimensions 0 and 1 of the weights input.
// The rewrite is skipped when the weights shape is not fully static.
ReduceFCDimensions::ReduceFCDimensions() {
    namespace pat = ov::pass::pattern;

    // Pattern: FullyConnected whose third input is a Placeholder (no bias), fed by
    // 4D activations with two leading unit dimensions and by 2D weights.
    auto act_label = pat::any_input(pat::shape_matches("[1, 1, ?, ?]"));
    auto wei_label = pat::any_input(pat::shape_matches("[?, ?]"));
    auto bias_label = pat::wrap_type<op::Placeholder>();
    auto fc_label = pat::wrap_type<op::FullyConnected>({act_label, wei_label, bias_label});

    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](pat::Matcher& m) {
        const auto& matched = m.get_pattern_value_map();

        const auto act_node = matched.at(act_label).get_node_shared_ptr();
        const auto wei_node = matched.at(wei_label).get_node_shared_ptr();
        const auto bias_node = matched.at(bias_label).get_node_shared_ptr();
        const auto fc_node = matched.at(fc_label).get_node_shared_ptr();

        // Both weight dimensions are baked into the Reshape targets below,
        // so a dynamic weights shape cannot be handled.
        const auto& wei_shape = wei_node->get_output_partial_shape(0);
        if (wei_shape.is_dynamic()) {
            return false;
        }

        // Builds a 1D i64 constant holding a Reshape target shape.
        const auto make_target_shape = [](const std::vector<int64_t>& dims) {
            return std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{dims.size()}, dims);
        };

        // [1, 1, ?, K] -> [1, -1, K]
        auto to_3d_shape = make_target_shape({1, -1, wei_shape[1].get_length()});
        auto to_3d = std::make_shared<ov::op::v1::Reshape>(act_node, to_3d_shape, false);
        to_3d->set_friendly_name(act_node->get_friendly_name() + "_squeeze");
        ov::copy_runtime_info(act_node, to_3d);

        // Same FullyConnected, now consuming the 3D activations.
        auto fc_3d = fc_node->clone_with_new_inputs({to_3d, wei_node, bias_node});
        ov::copy_runtime_info(fc_node, fc_3d);

        // [1, ?, N] -> [1, 1, -1, N], restoring the rank consumers of the original node saw.
        auto to_4d_shape = make_target_shape({1, 1, -1, wei_shape[0].get_length()});
        ov::copy_runtime_info(fc_node, to_4d_shape);
        auto to_4d = std::make_shared<ov::op::v1::Reshape>(fc_3d, to_4d_shape, false);
        ov::copy_runtime_info(fc_node, to_4d);
        to_4d->set_friendly_name(fc_node->get_friendly_name() + "_unsqueeze");

        ov::replace_node(fc_node, to_4d);
        return true;
    };

    register_matcher(std::make_shared<pat::Matcher>(fc_label, "ReduceFCDimensions"), callback);
}

} // namespace ov::intel_gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/graph_rewrite.hpp"

namespace ov::intel_gpu {

/// Matcher pass that lowers the activation rank of a bias-less FullyConnected.
///
/// It matches a FullyConnected whose activations have shape [1, 1, ?, ?], whose
/// weights are 2D and whose bias input is a Placeholder. On a match with a fully
/// static weights shape [N, K], the activations are reshaped to [1, -1, K] before
/// the FullyConnected and its result is reshaped back to [1, 1, -1, N]. Matches
/// with a dynamic weights shape are left untouched.
///
/// NOTE(review): the pipeline registers this pass only when the device does not
/// report supports_immad; presumably the 3D form helps implementation selection
/// on such devices - confirm with the pass author.
class ReduceFCDimensions : public ov::pass::MatcherPass {
public:
// RTTI name under which the pass is identified.
OPENVINO_MATCHER_PASS_RTTI("ReduceFCDimensions");
// Builds the pattern and registers the rewrite callback.
ReduceFCDimensions();
};

} // namespace ov::intel_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
#include "plugin/transformations/move_fc_reshape_to_weights.hpp"
#include "plugin/transformations/optimize_subsequent_reshapes.hpp"
#include "plugin/transformations/print_model_statistics.hpp"
#include "plugin/transformations/reduce_fc_dimensions.hpp"
#include "plugin/transformations/sink_reshape.hpp"
#include "plugin/transformations/transpose_fusion.hpp"
#include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp"
Expand Down Expand Up @@ -1517,6 +1518,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::ClampFP16Output>();
manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>(device_info.supports_immad);
manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
if (!device_info.supports_immad) {
manager.register_pass<ov::intel_gpu::ReduceFCDimensions>();
}
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>();

const bool disable_horizontal_fc_fusion = GPU_DEBUG_VALUE_OR(config.get_disable_horizontal_fc_fusion(), false);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <memory>

#include "common_test_utils/ov_test_utils.hpp"
#include "intel_gpu/op/fully_connected.hpp"
#include "intel_gpu/op/placeholder.hpp"
#include "openvino/core/model.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/result.hpp"
#include "openvino/pass/manager.hpp"
#include "plugin/transformations/reduce_fc_dimensions.hpp"

using namespace testing;
using namespace ov::intel_gpu;

namespace ov {
namespace test {
namespace intel_gpu {

// Regular case, transformation should trigger
// Positive case: [1, 1, ?, 16] activations, static [32, 16] weights, no bias.
// The FullyConnected is expected to end up between a [1, -1, 16] Reshape and a [1, 1, -1, 32] Reshape.
TEST_F(TransformationTestsF, ReduceFCDimensions1) {
    // Model handed to the pass.
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
        const auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_placeholder = std::make_shared<ov::intel_gpu::op::Placeholder>();
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(activations, scaled_weights, bias_placeholder);

        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activations});
        manager.register_pass<ReduceFCDimensions>();
    }
    // Expected graph after the pass.
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
        const auto to_3d_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, -1, 16});
        const auto to_3d = std::make_shared<ov::op::v1::Reshape>(activations, to_3d_shape, false);
        const auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_placeholder = std::make_shared<ov::intel_gpu::op::Placeholder>();
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(to_3d, scaled_weights, bias_placeholder);
        const auto to_4d_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 1, -1, 32});
        const auto to_4d = std::make_shared<ov::op::v1::Reshape>(fully_connected, to_4d_shape, false);

        model_ref = std::make_shared<ov::Model>(ov::OutputVector{to_4d}, ov::ParameterVector{activations});
    }
}

// Incorrect input size, transformation should not trigger
// Negative case: the activation shape [1, 4, ?, 16] has a non-unit second dimension,
// so the graph is expected to stay unchanged.
TEST_F(TransformationTestsF, ReduceFCDimensions2) {
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 4, -1, 16});
        const auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_placeholder = std::make_shared<ov::intel_gpu::op::Placeholder>();
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(activations, scaled_weights, bias_placeholder);

        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activations});
        manager.register_pass<ReduceFCDimensions>();
    }

    // The reference is a copy of the untouched input model.
    model_ref = model->clone();
}

// Bias present, transformation should not trigger
// Negative case: the FullyConnected carries a real bias constant instead of a Placeholder,
// so the graph is expected to stay unchanged.
TEST_F(TransformationTestsF, ReduceFCDimensions3) {
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
        const auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1, 1, 1, 32}, {1.0});
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(activations, scaled_weights, bias_values);

        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activations});
        manager.register_pass<ReduceFCDimensions>();
    }

    // The reference is a copy of the untouched input model.
    model_ref = model->clone();
}

// 3D weight, transformation should not trigger
// Negative case: the weights are 3D ([4, 32, 16]) rather than 2D,
// so the graph is expected to stay unchanged.
TEST_F(TransformationTestsF, ReduceFCDimensions4) {
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
        const auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 32, 16}, {1});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4, 32, 1}, {1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_placeholder = std::make_shared<ov::intel_gpu::op::Placeholder>();
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(activations, scaled_weights, bias_placeholder);

        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activations});
        manager.register_pass<ReduceFCDimensions>();
    }

    // The reference is a copy of the untouched input model.
    model_ref = model->clone();
}

// Dynamic result dim, transformation should not trigger
// Negative case: the output-feature dimension of the weights ([?, 16]) is dynamic,
// so the graph is expected to stay unchanged.
TEST_F(TransformationTestsF, ReduceFCDimensions5) {
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
        const auto packed_weights =
            std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape{-1, 16});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_placeholder = std::make_shared<ov::intel_gpu::op::Placeholder>();
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(activations, scaled_weights, bias_placeholder);

        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected},
                                            ov::ParameterVector{activations, packed_weights, scales});
        manager.register_pass<ReduceFCDimensions>();
    }

    // The reference is a copy of the untouched input model.
    model_ref = model->clone();
}

// Dynamic inner dim, transformation should not trigger
// Negative case: the inner (reduction) dimension is dynamic on both the activations
// ([1, 1, 10, ?]) and the weights ([32, ?]), so the graph is expected to stay unchanged.
TEST_F(TransformationTestsF, ReduceFCDimensions6) {
    {
        const auto activations =
            std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, 10, -1});
        const auto packed_weights =
            std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape{32, -1});
        const auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
        const auto scales = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
        const auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales);
        const auto bias_placeholder = std::make_shared<ov::intel_gpu::op::Placeholder>();
        const auto fully_connected =
            std::make_shared<ov::intel_gpu::op::FullyConnected>(activations, scaled_weights, bias_placeholder);

        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected},
                                            ov::ParameterVector{activations, packed_weights});
        manager.register_pass<ReduceFCDimensions>();
    }

    // The reference is a copy of the untouched input model.
    model_ref = model->clone();
}

} // namespace intel_gpu
} // namespace test
} // namespace ov
Loading