openvinotoolkit · mdvoretc-intel · Apr 23, 2026 · Apr 27, 2026 · Jun 3, 2026 · Jun 3, 2026
@@ -233,23 +233,6 @@ ov::pass::ConvertWeightCompressedConv1x1ToMatmul::ConvertWeightCompressedConv1x1
             }
         }
 
-        // If the activation has a static leading dimension of 1, squeeze it.
-        // This is done to allow pre-selection of OCL implementations for non-IMMAD devices, reducing memory pressure.
-        bool squeeze_activation = false;
-        auto act_pshape = activation->get_output_partial_shape(0);
-        if (act_pshape.rank().is_static() && act_pshape.rank().get_length() >= 4 && act_pshape[0].is_static() &&
-            act_pshape[0] == 1) {
-            squeeze_activation = true;
-            auto squeeze_const =
-                std::make_shared<ov::op::v0::Constant>(ov::element::i64,
-                                                       ov::Shape{3},
-                                                       std::vector<int64_t>{1, -1, act_pshape[-1].get_length()});
-            auto squeeze = std::make_shared<ov::op::v1::Reshape>(activation, squeeze_const, false);
-            ov::copy_runtime_info(activation, squeeze);
-            squeeze->set_friendly_name(activation->get_friendly_name() + "_squeeze");
-            activation = squeeze;
-        }
-
         auto matmul = std::make_shared<ov::op::v0::MatMul>(activation, scaled_weight, false, true);
         ov::copy_runtime_info(conv1x1, matmul);
         std::shared_ptr<Node> matmul_out;
@@ -275,18 +258,6 @@ ov::pass::ConvertWeightCompressedConv1x1ToMatmul::ConvertWeightCompressedConv1x1
             matmul_out = matmul;
         }
 
-        if (squeeze_activation) {
-            auto shape_out = matmul_out->get_output_partial_shape(0);
-            auto unsqueeze_const =
-                std::make_shared<ov::op::v0::Constant>(ov::element::i64,
-                                                       ov::Shape{4},
-                                                       std::vector<int64_t>{1, 1, -1, shape_out[-1].get_length()});
-            auto unsqueeze = std::make_shared<ov::op::v1::Reshape>(matmul_out, unsqueeze_const, false);
-            ov::copy_runtime_info(matmul_out, unsqueeze);
-            unsqueeze->set_friendly_name(matmul_out->get_friendly_name() + "_unsqueeze");
-            matmul_out = unsqueeze;
-        }
-
         if (reshape_out) {
             if (convert_out) {
                 auto convert_final = convert_out->clone_with_new_inputs({matmul_out});

@@ -178,21 +178,13 @@ std::shared_ptr<ov::Model> gen_model_ref(const Conv1x1ToMatmulTestParams& p) {
         auto reshape_const = ov::opset1::Constant::create(ov::element::i64, ov::Shape{4}, {1, 1, input_batch, 10});
         act_node = std::make_shared<ov::opset1::Reshape>(input, reshape_const, false);
     }
-    if (input_batch == 1 || (p.activation_op_type == "Reshape" && p.with_act_new_reshape)) {
-        auto squeeze_const = ov::opset1::Constant::create(ov::element::i64, ov::Shape{3}, {1, input_batch, 10});
-        act_node = std::make_shared<ov::opset1::Reshape>(act_node, squeeze_const, false);
-    }
     auto matmul = std::make_shared<ov::op::v0::MatMul>(act_node, mul, false, true);
     current_node = matmul;
 
     if (p.with_bias) {
         auto bias_const = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 1, 1, 15}, {1});
         current_node = std::make_shared<ov::opset1::Add>(current_node, bias_const);
     }
-    if (input_batch == 1 || (p.activation_op_type == "Reshape" && p.with_act_new_reshape)) {
-        auto unsqueeze_const = ov::opset1::Constant::create(ov::element::i64, ov::Shape{4}, {1, 1, input_batch, 15});
-        current_node = std::make_shared<ov::opset1::Reshape>(current_node, unsqueeze_const, false);
-    }
     if (p.with_convert) {
         current_node = std::make_shared<ov::op::v0::Convert>(current_node, ov::element::f32);
     }

@@ -0,0 +1,61 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "reduce_fc_dimensions.hpp"
+#include "intel_gpu/op/fully_connected.hpp"
+#include "intel_gpu/op/placeholder.hpp"
+#include "openvino/core/graph_util.hpp"
+#include "openvino/core/rt_info.hpp"
+#include "openvino/op/reshape.hpp"
+#include "openvino/pass/pattern/op/any.hpp"
+#include "openvino/pass/pattern/op/pattern.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "transformations/utils/utils.hpp"
+
+namespace ov::intel_gpu {
+
+// Sets up the matcher for a FullyConnected whose activation is [1, 1, ?, ?], whose weights are 2D and whose
+// third input is a Placeholder, and registers the callback that wraps that FC in a pair of Reshape nodes
+// (4D -> 3D in front of it, 3D -> 4D behind it).
+ReduceFCDimensions::ReduceFCDimensions() {
+    namespace ptrn = ov::pass::pattern;
+
+    auto act_pattern = ptrn::any_input(ptrn::shape_matches("[1, 1, ?, ?]"));
+    auto wei_pattern = ptrn::any_input(ptrn::shape_matches("[?, ?]"));
+    auto bias_pattern = ptrn::wrap_type<op::Placeholder>();
+    auto fc_pattern = ptrn::wrap_type<op::FullyConnected>({act_pattern, wei_pattern, bias_pattern});
+
+    // Returns true when the matched FC was replaced, false when the match is left untouched.
+    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ptrn::Matcher& m) {
+        const auto& matched = m.get_pattern_value_map();
+
+        auto fc_node = matched.at(fc_pattern).get_node_shared_ptr();
+        auto act_node = matched.at(act_pattern).get_node_shared_ptr();
+        auto wei_node = matched.at(wei_pattern).get_node_shared_ptr();
+        auto bias_node = matched.at(bias_pattern).get_node_shared_ptr();
+
+        // Both weight dimensions are baked into the Reshape targets below, so they must be static.
+        auto weights_shape = wei_node->get_output_partial_shape(0);
+        if (weights_shape.is_dynamic()) {
+            return false;
+        }
+        const int64_t out_features = weights_shape[0].get_length();
+        const int64_t in_features = weights_shape[1].get_length();
+
+        // Builds a 1D i64 constant holding the given Reshape target dimensions.
+        auto make_target_shape = [](const std::vector<int64_t>& dims) {
+            return std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{dims.size()}, dims);
+        };
+
+        // 4D -> 3D: keep one leading unit dim, let -1 absorb the remaining rows.
+        auto to_3d_shape = make_target_shape({1, -1, in_features});
+        auto to_3d = std::make_shared<ov::op::v1::Reshape>(act_node, to_3d_shape, false);
+        to_3d->set_friendly_name(act_node->get_friendly_name() + "_squeeze");
+        ov::copy_runtime_info(act_node, to_3d);
+
+        // Same FC, now fed by the 3D activation view.
+        auto fc_3d = fc_node->clone_with_new_inputs({to_3d, wei_node, bias_node});
+        ov::copy_runtime_info(fc_node, fc_3d);
+
+        // 3D -> 4D: restore the two leading unit dims on the FC result.
+        auto to_4d_shape = make_target_shape({1, 1, -1, out_features});
+        ov::copy_runtime_info(fc_node, to_4d_shape);
+        auto to_4d = std::make_shared<ov::op::v1::Reshape>(fc_3d, to_4d_shape, false);
+        ov::copy_runtime_info(fc_node, to_4d);
+        to_4d->set_friendly_name(fc_node->get_friendly_name() + "_unsqueeze");
+
+        ov::replace_node(fc_node, to_4d);
+        return true;
+    };
+
+    auto fc_matcher = std::make_shared<ptrn::Matcher>(fc_pattern, "ReduceFCDimensions");
+    this->register_matcher(fc_matcher, callback);
+}
+
+}  // namespace ov::intel_gpu
@@ -0,0 +1,17 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/graph_rewrite.hpp"
+
+namespace ov::intel_gpu {
+
+class ReduceFCDimensions : public ov::pass::MatcherPass {  // Wraps a no-bias FullyConnected with [1, 1, ?, ?] activations in 4D->3D / 3D->4D Reshapes.
+public:
+    OPENVINO_MATCHER_PASS_RTTI("ReduceFCDimensions");
+    ReduceFCDimensions();  // Builds the pattern and registers the rewrite callback.
+};
+
+}  // namespace ov::intel_gpu
@@ -105,6 +105,7 @@
 #include "plugin/transformations/move_fc_reshape_to_weights.hpp"
 #include "plugin/transformations/optimize_subsequent_reshapes.hpp"
 #include "plugin/transformations/print_model_statistics.hpp"
+#include "plugin/transformations/reduce_fc_dimensions.hpp"
 #include "plugin/transformations/sink_reshape.hpp"
 #include "plugin/transformations/transpose_fusion.hpp"
 #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp"
@@ -1517,6 +1518,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ov::intel_gpu::ClampFP16Output>();
         manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>(device_info.supports_immad);
         manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
+        if (!device_info.supports_immad) {
+            manager.register_pass<ov::intel_gpu::ReduceFCDimensions>();
+        }
         manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>();
 
         const bool disable_horizontal_fc_fusion = GPU_DEBUG_VALUE_OR(config.get_disable_horizontal_fc_fusion(), false);

@@ -0,0 +1,156 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <memory>
+
+#include "common_test_utils/ov_test_utils.hpp"
+#include "intel_gpu/op/fully_connected.hpp"
+#include "intel_gpu/op/placeholder.hpp"
+#include "openvino/core/model.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/convert.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/parameter.hpp"
+#include "openvino/op/reshape.hpp"
+#include "openvino/op/result.hpp"
+#include "openvino/pass/manager.hpp"
+#include "plugin/transformations/reduce_fc_dimensions.hpp"
+
+using namespace testing;
+using namespace ov::intel_gpu;
+
+namespace ov {
+namespace test {
+namespace intel_gpu {
+
+// Regular case, transformation should trigger
+TEST_F(TransformationTestsF, ReduceFCDimensions1) {
+    {  // Input graph: 4D activation -> FC with dequantized u8 weights and a Placeholder bias.
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
+        auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
+        auto scale_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_values);
+        auto bias_stub = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(activation, scaled_weights, bias_stub);
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activation});
+        manager.register_pass<ReduceFCDimensions>();
+    }
+    {  // Expected graph: Reshape to [1, -1, 16] -> FC -> Reshape to [1, 1, -1, 32].
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
+        auto to_3d_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, -1, 16});
+        auto to_3d = std::make_shared<ov::op::v1::Reshape>(activation, to_3d_shape, false);
+        auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
+        auto scale_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_values);
+        auto bias_stub = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(to_3d, scaled_weights, bias_stub);
+        auto to_4d_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 1, -1, 32});
+        auto to_4d = std::make_shared<ov::op::v1::Reshape>(fully_connected, to_4d_shape, false);
+
+        model_ref = std::make_shared<ov::Model>(ov::OutputVector{to_4d}, ov::ParameterVector{activation});
+    }
+}
+
+// Activation shape does not match [1, 1, ?, ?] (second dimension is 4), transformation should not trigger
+TEST_F(TransformationTestsF, ReduceFCDimensions2) {
+    {  // Activation has 4 in its second dimension, so the [1, 1, ?, ?] pattern cannot match.
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 4, -1, 16});
+        auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
+        auto scale_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_values);
+        auto bias_stub = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(activation, scaled_weights, bias_stub);
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activation});
+        manager.register_pass<ReduceFCDimensions>();
+    }
+    {  // Reference is an untouched copy of the input graph.
+        model_ref = model->clone();
+    }
+}
+
+// Bias present, transformation should not trigger
+TEST_F(TransformationTestsF, ReduceFCDimensions3) {
+    {  // Third FC input is a real Constant bias rather than a Placeholder.
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
+        auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{32, 16}, {1});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
+        auto scale_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_values);
+        auto bias_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1, 1, 1, 32}, {1.0});
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(activation, scaled_weights, bias_values);
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activation});
+        manager.register_pass<ReduceFCDimensions>();
+    }
+    {  // Reference is an untouched copy of the input graph.
+        model_ref = model->clone();
+    }
+}
+
+// 3D weight, transformation should not trigger
+TEST_F(TransformationTestsF, ReduceFCDimensions4) {
+    {  // Weights are rank 3, so the 2D weight pattern [?, ?] cannot match.
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
+        auto packed_weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 32, 16}, {1});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(packed_weights, ov::element::f32);
+        auto scale_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4, 32, 1}, {1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_values);
+        auto bias_stub = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(activation, scaled_weights, bias_stub);
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activation});
+        manager.register_pass<ReduceFCDimensions>();
+    }
+    {  // Reference is an untouched copy of the input graph.
+        model_ref = model->clone();
+    }
+}
+
+// Dynamic result dim, transformation should not trigger
+TEST_F(TransformationTestsF, ReduceFCDimensions5) {
+    {  // Weights and scales are Parameters with a dynamic first dimension.
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, -1, 16});
+        auto weights_input = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape{-1, 16});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(weights_input, ov::element::f32);
+        auto scale_input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_input);
+        auto bias_stub = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(activation, scaled_weights, bias_stub);
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activation, weights_input, scale_input});
+        manager.register_pass<ReduceFCDimensions>();
+    }
+    {  // Reference is an untouched copy of the input graph.
+        model_ref = model->clone();
+    }
+}
+
+// Dynamic inner dim, transformation should not trigger
+TEST_F(TransformationTestsF, ReduceFCDimensions6) {
+    {  // The shared inner dimension is dynamic on both the activation and the weights.
+        auto activation = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 1, 10, -1});
+        auto weights_input = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape{32, -1});
+        auto weights_f32 = std::make_shared<ov::op::v0::Convert>(weights_input, ov::element::f32);
+        auto scale_values = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{32, 1}, {1});
+        auto scaled_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scale_values);
+        auto bias_stub = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fully_connected = std::make_shared<ov::intel_gpu::op::FullyConnected>(activation, scaled_weights, bias_stub);
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{fully_connected}, ov::ParameterVector{activation, weights_input});
+        manager.register_pass<ReduceFCDimensions>();
+    }
+    {  // Reference is an untouched copy of the input graph.
+        model_ref = model->clone();
+    }
+}
+
+}  // namespace intel_gpu
+}  // namespace test
+}  // namespace ov