From 7027704e3247e59294b77dd8163786b372338d54 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Mon, 25 May 2026 20:42:37 +0800
Subject: [PATCH 1/9] [GPU] Update concat_no_implicit_gpu_onednn_4d_f16 test
 expectation for batch>1

The batch=2 feature-axis case had is_implicit_concat=false, reflecting the
old prepare_buffer_fusing behaviour where batch > 1 unconditionally disabled
in-place fusing on the oneDNN path. This patch narrows that guard to apply only
when concat_axis_index == 0 (batch-axis concat), so feature-axis concat at
batch=2 is now fused. Update is_implicit_concat to true to match the
corrected behaviour; the existing diff_count == 0 assertion in the test body
verifies the output values are bit-exact.

Signed-off-by: S, Deepak <deepak.s@intel.com>
---
 .../src/graph/graph_optimizer/prepare_buffer_fusing.cpp   | 8 ++++++--
 .../tests/unit/test_cases/concatenation_gpu_test.cpp      | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 18787ee6e404..844ce6055910 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -236,7 +236,7 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
         idx++;
     }
 
-    // Implicit concat for onednn only when use_usm and batch 1.
+    // Implicit concat for onednn only when use_usm; batch-axis concat additionally requires batch 1.
     if (is_onednn_impl) {
         bool use_usm = concat_node.get_program().get_engine().use_unified_shared_memory();
         const layout& concat_out_l = concat_params.get_output_layout();
@@ -246,7 +246,11 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
             // Return true in build time, it will be checked again in runtime
             return true;
         } else {
-            if (concat_out_l.batch() > 1)
+            // Block formats (b_fs_yx_fsv16 etc.) are not contiguous along the batch axis,
+            // so batch-axis (axis=0) concat with batch>1 cannot safely alias buffers.
+            // Feature-axis and other axes are fine — the 64-byte alignment check above is
+            // the correctness gate for those cases.
+            if (concat_axis_index == 0 && concat_out_l.batch() > 1)
                 return false;
             const auto& dims_order = concat_out_l.format.dims_order();
             for (auto dim : dims_order) {
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp
index 7c35ad8941ad..27d0ec7859a8 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp
@@ -2235,7 +2235,7 @@ INSTANTIATE_TEST_SUITE_P(smoke,
                         concat_no_implicit_gpu_onednn_4d_f16,
                         ::testing::Values(
                             TestParamType_implicit_concat(1, { 16 }, 2, 2, format::b_fs_yx_fsv16, true, false),
-                            TestParamType_implicit_concat(2, { 16 }, 2, 2, format::b_fs_yx_fsv16, false, false)
+                            TestParamType_implicit_concat(2, { 16 }, 2, 2, format::b_fs_yx_fsv16, true, false)
                         ),
                         concat_gpu_implicit::PrintToStringParamName);
 

From fea3740d811d58317e3a179d3e31b087e8b28308 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Mon, 25 May 2026 20:42:37 +0800
Subject: [PATCH 2/9] [GPU] Improve prepare_buffer_fusing onednn concat test
 coverage

Add three oneDNN concat tests that check output values element-wise instead
of relying on NaN/Inf output checks. The batch>1 test compares against a
reference network running an explicit (non-fused) concat over the same
random inputs. This catches buffer aliasing bugs where the implicit path
produces numerically valid but incorrect values.

The batch>1 test uses a two-network implicit-vs-explicit comparison
to avoid hard-coding layout assumptions for block formats at batch>1. The
multi-user tests add force_implementations to pin the oneDNN fusing code
path regardless of the runtime heuristic.

Signed-off-by: S, Deepak <deepak.s@intel.com>
---
 .../passes/prepare_buffer_fusing_test.cpp     | 266 ++++++++++++++++++
 1 file changed, 266 insertions(+)

diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index c64bffc5ee54..41bb20b4a8f4 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1650,6 +1650,272 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_static) {
         ASSERT_EQ(ref_output[x], output_ptr[x]);
     }
 }
+// Feature-axis oneDNN concat with batch>1 must now fuse (zero-copy, in-place buffer).
+// Before the fix: concat_out_l.batch() > 1 returned false for all axes in the oneDNN static path.
+// After the fix: the guard is scoped to batch-axis (axis=0) only; feature-axis (axis=1) at
+// batch>1 goes through the existing 64-byte alignment gate and may be optimized.
+//
+// Three inputs with non-uniform feature counts and non-square spatial dimensions stress the
+// alignment gate and the buffer-offset arithmetic more than symmetric shapes would.
+//
+// Run both an implicit (optimize_data=true) and an explicit (optimize_data=false) network over
+// the same random inputs and assert element-wise equality. This catches buffer aliasing bugs
+// that produce valid-looking floats but incorrect values without requiring hardcoded references.
+TEST(prepare_buffer_fusing, in_place_onednn_concat_static_batch_gt1) {
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;
+
+    // Three inputs with batch=3 and non-uniform feature counts [16, 32, 16] at spatial 2×3.
+    // Previously all were blocked by the unconditional batch>1 guard on feature-axis concat.
+    auto in_layout1 = layout{ ov::PartialShape{3, 16, 2, 3}, data_types::f32, format::bfyx };
+    auto in_layout2 = layout{ ov::PartialShape{3, 32, 2, 3}, data_types::f32, format::bfyx };
+    auto in_layout3 = layout{ ov::PartialShape{3, 16, 2, 3}, data_types::f32, format::bfyx };
+
+    auto build_topology = [](bool use_block_format) {
+        auto fmt = use_block_format ? format::b_fs_yx_fsv16 : format::bfyx;
+        topology topo;
+        topo.add(input_layout("input1", layout{ ov::PartialShape{3, 16, 2, 3}, data_types::f32, format::bfyx }));
+        topo.add(input_layout("input2", layout{ ov::PartialShape{3, 32, 2, 3}, data_types::f32, format::bfyx }));
+        topo.add(input_layout("input3", layout{ ov::PartialShape{3, 16, 2, 3}, data_types::f32, format::bfyx }));
+        topo.add(reorder("reorder1", input_info("input1"), fmt, data_types::f16));
+        topo.add(reorder("reorder2", input_info("input2"), fmt, data_types::f16));
+        topo.add(reorder("reorder3", input_info("input3"), fmt, data_types::f16));
+        topo.add(concatenation("concat", { input_info("reorder1"), input_info("reorder2"), input_info("reorder3") }, 1));
+        topo.add(reorder("output", input_info("concat"), format::bfyx, data_types::f32));
+        return topo;
+    };
+
+    auto input_memory1 = engine.allocate_memory(in_layout1);
+    auto input_memory2 = engine.allocate_memory(in_layout2);
+    auto input_memory3 = engine.allocate_memory(in_layout3);
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto vals1 = rg.generate_random_1d<float>(3 * 16 * 2 * 3, -1, 1);
+    auto vals2 = rg.generate_random_1d<float>(3 * 32 * 2 * 3, -1, 1);
+    auto vals3 = rg.generate_random_1d<float>(3 * 16 * 2 * 3, -1, 1);
+    set_values(input_memory1, vals1);
+    set_values(input_memory2, vals2);
+    set_values(input_memory3, vals3);
+
+    // implicit concat — runtime selects the preferred impl (onednn on immad devices) for the path
+    ExecutionConfig cfg_implicit = get_test_default_config(engine);
+    cfg_implicit.set_property(ov::intel_gpu::optimize_data(true));
+    cfg_implicit.set_property(ov::intel_gpu::allow_new_shape_infer(false));
+    network net_implicit(engine, build_topology(true), cfg_implicit);
+    net_implicit.set_input_data("input1", input_memory1);
+    net_implicit.set_input_data("input2", input_memory2);
+    net_implicit.set_input_data("input3", input_memory3);
+    auto out_implicit = net_implicit.execute();
+
+    const auto& concat_node = net_implicit.get_primitive("concat")->get_node();
+    ASSERT_TRUE(concat_node.can_be_optimized());
+
+    // explicit concat — reference without in-place optimisation (bfyx to avoid format aliasing)
+    ExecutionConfig cfg_explicit = get_test_default_config(engine);
+    cfg_explicit.set_property(ov::intel_gpu::optimize_data(false));
+    network net_explicit(engine, build_topology(false), cfg_explicit);
+    net_explicit.set_input_data("input1", input_memory1);
+    net_explicit.set_input_data("input2", input_memory2);
+    net_explicit.set_input_data("input3", input_memory3);
+    auto out_explicit = net_explicit.execute();
+
+    auto mem_implicit = out_implicit.at("output").get_memory();
+    auto mem_explicit = out_explicit.at("output").get_memory();
+    cldnn::mem_lock<float> ptr_implicit(mem_implicit, get_test_stream());
+    cldnn::mem_lock<float> ptr_explicit(mem_explicit, get_test_stream());
+
+    ASSERT_EQ(ptr_implicit.size(), ptr_explicit.size());
+    for (size_t i = 0; i < ptr_implicit.size(); i++)
+        ASSERT_NEAR(ptr_implicit[i], ptr_explicit[i], 1e-3f) << "mismatch at index " << i;
+}
+
+// Predecessor with 3 users (concat + 2 safe-type activation nodes) must now be allowed
+// to fuse. Before the fix: get_users().size() > 2 blocked fusing unconditionally.
+// After the fix: the type-based reads_padded_input_safely check allows safe-type
+// multi-user predecessors.
+TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;
+
+    // shared_in → reorder (shared_r) feeds 3 users: concat, act1, act2 — all safe types.
+    // Use f32 inputs so reorder(f32→f16) is non-trivial and not optimized out by the pass.
+    auto in_layout  = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
+    auto in_layout2 = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
+
+    topology topology;
+    topology.add(input_layout("shared_in", in_layout));
+    topology.add(input_layout("other_in",  in_layout2));
+    // shared_r: the predecessor node that will have 3 users
+    topology.add(reorder("shared_r", input_info("shared_in"), format::bfyx, data_types::f16));
+    topology.add(reorder("other_r",  input_info("other_in"),  format::bfyx, data_types::f16));
+
+    // User 1 of shared_r: concat (the node we want to fuse)
+    topology.add(concatenation("concat", { input_info("shared_r"), input_info("other_r") }, 1));
+    // User 2 of shared_r: activation relu (safe type — in reads_padded_input_safely, never reads padding)
+    topology.add(activation("act1", input_info("shared_r"), activation_func::relu));
+    // User 3 of shared_r: activation clamp (safe type)
+    topology.add(activation("act2", input_info("shared_r"), activation_func::clamp, {0.0f, 1.0f}));
+
+    topology.add(reorder("out_concat", input_info("concat"), format::bfyx, data_types::f32));
+    topology.add(reorder("out_act1",   input_info("act1"),   format::bfyx, data_types::f32));
+    topology.add(reorder("out_act2",   input_info("act2"),   format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(false));
+    // Force onednn on the concat predecessors so the onednn fusing path is exercised.
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{
+        {"shared_r", ov::intel_gpu::ImplementationDesc{format::any, "", impl_types::onednn}},
+        {"other_r",  ov::intel_gpu::ImplementationDesc{format::any, "", impl_types::onednn}},
+    }));
+    network network(engine, topology, config);
+
+    auto input_memory  = engine.allocate_memory(in_layout);
+    auto input_memory2 = engine.allocate_memory(in_layout2);
+    std::vector<float> d1(16 * 4 * 4, 1.0f);
+    std::vector<float> d2(16 * 4 * 4, 2.0f);
+    set_values(input_memory,  d1);
+    set_values(input_memory2, d2);
+
+    network.set_input_data("shared_in", input_memory);
+    network.set_input_data("other_in",  input_memory2);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> output;
+    EXPECT_NO_THROW(output = network.execute());
+
+    const auto& concat_node = network.get_primitive("concat")->get_node();
+    ASSERT_TRUE(concat_node.can_be_optimized());
+
+    // out_concat = [shared_r(1.0f) | other_r(2.0f)] along feature axis
+    auto out_concat_mem = output.at("out_concat").get_memory();
+    cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
+    ASSERT_EQ(concat_ptr.size(), 512u);
+    for (size_t i = 0; i < 256; i++)
+        ASSERT_EQ(concat_ptr[i], 1.0f) << "out_concat mismatch at index " << i;
+    for (size_t i = 256; i < 512; i++)
+        ASSERT_EQ(concat_ptr[i], 2.0f) << "out_concat mismatch at index " << i;
+
+    // out_act1 = relu(shared_r) = relu(1.0f) = 1.0f
+    auto out_act1_mem = output.at("out_act1").get_memory();
+    cldnn::mem_lock<float> act1_ptr(out_act1_mem, get_test_stream());
+    for (size_t i = 0; i < act1_ptr.size(); i++)
+        ASSERT_EQ(act1_ptr[i], 1.0f) << "out_act1 mismatch at index " << i;
+
+    // out_act2 = clamp(shared_r, 0, 1) = clamp(1.0f, 0, 1) = 1.0f
+    auto out_act2_mem = output.at("out_act2").get_memory();
+    cldnn::mem_lock<float> act2_ptr(out_act2_mem, get_test_stream());
+    for (size_t i = 0; i < act2_ptr.size(); i++)
+        ASSERT_EQ(act2_ptr[i], 1.0f) << "out_act2 mismatch at index " << i;
+}
+
+// Verifies that convolution is recognised as a safe-reader type in
+// reads_padded_input_safely, so that implicit concat fusing still applies
+// when the shared predecessor has a conv as one of its non-concat users.
+//
+// Topology (shared_r has 3 users):
+//   shared_in(f32) → shared_r(f32→f16, oneDNN) ─┬→ concat ─→ out_concat
+//   other_in(f32)  → other_r(f32→f16)           ─┘
+//                                     └→ conv(bfyx, reads padded safely)  → out_conv
+//                                     └→ act1                             → out_act1
+//
+// Both conv and act1 are in reads_padded_input_safely, so the multi-user guard
+// allows fusing. shared_r is forced to oneDNN so the oneDNN predecessor path
+// is exercised. Conv is forced to format::bfyx so reorder_inputs does not
+// insert a format-conversion reorder between shared_r and conv (a reorder
+// would be outside reads_padded_input_safely and would incorrectly block fusing).
+TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;
+
+    // f32 inputs so shared_r and other_r are genuinely non-trivial reorders
+    // (f32→f16) and are not eliminated by remove_redundant_reorders.
+    auto in_layout  = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
+    auto in_layout2 = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
+
+    // 1×1 identity conv weights (16 output channels, 16 input channels).
+    auto weights_layout = layout{ ov::PartialShape{16, 16, 1, 1}, data_types::f16, format::bfyx };
+    auto weights_mem = engine.allocate_memory(weights_layout);
+    std::vector<ov::float16> wdata(16 * 16, ov::float16(0.f));
+    for (int i = 0; i < 16; ++i)
+        wdata[i * 16 + i] = ov::float16(1.f);
+    set_values(weights_mem, wdata);
+
+    topology topology;
+    topology.add(input_layout("shared_in", in_layout));
+    topology.add(input_layout("other_in",  in_layout2));
+    topology.add(data("conv_w", weights_mem));
+
+    // shared_r is the predecessor node with 3 users (concat, conv, act1).
+    topology.add(reorder("shared_r", input_info("shared_in"), format::bfyx, data_types::f16));
+    topology.add(reorder("other_r",  input_info("other_in"),  format::bfyx, data_types::f16));
+
+    // User 1 of shared_r: feature-axis concat (the fusing candidate).
+    topology.add(concatenation("concat", { input_info("shared_r"), input_info("other_r") }, 1));
+    // User 2 of shared_r: 1×1 identity conv with bfyx output (validates that conv
+    // reads from the padded buffer by logical coordinates, not raw offsets).
+    topology.add(convolution("conv", input_info("shared_r"), "conv_w", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
+    // User 3 of shared_r: activation (always safe).
+    topology.add(activation("act1", input_info("shared_r"), activation_func::relu));
+
+    topology.add(reorder("out_concat", input_info("concat"), format::bfyx, data_types::f32));
+    topology.add(reorder("out_conv",   input_info("conv"),   format::bfyx, data_types::f32));
+    topology.add(reorder("out_act1",   input_info("act1"),   format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(false));
+    // Force shared_r (predecessor) to oneDNN so the oneDNN fusing path is exercised.
+    // Force conv to bfyx so reorder_inputs does not insert a format-conversion reorder
+    // between shared_r (bfyx output) and conv; without this, the reorder would not be
+    // in reads_padded_input_safely and would incorrectly block fusing.
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{
+        {"shared_r", ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn}},
+        {"conv",     ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::ocl}},
+    }));
+    network network(engine, topology, config);
+
+    auto input_memory  = engine.allocate_memory(in_layout);
+    auto input_memory2 = engine.allocate_memory(in_layout2);
+    std::vector<float> d1(16 * 4 * 4, 1.0f);
+    std::vector<float> d2(16 * 4 * 4, 2.0f);
+    set_values(input_memory,  d1);
+    set_values(input_memory2, d2);
+
+    network.set_input_data("shared_in", input_memory);
+    network.set_input_data("other_in",  input_memory2);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> output;
+    EXPECT_NO_THROW(output = network.execute());
+
+    // Confirm concat was fused: shared_r has 3 users (concat, conv, act1), all safe types,
+    // so the multi-user guard allows implicit concat optimization.
+    const auto& concat_node = network.get_primitive("concat")->get_node();
+    ASSERT_TRUE(concat_node.can_be_optimized());
+
+    // out_concat: [shared_r(1.0f) | other_r(2.0f)] along feature axis.
+    auto out_concat_mem = output.at("out_concat").get_memory();
+    cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
+    ASSERT_EQ(concat_ptr.size(), 512u);
+    for (size_t i = 0; i < 256; i++)
+        ASSERT_NEAR(concat_ptr[i], 1.0f, 1e-2f) << "out_concat mismatch at index " << i;
+    for (size_t i = 256; i < 512; i++)
+        ASSERT_NEAR(concat_ptr[i], 2.0f, 1e-2f) << "out_concat mismatch at index " << i;
+
+    // out_conv: identity conv(shared_r) = 1.0f — confirms that conv reads from the
+    // padded predecessor buffer by logical tensor coordinates, not raw byte offsets.
+    auto out_conv_mem = output.at("out_conv").get_memory();
+    cldnn::mem_lock<float> conv_ptr(out_conv_mem, get_test_stream());
+    for (size_t i = 0; i < conv_ptr.size(); i++)
+        ASSERT_NEAR(conv_ptr[i], 1.0f, 1e-2f) << "out_conv mismatch at index " << i;
+
+    // out_act1: relu(shared_r) = relu(1.0f) = 1.0f.
+    auto out_act1_mem = output.at("out_act1").get_memory();
+    cldnn::mem_lock<float> act1_ptr(out_act1_mem, get_test_stream());
+    for (size_t i = 0; i < act1_ptr.size(); i++)
+        ASSERT_NEAR(act1_ptr[i], 1.0f, 1e-2f) << "out_act1 mismatch at index " << i;
+}
 #endif  // ENABLE_ONEDNN_FOR_GPU
 
 TEST(prepare_buffer_fusing, in_place_concat_with_fsv32_to_fsv16_reorder_regression) {

From 418cf021527482713c18918c378246e7838cda6e Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Mon, 25 May 2026 20:42:37 +0800
Subject: [PATCH 3/9] [GPU] Use narrower type set for multi-user concat
 predecessor safety check

The available_pred lambda guards output-padding support: whether a node
type's kernel can write output with buffer gaps for in-place aliasing. The
multi-user predecessor check in concat_in_place_optimization reused that
lambda to validate non-concat consumers, but the question there is different:
whether the consumer's kernel reads input by coordinate and therefore skips
padding correctly.

Introduce reads_padded_input_safely with that explicit semantic and use it
for the multi-user consumer check. The list is narrower than available_pred:
reorder and permute are excluded because some of their implementations copy
over raw buffer byte ranges and would include padding bytes. The types
retained (convolution, deconvolution, pooling, activation, eltwise, quantize)
address input by explicit tensor coordinate in all GPU kernel variants.

Signed-off-by: S, Deepak <deepak.s@intel.com>
---
 .../graph_optimizer/prepare_buffer_fusing.cpp | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 844ce6055910..7225bce0af4b 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -76,6 +76,30 @@ auto available_pred = [](const program_node& input) {
     return true;
 };
 
+// Primitives that read input by explicit tensor coordinate and therefore correctly skip
+// padding on the input side. reorder and permute are excluded because some of their
+// implementations copy over raw buffer byte ranges and would include padding bytes.
+//
+// A can_be_optimized node is excluded because it is transparent and shares its buffer
+// with downstream consumers whose types are not checked.
+//
+// Eltwise is safe only when broadcast_spec is NONE/EXPLICIT, i.e., when both inputs
+// were declared equal-shape at graph construction and no dimension is expanded
+// over the padded region of the padded predecessor.
+auto reads_padded_input_safely = [](const program_node& user) {
+    if (user.can_be_optimized())
+        return false;
+    if (user.is_type<eltwise>()) {
+        auto broadcast_type = user.as<eltwise>().get_primitive()->broadcast_spec.m_type;
+        return broadcast_type == ov::op::AutoBroadcastType::NONE;
+    }
+    return user.is_type<convolution>()   ||
+           user.is_type<deconvolution>() ||
+           user.is_type<pooling>()       ||
+           user.is_type<activation>()    ||
+           user.is_type<quantize>();
+};
+
 bool concat_in_place_optimization::match(const program_node& concat_node,
                                          kernel_impl_params& concat_params,
                                          std::vector<kernel_impl_params>& pred_params,
@@ -148,9 +172,16 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
         // TODO: handle optimized reshape
         if (pred.first->is_type<reshape>() && pred.first->can_be_optimized())
             return false;
-        // TODO: Investigate if this condition is needed
-        if (pred.first->get_users().size() > 2)
-            return false;
+        // A predecessor with more than two users can still be fused if all non-concat
+        // users correctly handle a padded input buffer.
+        if (pred.first->get_users().size() > 2) {
+            for (const auto& user : pred.first->get_users()) {
+                if (user->is_type<concatenation>())
+                    continue;
+                if (!reads_padded_input_safely(*user))
+                    return false;
+            }
+        }
 
        // Check that input isn't optimized out concatenation along different axis.
         if (pred.first->is_type<concatenation>() && pred.first->can_be_optimized()) {

From 2970ebc89a5f8c8879d28b561537b7ee4e5136d7 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Tue, 2 Jun 2026 22:36:01 +0800
Subject: [PATCH 4/9] [GPU] Use natural-number test data in
 multi_user_safe_type concat test

Replace uniform constant fill (1.0f / 2.0f) with sequential 0..N-1 values so
any buffer-overlap or aliasing regression produces a wrong value at a specific
index rather than going undetected.

Update output assertions to match: concat halves checked element-wise, relu
output equals input, clamp output is 0.0f at index 0 and 1.0f elsewhere.

Signed-off-by: S, Deepak <deepak.s@intel.com>
---
 .../passes/prepare_buffer_fusing_test.cpp     | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 41bb20b4a8f4..82b785bd8f1f 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1773,8 +1773,11 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
 
     auto input_memory  = engine.allocate_memory(in_layout);
     auto input_memory2 = engine.allocate_memory(in_layout2);
-    std::vector<float> d1(16 * 4 * 4, 1.0f);
-    std::vector<float> d2(16 * 4 * 4, 2.0f);
+    // Natural-number sequences (0, 1, 2 … N-1) expose buffer-overlap or aliasing bugs at a specific
+    // position. NOTE(review): d1 and d2 are identical, so swapped concat inputs would go undetected.
+    const size_t N = 16 * 4 * 4;  // 256
+    std::vector<float> d1(N), d2(N);
+    for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(i); }
     set_values(input_memory,  d1);
     set_values(input_memory2, d2);
 
@@ -1787,26 +1790,27 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     const auto& concat_node = network.get_primitive("concat")->get_node();
     ASSERT_TRUE(concat_node.can_be_optimized());
 
-    // out_concat = [shared_r(1.0f) | other_r(2.0f)] along feature axis
+    // out_concat = [shared_r(0..N-1) | other_r(0..N-1)] along feature axis
     auto out_concat_mem = output.at("out_concat").get_memory();
     cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
-    ASSERT_EQ(concat_ptr.size(), 512u);
-    for (size_t i = 0; i < 256; i++)
-        ASSERT_EQ(concat_ptr[i], 1.0f) << "out_concat mismatch at index " << i;
-    for (size_t i = 256; i < 512; i++)
-        ASSERT_EQ(concat_ptr[i], 2.0f) << "out_concat mismatch at index " << i;
+    ASSERT_EQ(concat_ptr.size(), 2 * N);
+    for (size_t i = 0; i < N; i++)
+        ASSERT_NEAR(concat_ptr[i],     static_cast<float>(i), 1e-3f) << "out_concat first half mismatch at index " << i;
+    for (size_t i = 0; i < N; i++)
+        ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i), 1e-3f) << "out_concat second half mismatch at index " << i;
 
-    // out_act1 = relu(shared_r) = relu(1.0f) = 1.0f
+    // out_act1 = relu(shared_r) = relu(0..N-1) = 0..N-1 (all non-negative)
     auto out_act1_mem = output.at("out_act1").get_memory();
     cldnn::mem_lock<float> act1_ptr(out_act1_mem, get_test_stream());
     for (size_t i = 0; i < act1_ptr.size(); i++)
-        ASSERT_EQ(act1_ptr[i], 1.0f) << "out_act1 mismatch at index " << i;
+        ASSERT_NEAR(act1_ptr[i], static_cast<float>(i), 1e-3f) << "out_act1 mismatch at index " << i;
 
-    // out_act2 = clamp(shared_r, 0, 1) = clamp(1.0f, 0, 1) = 1.0f
+    // out_act2 = clamp(shared_r, 0, 1): index 0 → 0.0f, indices 1..N-1 → 1.0f
     auto out_act2_mem = output.at("out_act2").get_memory();
     cldnn::mem_lock<float> act2_ptr(out_act2_mem, get_test_stream());
-    for (size_t i = 0; i < act2_ptr.size(); i++)
-        ASSERT_EQ(act2_ptr[i], 1.0f) << "out_act2 mismatch at index " << i;
+    ASSERT_NEAR(act2_ptr[0], 0.0f, 1e-3f) << "out_act2 mismatch at index 0";
+    for (size_t i = 1; i < act2_ptr.size(); i++)
+        ASSERT_NEAR(act2_ptr[i], 1.0f, 1e-3f) << "out_act2 mismatch at index " << i;
 }
 
 // Verifies that convolution is recognised as a safe-reader type in

From f1bdbc73cf7b63f16f9d5d34b6f5b875df8ba683 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Tue, 2 Jun 2026 22:56:25 +0800
Subject: [PATCH 5/9] [GPU] Use natural-number test data in conv_as_user concat
 test

Replace uniform constant fill (1.0f / 2.0f) with sequential 0..N-1 values
so any buffer-overlap or aliasing bug produces a mismatch at a specific index
rather than going undetected.

Update output assertions: concat halves and conv output checked element-wise
against float(i); relu output is identity for all non-negative inputs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../passes/prepare_buffer_fusing_test.cpp     | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 82b785bd8f1f..d9ac48228fe4 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1882,8 +1882,11 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
 
     auto input_memory  = engine.allocate_memory(in_layout);
     auto input_memory2 = engine.allocate_memory(in_layout2);
-    std::vector<float> d1(16 * 4 * 4, 1.0f);
-    std::vector<float> d2(16 * 4 * 4, 2.0f);
+    // Natural-number sequences (0, 1, 2 … N-1) expose buffer-overlap or aliasing bugs at a specific
+    // position. NOTE(review): d1 and d2 are identical, so swapped concat inputs would go undetected.
+    const size_t N = 16 * 4 * 4;  // 256
+    std::vector<float> d1(N), d2(N);
+    for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(i); }
     set_values(input_memory,  d1);
     set_values(input_memory2, d2);
 
@@ -1898,27 +1901,27 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     const auto& concat_node = network.get_primitive("concat")->get_node();
     ASSERT_TRUE(concat_node.can_be_optimized());
 
-    // out_concat: [shared_r(1.0f) | other_r(2.0f)] along feature axis.
+    // out_concat: [shared_r(0..N-1) | other_r(0..N-1)] along feature axis.
     auto out_concat_mem = output.at("out_concat").get_memory();
     cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
-    ASSERT_EQ(concat_ptr.size(), 512u);
-    for (size_t i = 0; i < 256; i++)
-        ASSERT_NEAR(concat_ptr[i], 1.0f, 1e-2f) << "out_concat mismatch at index " << i;
-    for (size_t i = 256; i < 512; i++)
-        ASSERT_NEAR(concat_ptr[i], 2.0f, 1e-2f) << "out_concat mismatch at index " << i;
+    ASSERT_EQ(concat_ptr.size(), 2 * N);
+    for (size_t i = 0; i < N; i++)
+        ASSERT_NEAR(concat_ptr[i],     static_cast<float>(i), 1e-2f) << "out_concat first half mismatch at index " << i;
+    for (size_t i = 0; i < N; i++)
+        ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i), 1e-2f) << "out_concat second half mismatch at index " << i;
 
-    // out_conv: identity conv(shared_r) = 1.0f — confirms that conv reads from the
+    // out_conv: identity conv(shared_r) = 0..N-1 — confirms that conv reads from the
     // padded predecessor buffer by logical tensor coordinates, not raw byte offsets.
     auto out_conv_mem = output.at("out_conv").get_memory();
     cldnn::mem_lock<float> conv_ptr(out_conv_mem, get_test_stream());
     for (size_t i = 0; i < conv_ptr.size(); i++)
-        ASSERT_NEAR(conv_ptr[i], 1.0f, 1e-2f) << "out_conv mismatch at index " << i;
+        ASSERT_NEAR(conv_ptr[i], static_cast<float>(i), 1e-2f) << "out_conv mismatch at index " << i;
 
-    // out_act1: relu(shared_r) = relu(1.0f) = 1.0f.
+    // out_act1: relu(shared_r) = 0..N-1 (all non-negative, relu is identity).
     auto out_act1_mem = output.at("out_act1").get_memory();
     cldnn::mem_lock<float> act1_ptr(out_act1_mem, get_test_stream());
     for (size_t i = 0; i < act1_ptr.size(); i++)
-        ASSERT_NEAR(act1_ptr[i], 1.0f, 1e-2f) << "out_act1 mismatch at index " << i;
+        ASSERT_NEAR(act1_ptr[i], static_cast<float>(i), 1e-2f) << "out_act1 mismatch at index " << i;
 }
 #endif  // ENABLE_ONEDNN_FOR_GPU
 

From fa675279f66ab8a1eb7f9c1f205c1e6e8a277072 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Tue, 2 Jun 2026 23:08:49 +0800
Subject: [PATCH 6/9] [GPU] Use oneDNN conv in multi_user_conv_as_user concat
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Force both shared_r and conv to b_fs_yx_fsv16 + oneDNN. Since their formats
already match, reorder_inputs inserts no intermediate reorder, so oneDNN conv
reads directly from the padded predecessor buffer — confirming the safety of
reads_padded_input_safely for oneDNN convolution.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../passes/prepare_buffer_fusing_test.cpp     | 41 +++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index d9ac48228fe4..16f5d1c00bdb 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1813,21 +1813,19 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
         ASSERT_NEAR(act2_ptr[i], 1.0f, 1e-3f) << "out_act2 mismatch at index " << i;
 }
 
-// Verifies that convolution is recognised as a safe-reader type in
+// Verifies that oneDNN convolution is recognised as a safe-reader type in
 // reads_padded_input_safely, so that implicit concat fusing still applies
-// when the shared predecessor has a conv as one of its non-concat users.
+// when the shared predecessor has an oneDNN conv as one of its non-concat users.
 //
 // Topology (shared_r has 3 users):
-//   shared_in(f32) → shared_r(f32→f16, oneDNN) ─┬→ concat ─→ out_concat
-//   other_in(f32)  → other_r(f32→f16)           ─┘
-//                                     └→ conv(bfyx, reads padded safely)  → out_conv
-//                                     └→ act1                             → out_act1
+//   shared_in(f32) → shared_r(f32→f16, b_fs_yx_fsv16, oneDNN) ─┬→ concat ─→ out_concat
+//   other_in(f32)  → other_r(f32→f16)                          ─┘
+//                                     └→ conv(b_fs_yx_fsv16, oneDNN, reads padded safely) → out_conv
+//                                     └→ act1                                              → out_act1
 //
-// Both conv and act1 are in reads_padded_input_safely, so the multi-user guard
-// allows fusing. shared_r is forced to oneDNN so the oneDNN predecessor path
-// is exercised. Conv is forced to format::bfyx so reorder_inputs does not
-// insert a format-conversion reorder between shared_r and conv (a reorder
-// would be outside reads_padded_input_safely and would incorrectly block fusing).
+// Both conv and act1 are in reads_padded_input_safely. shared_r and conv are both
+// forced to b_fs_yx_fsv16 so reorder_inputs finds no format mismatch and does not
+// insert an intermediate reorder — oneDNN conv reads directly from the padded buffer.
 TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
@@ -1852,13 +1850,15 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     topology.add(data("conv_w", weights_mem));
 
     // shared_r is the predecessor node that will have 3 users after fusing.
-    topology.add(reorder("shared_r", input_info("shared_in"), format::bfyx, data_types::f16));
-    topology.add(reorder("other_r",  input_info("other_in"),  format::bfyx, data_types::f16));
+    // Declared as b_fs_yx_fsv16 to match the oneDNN conv preferred input format,
+    // so reorder_inputs does not insert a format-conversion reorder in between.
+    topology.add(reorder("shared_r", input_info("shared_in"), format::b_fs_yx_fsv16, data_types::f16));
+    topology.add(reorder("other_r",  input_info("other_in"),  format::b_fs_yx_fsv16, data_types::f16));
 
     // User 1 of shared_r: feature-axis concat (the fusing candidate).
     topology.add(concatenation("concat", { input_info("shared_r"), input_info("other_r") }, 1));
-    // User 2 of shared_r: 1×1 identity conv with bfyx output (validates that conv
-    // reads from the padded buffer by logical coordinates, not raw offsets).
+    // User 2 of shared_r: 1×1 identity oneDNN conv — validates that oneDNN conv
+    // reads from the padded buffer by logical tensor coordinates, not raw byte offsets.
     topology.add(convolution("conv", input_info("shared_r"), "conv_w", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
     // User 3 of shared_r: activation (always safe).
     topology.add(activation("act1", input_info("shared_r"), activation_func::relu));
@@ -1870,13 +1870,12 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     ExecutionConfig config = get_test_default_config(engine);
     config.set_property(ov::intel_gpu::optimize_data(true));
     config.set_property(ov::intel_gpu::allow_new_shape_infer(false));
-    // Force shared_r (predecessor) to oneDNN so the oneDNN fusing path is exercised.
-    // Force conv to bfyx so reorder_inputs does not insert a format-conversion reorder
-    // between shared_r (bfyx output) and conv; without this, the reorder would not be
-    // in reads_padded_input_safely and would incorrectly block fusing.
+    // Both shared_r and conv are forced to b_fs_yx_fsv16 + oneDNN. Since their
+    // formats already match, reorder_inputs inserts no intermediate reorder —
+    // oneDNN conv reads the padded buffer directly, exercising the safety guarantee.
     config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{
-        {"shared_r", ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn}},
-        {"conv",     ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::ocl}},
+        {"shared_r", ov::intel_gpu::ImplementationDesc{format::b_fs_yx_fsv16, "", impl_types::onednn}},
+        {"conv",     ov::intel_gpu::ImplementationDesc{format::b_fs_yx_fsv16, "", impl_types::onednn}},
     }));
     network network(engine, topology, config);
 

From f9dfa1d4b1aaa157c531e0bf2a2fa0f19132de14 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Tue, 2 Jun 2026 23:42:45 +0800
Subject: [PATCH 7/9] [GPU] Replace clamp with abs in multi_user_safe_type
 concat test

abs preserves the full natural-number range of the input (0..N-1),
making buffer-overlap bugs immediately visible at every index.
clamp(0,1) was masking all values above 1, reducing detectability.

Also swaps concat input order (other_r first) and offsets d2 by 512
so the two halves of the concat output are unambiguously distinguishable.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../passes/prepare_buffer_fusing_test.cpp     | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 16f5d1c00bdb..7dd37896dee1 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1750,12 +1750,12 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     topology.add(reorder("shared_r", input_info("shared_in"), format::bfyx, data_types::f16));
     topology.add(reorder("other_r",  input_info("other_in"),  format::bfyx, data_types::f16));
 
-    // User 1 of shared_r: concat (the node we want to fuse)
-    topology.add(concatenation("concat", { input_info("shared_r"), input_info("other_r") }, 1));
+    // User 1 of shared_r: concat (the node we want to fuse) — other_r first so shared_r is in the second slot
+    topology.add(concatenation("concat", { input_info("other_r"), input_info("shared_r") }, 1));
     // User 2 of shared_r: activation relu (safe type — in available_pred, never reads padding)
     topology.add(activation("act1", input_info("shared_r"), activation_func::relu));
-    // User 3 of shared_r: activation clamp (safe type)
-    topology.add(activation("act2", input_info("shared_r"), activation_func::clamp, {0.0f, 1.0f}));
+    // User 3 of shared_r: activation abs (safe type — preserves full value range)
+    topology.add(activation("act2", input_info("shared_r"), activation_func::abs));
 
     topology.add(reorder("out_concat", input_info("concat"), format::bfyx, data_types::f32));
     topology.add(reorder("out_act1",   input_info("act1"),   format::bfyx, data_types::f32));
@@ -1773,11 +1773,11 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
 
     auto input_memory  = engine.allocate_memory(in_layout);
     auto input_memory2 = engine.allocate_memory(in_layout2);
-    // Natural-number sequences (0, 1, 2 … N-1) so any buffer-overlap or aliasing bug
-    // produces a wrong value at a specific position rather than going unnoticed.
+    // Natural-number sequences — d1 starts at 0, d2 starts at 512 so buffers are
+    // distinguishable: any overlap/aliasing bug produces a clearly wrong value.
     const size_t N = 16 * 4 * 4;  // 256
     std::vector<float> d1(N), d2(N);
-    for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(i); }
+    for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(512 + i); }
     set_values(input_memory,  d1);
     set_values(input_memory2, d2);
 
@@ -1790,14 +1790,15 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     const auto& concat_node = network.get_primitive("concat")->get_node();
     ASSERT_TRUE(concat_node.can_be_optimized());
 
-    // out_concat = [shared_r(0..N-1) | other_r(0..N-1)] along feature axis
+    // out_concat = [other_r(512..512+N-1) | shared_r(0..N-1)] along feature axis
+    // (other_r is the first concat input after the order swap)
     auto out_concat_mem = output.at("out_concat").get_memory();
     cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
     ASSERT_EQ(concat_ptr.size(), 2 * N);
     for (size_t i = 0; i < N; i++)
-        ASSERT_NEAR(concat_ptr[i],     static_cast<float>(i), 1e-3f) << "out_concat first half mismatch at index " << i;
+        ASSERT_NEAR(concat_ptr[i],     static_cast<float>(512 + i), 1e-3f) << "out_concat first half mismatch at index " << i;
     for (size_t i = 0; i < N; i++)
-        ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i), 1e-3f) << "out_concat second half mismatch at index " << i;
+        ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i),       1e-3f) << "out_concat second half mismatch at index " << i;
 
     // out_act1 = relu(shared_r) = relu(0..N-1) = 0..N-1 (all non-negative)
     auto out_act1_mem = output.at("out_act1").get_memory();
@@ -1805,12 +1806,11 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     for (size_t i = 0; i < act1_ptr.size(); i++)
         ASSERT_NEAR(act1_ptr[i], static_cast<float>(i), 1e-3f) << "out_act1 mismatch at index " << i;
 
-    // out_act2 = clamp(shared_r, 0, 1): index 0 → 0.0f, indices 1..N-1 → 1.0f
+    // out_act2 = abs(shared_r) = abs(0..N-1) = 0..N-1 (all non-negative, full range preserved)
     auto out_act2_mem = output.at("out_act2").get_memory();
     cldnn::mem_lock<float> act2_ptr(out_act2_mem, get_test_stream());
-    ASSERT_NEAR(act2_ptr[0], 0.0f, 1e-3f) << "out_act2 mismatch at index 0";
-    for (size_t i = 1; i < act2_ptr.size(); i++)
-        ASSERT_NEAR(act2_ptr[i], 1.0f, 1e-3f) << "out_act2 mismatch at index " << i;
+    for (size_t i = 0; i < act2_ptr.size(); i++)
+        ASSERT_NEAR(act2_ptr[i], static_cast<float>(i), 1e-3f) << "out_act2 mismatch at index " << i;
 }
 
 // Verifies that oneDNN convolution is recognised as a safe-reader type in

From dcc0978ee14f278026f9c01bc8a07ea629d4cd2d Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Tue, 2 Jun 2026 23:47:24 +0800
Subject: [PATCH 8/9] [GPU] Harden conv_as_user concat test with asymmetric
 inputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Swap concat input order (other_r first) so shared_r lands in the
second slot, exercising a more asymmetric padding layout. Offset d2
by 512 so the two concat halves are unambiguously distinguishable —
any buffer aliasing or ordering bug produces a clearly wrong value
rather than going unnoticed when both halves held identical data.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../unit/passes/prepare_buffer_fusing_test.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 7dd37896dee1..971d9a4e738b 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1855,8 +1855,9 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     topology.add(reorder("shared_r", input_info("shared_in"), format::b_fs_yx_fsv16, data_types::f16));
     topology.add(reorder("other_r",  input_info("other_in"),  format::b_fs_yx_fsv16, data_types::f16));
 
-    // User 1 of shared_r: feature-axis concat (the fusing candidate).
-    topology.add(concatenation("concat", { input_info("shared_r"), input_info("other_r") }, 1));
+    // User 1 of shared_r: feature-axis concat (the fusing candidate) — other_r first so
+    // shared_r lands in the second slot, making the padding layout more asymmetric.
+    topology.add(concatenation("concat", { input_info("other_r"), input_info("shared_r") }, 1));
     // User 2 of shared_r: 1×1 identity oneDNN conv — validates that oneDNN conv
     // reads from the padded buffer by logical tensor coordinates, not raw byte offsets.
     topology.add(convolution("conv", input_info("shared_r"), "conv_w", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
@@ -1881,11 +1882,11 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
 
     auto input_memory  = engine.allocate_memory(in_layout);
     auto input_memory2 = engine.allocate_memory(in_layout2);
-    // Natural-number sequences (0, 1, 2 … N-1) so any buffer-overlap or aliasing bug
-    // produces a wrong value at a specific position rather than going unnoticed.
+    // Natural-number sequences — d1 starts at 0, d2 starts at 512 so the two halves of
+    // the concat output are unambiguously distinguishable; aliasing bugs produce clearly wrong values.
     const size_t N = 16 * 4 * 4;  // 256
     std::vector<float> d1(N), d2(N);
-    for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(i); }
+    for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(512 + i); }
     set_values(input_memory,  d1);
     set_values(input_memory2, d2);
 
@@ -1900,14 +1901,15 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     const auto& concat_node = network.get_primitive("concat")->get_node();
     ASSERT_TRUE(concat_node.can_be_optimized());
 
-    // out_concat: [shared_r(0..N-1) | other_r(0..N-1)] along feature axis.
+    // out_concat: [other_r(512..512+N-1) | shared_r(0..N-1)] along feature axis.
+    // (other_r is now the first concat input after the order swap)
     auto out_concat_mem = output.at("out_concat").get_memory();
     cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
     ASSERT_EQ(concat_ptr.size(), 2 * N);
     for (size_t i = 0; i < N; i++)
-        ASSERT_NEAR(concat_ptr[i],     static_cast<float>(i), 1e-2f) << "out_concat first half mismatch at index " << i;
+        ASSERT_NEAR(concat_ptr[i],     static_cast<float>(512 + i), 1e-2f) << "out_concat first half mismatch at index " << i;
     for (size_t i = 0; i < N; i++)
-        ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i), 1e-2f) << "out_concat second half mismatch at index " << i;
+        ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i),       1e-2f) << "out_concat second half mismatch at index " << i;
 
     // out_conv: identity conv(shared_r) = 0..N-1 — confirms that conv reads from the
     // padded predecessor buffer by logical tensor coordinates, not raw byte offsets.

From 8dbc003666c1b9ea59dc924d102c12885fa35413 Mon Sep 17 00:00:00 2001
From: "S, Deepak" <deepak.s@intel.com>
Date: Wed, 3 Jun 2026 12:57:04 +0800
Subject: [PATCH 9/9] [GPU] Updating the comments in the source code

Reduce comments to a minimum, keeping them only where the code is not obvious.

Signed-off-by: S, Deepak <deepak.s@intel.com>
---
 .../graph_optimizer/prepare_buffer_fusing.cpp | 10 +--
 .../passes/prepare_buffer_fusing_test.cpp     | 70 ++++---------------
 2 files changed, 14 insertions(+), 66 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 7225bce0af4b..f8b021946249 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -77,15 +77,7 @@ auto available_pred = [](const program_node& input) {
 };
 
 // Primitives that read input by explicit tensor coordinate and therefore correctly skip
-// padding on the input side. reorder and permute are excluded because some of their
-// implementations copy over raw buffer byte ranges and would include padding bytes.
-//
-// A can_be_optimzied node is excluded as it is transparent and shares buffer with
-// downstream consumers whoes types are not checked
-//
-// Eltwise is safe only when broadcast_spec is NONE/EXPLICIT, i.e, when both inputs
-// were declared equal-shape at graph construction and no dimension is expanded
-// over the padded region of the padded predecessor.
+// padding on the input side.
 auto reads_padded_input_safely = [](const program_node& user) {
     if (user.can_be_optimized())
         return false;
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 971d9a4e738b..79f87577e74e 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1650,24 +1650,15 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_static) {
         ASSERT_EQ(ref_output[x], output_ptr[x]);
     }
 }
-// Feature-axis oneDNN concat with batch>1 must now fuse (zero-copy, in-place buffer).
-// Before the fix: concat_out_l.batch() > 1 returned false for all axes in the oneDNN static path.
-// After the fix: the guard is scoped to batch-axis (axis=0) only; feature-axis (axis=1) at
-// batch>1 goes through the existing 64-byte alignment gate and may be optimized.
-//
-// Three inputs with non-uniform feature counts and non-square spatial dimensions stress the
-// alignment gate and the buffer-offset arithmetic more than symmetric shapes would.
-//
-// Run both an implicit (optimize_data=true) and an explicit (optimize_data=false) network over
-// the same random inputs and assert element-wise equality. This catches buffer aliasing bugs
-// that produce valid-looking floats but incorrect values without requiring hardcoded references.
+
+// Verifies that feature-axis concat with static batch > 1 and non-uniform feature counts
+// can be optimized in-place by oneDNN.
 TEST(prepare_buffer_fusing, in_place_onednn_concat_static_batch_gt1) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
         return;
 
     // Three inputs with batch=3 and non-uniform feature counts [16, 32, 16] at spatial 2×3.
-    // Previously all were blocked by the unconditional batch>1 guard on feature-axis concat.
     auto in_layout1 = layout{ ov::PartialShape{3, 16, 2, 3}, data_types::f32, format::bfyx };
     auto in_layout2 = layout{ ov::PartialShape{3, 32, 2, 3}, data_types::f32, format::bfyx };
     auto in_layout3 = layout{ ov::PartialShape{3, 16, 2, 3}, data_types::f32, format::bfyx };
@@ -1729,17 +1720,12 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_static_batch_gt1) {
         ASSERT_NEAR(ptr_implicit[i], ptr_explicit[i], 1e-3f) << "mismatch at index " << i;
 }
 
-// Predecessor with 3 users (concat + 2 safe-type activation nodes) must now be allowed
-// to fuse. Before the fix: get_users().size() > 2 blocked fusing unconditionally.
-// After the fix: the type-based reads_padded_input_safely check allows safe-type
-// multi-user predecessors.
+// Verifies that oneDNN in-place concat remains safe when a shared predecessor has multiple users.
 TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
         return;
 
-    // shared_in → reorder (shared_r) feeds 3 users: concat, act1, act2 — all safe types.
-    // Use f32 inputs so reorder(f32→f16) is non-trivial and not optimized out by the pass.
     auto in_layout  = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
     auto in_layout2 = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
 
@@ -1764,7 +1750,6 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     ExecutionConfig config = get_test_default_config(engine);
     config.set_property(ov::intel_gpu::optimize_data(true));
     config.set_property(ov::intel_gpu::allow_new_shape_infer(false));
-    // Force onednn on the concat predecessors so the onednn fusing path is exercised.
     config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{
         {"shared_r", ov::intel_gpu::ImplementationDesc{format::any, "", impl_types::onednn}},
         {"other_r",  ov::intel_gpu::ImplementationDesc{format::any, "", impl_types::onednn}},
@@ -1773,8 +1758,6 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
 
     auto input_memory  = engine.allocate_memory(in_layout);
     auto input_memory2 = engine.allocate_memory(in_layout2);
-    // Natural-number sequences — d1 starts at 0, d2 starts at 512 so buffers are
-    // distinguishable: any overlap/aliasing bug produces a clearly wrong value.
     const size_t N = 16 * 4 * 4;  // 256
     std::vector<float> d1(N), d2(N);
     for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(512 + i); }
@@ -1790,8 +1773,6 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
     const auto& concat_node = network.get_primitive("concat")->get_node();
     ASSERT_TRUE(concat_node.can_be_optimized());
 
-    // out_concat = [other_r(512..512+N-1) | shared_r(0..N-1)] along feature axis
-    // (other_r is the first concat input after the order swap)
     auto out_concat_mem = output.at("out_concat").get_memory();
     cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
     ASSERT_EQ(concat_ptr.size(), 2 * N);
@@ -1813,30 +1794,20 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_safe_type) {
         ASSERT_NEAR(act2_ptr[i], static_cast<float>(i), 1e-3f) << "out_act2 mismatch at index " << i;
 }
 
-// Verifies that oneDNN convolution is recognised as a safe-reader type in
-// reads_padded_input_safely, so that implicit concat fusing still applies
-// when the shared predecessor has an oneDNN conv as one of its non-concat users.
-//
-// Topology (shared_r has 3 users):
-//   shared_in(f32) → shared_r(f32→f16, b_fs_yx_fsv16, oneDNN) ─┬→ concat ─→ out_concat
-//   other_in(f32)  → other_r(f32→f16)                          ─┘
-//                                     └→ conv(b_fs_yx_fsv16, oneDNN, reads padded safely) → out_conv
-//                                     └→ act1                                              → out_act1
+// Verifies that in-place concat fuses when a oneDNN conv is among the shared predecessor's users.
 //
-// Both conv and act1 are in reads_padded_input_safely. shared_r and conv are both
-// forced to b_fs_yx_fsv16 so reorder_inputs finds no format mismatch and does not
-// insert an intermediate reorder — oneDNN conv reads directly from the padded buffer.
+//   shared_in → shared_r (b_fs_yx_fsv16, oneDNN) ─┬→ concat → out_concat
+//   other_in  → other_r  (b_fs_yx_fsv16)          ─┘
+//                                  └→ conv (oneDNN, b_fs_yx_fsv16) → out_conv
+//                                  └→ act1                         → out_act1
 TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
         return;
 
-    // f32 inputs so shared_r and other_r are genuinely non-trivial reorders
-    // (f32→f16) and are not eliminated by remove_redundant_reorders.
     auto in_layout  = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
     auto in_layout2 = layout{ ov::PartialShape{1, 16, 4, 4}, data_types::f32, format::bfyx };
 
-    // 1×1 identity conv weights (16 output channels, 16 input channels).
     auto weights_layout = layout{ ov::PartialShape{16, 16, 1, 1}, data_types::f16, format::bfyx };
     auto weights_mem = engine.allocate_memory(weights_layout);
     std::vector<ov::float16> wdata(16 * 16, ov::float16(0.f));
@@ -1849,19 +1820,13 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     topology.add(input_layout("other_in",  in_layout2));
     topology.add(data("conv_w", weights_mem));
 
-    // shared_r is the predecessor node that will have 3 users after fusing.
-    // Declared as b_fs_yx_fsv16 to match the oneDNN conv preferred input format,
-    // so reorder_inputs does not insert a format-conversion reorder in between.
+    // shared_r must match the conv preferred format so reorder_inputs does not
+    // insert an intermediate reorder that would break the fusing path.
     topology.add(reorder("shared_r", input_info("shared_in"), format::b_fs_yx_fsv16, data_types::f16));
     topology.add(reorder("other_r",  input_info("other_in"),  format::b_fs_yx_fsv16, data_types::f16));
 
-    // User 1 of shared_r: feature-axis concat (the fusing candidate) — other_r first so
-    // shared_r lands in the second slot, making the padding layout more asymmetric.
     topology.add(concatenation("concat", { input_info("other_r"), input_info("shared_r") }, 1));
-    // User 2 of shared_r: 1×1 identity oneDNN conv — validates that oneDNN conv
-    // reads from the padded buffer by logical tensor coordinates, not raw byte offsets.
     topology.add(convolution("conv", input_info("shared_r"), "conv_w", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
-    // User 3 of shared_r: activation (always safe).
     topology.add(activation("act1", input_info("shared_r"), activation_func::relu));
 
     topology.add(reorder("out_concat", input_info("concat"), format::bfyx, data_types::f32));
@@ -1871,9 +1836,6 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     ExecutionConfig config = get_test_default_config(engine);
     config.set_property(ov::intel_gpu::optimize_data(true));
     config.set_property(ov::intel_gpu::allow_new_shape_infer(false));
-    // Both shared_r and conv are forced to b_fs_yx_fsv16 + oneDNN. Since their
-    // formats already match, reorder_inputs inserts no intermediate reorder —
-    // oneDNN conv reads the padded buffer directly, exercising the safety guarantee.
     config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{
         {"shared_r", ov::intel_gpu::ImplementationDesc{format::b_fs_yx_fsv16, "", impl_types::onednn}},
         {"conv",     ov::intel_gpu::ImplementationDesc{format::b_fs_yx_fsv16, "", impl_types::onednn}},
@@ -1883,7 +1845,7 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     auto input_memory  = engine.allocate_memory(in_layout);
     auto input_memory2 = engine.allocate_memory(in_layout2);
     // Natural-number sequences — d1 starts at 0, d2 starts at 512 so the two halves of
-    // the concat output are unambiguously distinguishable; aliasing bugs produce clearly wrong values.
+    // the concat output are unambiguously distinguishable.
     const size_t N = 16 * 4 * 4;  // 256
     std::vector<float> d1(N), d2(N);
     for (size_t i = 0; i < N; i++) { d1[i] = static_cast<float>(i); d2[i] = static_cast<float>(512 + i); }
@@ -1896,13 +1858,10 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     std::map<cldnn::primitive_id, cldnn::network_output> output;
     EXPECT_NO_THROW(output = network.execute());
 
-    // Confirm concat was fused: shared_r has 3 users (concat, conv, act1), all safe types,
-    // so the multi-user guard allows implicit concat optimization.
+    // Confirm concat was fused
     const auto& concat_node = network.get_primitive("concat")->get_node();
     ASSERT_TRUE(concat_node.can_be_optimized());
 
-    // out_concat: [other_r(512..512+N-1) | shared_r(0..N-1)] along feature axis.
-    // (other_r is now the first concat input after the order swap)
     auto out_concat_mem = output.at("out_concat").get_memory();
     cldnn::mem_lock<float> concat_ptr(out_concat_mem, get_test_stream());
     ASSERT_EQ(concat_ptr.size(), 2 * N);
@@ -1911,14 +1870,11 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_multi_user_conv_as_user) {
     for (size_t i = 0; i < N; i++)
         ASSERT_NEAR(concat_ptr[N + i], static_cast<float>(i),       1e-2f) << "out_concat second half mismatch at index " << i;
 
-    // out_conv: identity conv(shared_r) = 0..N-1 — confirms that conv reads from the
-    // padded predecessor buffer by logical tensor coordinates, not raw byte offsets.
     auto out_conv_mem = output.at("out_conv").get_memory();
     cldnn::mem_lock<float> conv_ptr(out_conv_mem, get_test_stream());
     for (size_t i = 0; i < conv_ptr.size(); i++)
         ASSERT_NEAR(conv_ptr[i], static_cast<float>(i), 1e-2f) << "out_conv mismatch at index " << i;
 
-    // out_act1: relu(shared_r) = 0..N-1 (all non-negative, relu is identity).
     auto out_act1_mem = output.at("out_act1").get_memory();
     cldnn::mem_lock<float> act1_ptr(out_act1_mem, get_test_stream());
     for (size_t i = 0; i < act1_ptr.size(); i++)