From 98118b86e741b5dd7176a9c148e73a205eab8b5e Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Wed, 27 May 2026 13:57:23 +0800
Subject: [PATCH 1/4] [GPU] Fix GDN func test to use TEMPLATE plugin as
 reference

The GPU functional test for GatedDeltaNet was failing on A770 because
it used the CPU plugin as reference, which is not available on that
test platform.

Changes:
- Add evaluate() method to GatedDeltaNet op, enabling the TEMPLATE
  plugin to evaluate it directly as a reference implementation.
- Update the GPU functional test to use infer_on_template() instead
  of compiling on CPU.

CVS-187512
---
 .../dev_api/openvino/op/gated_delta_net.hpp   |   2 +
 src/core/src/op/gated_delta_net.cpp           | 118 ++++++++++++++++++
 .../single_layer_tests/gated_delta_net.cpp    |  27 ++--
 3 files changed, 134 insertions(+), 13 deletions(-)
diff --git a/src/core/dev_api/openvino/op/gated_delta_net.hpp b/src/core/dev_api/openvino/op/gated_delta_net.hpp
index 4b6b464606f3..a48702e06fa7 100644
--- a/src/core/dev_api/openvino/op/gated_delta_net.hpp
+++ b/src/core/dev_api/openvino/op/gated_delta_net.hpp
@@ -49,6 +49,8 @@ class OPENVINO_API GatedDeltaNet : public ov::op::Op {
     void validate_and_infer_types() override;
     bool visit_attributes(AttributeVisitor& visitor) override;
     std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+    bool has_evaluate() const override;
     bool get_fuse_qk_l2norm() const {
         return m_fuse_qk_l2norm;
     }
diff --git a/src/core/src/op/gated_delta_net.cpp b/src/core/src/op/gated_delta_net.cpp
index 32d19b176df3..98ab0fe6c07f 100644
--- a/src/core/src/op/gated_delta_net.cpp
+++ b/src/core/src/op/gated_delta_net.cpp
@@ -4,6 +4,8 @@
 
 #include "openvino/op/gated_delta_net.hpp"
 
+#include <cmath>
+
 #include "dimension_util.hpp"
 #include "gated_delta_net_shape_inference.hpp"
 #include "itt.hpp"
@@ -115,4 +117,120 @@ std::shared_ptr<ov::Node> GatedDeltaNet::clone_with_new_inputs(const ov::OutputV
     return cloned;
 }
 
+bool GatedDeltaNet::has_evaluate() const {
+    return get_input_element_type(0) == ov::element::f32;
+}
+
+bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    OV_OP_SCOPE(GatedDeltaNet_evaluate);
+
+    const auto& q_tensor = inputs[0];
+    const auto& k_tensor = inputs[1];
+    const auto& v_tensor = inputs[2];
+    const auto& state_tensor = inputs[3];
+    const auto& gate_tensor = inputs[4];
+    const auto& beta_tensor = inputs[5];
+
+    const auto& q_shape = q_tensor.get_shape();
+    const auto& v_shape = v_tensor.get_shape();
+    const auto& state_shape = state_tensor.get_shape();
+
+    const size_t B = q_shape[0];
+    const size_t S = q_shape[1];
+    const size_t qk_H = q_shape[2];
+    const size_t D = q_shape[3];
+    const size_t v_H = v_shape[2];
+    const size_t Dv = v_shape[3];
+    const size_t group_size = v_H / qk_H;
+
+    outputs[0].set_shape(v_shape);
+    outputs[1].set_shape(state_shape);
+
+    const float* q_data = static_cast<const float*>(q_tensor.data());
+    const float* k_data = static_cast<const float*>(k_tensor.data());
+    const float* v_data = static_cast<const float*>(v_tensor.data());
+    const float* gate_data = static_cast<const float*>(gate_tensor.data());
+    const float* beta_data = static_cast<const float*>(beta_tensor.data());
+
+    // Copy state input to output state (will be modified in-place)
+    float* out_state = static_cast<float*>(outputs[1].data());
+    std::memcpy(out_state, state_tensor.data(), state_tensor.get_byte_size());
+
+    float* out_data = static_cast<float*>(outputs[0].data());
+    const float attn_scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+    for (size_t b = 0; b < B; b++) {
+        for (size_t h_v = 0; h_v < v_H; h_v++) {
+            const size_t h_qk = h_v / group_size;
+            for (size_t d_v = 0; d_v < Dv; d_v++) {
+                // state slice: state[b, h_v, :, d_v] — D elements
+                // state layout: [B, v_H, D, Dv]
+                float* state_ptr = out_state + b * v_H * D * Dv + h_v * D * Dv + d_v;
+
+                for (size_t t = 0; t < S; t++) {
+                    // q[b, t, h_qk, :] — layout [B, S, qk_H, D]
+                    const float* q_ptr = q_data + b * S * qk_H * D + t * qk_H * D + h_qk * D;
+                    // k[b, t, h_qk, :] — layout [B, S, qk_H, D]
+                    const float* k_ptr = k_data + b * S * qk_H * D + t * qk_H * D + h_qk * D;
+
+                    // L2-normalize q and k
+                    std::vector<float> q_vec(q_ptr, q_ptr + D);
+                    std::vector<float> k_vec(k_ptr, k_ptr + D);
+
+                    if (m_fuse_qk_l2norm) {
+                        auto l2norm = [](std::vector<float>& vec, float eps) {
+                            float sum = 0.0f;
+                            for (auto v : vec)
+                                sum += v * v;
+                            sum = 1.0f / std::sqrt(sum + eps);
+                            for (auto& v : vec)
+                                v *= sum;
+                        };
+                        l2norm(q_vec, m_q_l2_norm_eps);
+                        l2norm(k_vec, m_k_l2_norm_eps);
+                    }
+
+                    // Scale q
+                    for (auto& v : q_vec)
+                        v *= attn_scale;
+
+                    // gate[b, t, h_v] — layout [B, S, v_H]
+                    float g = std::exp(gate_data[b * S * v_H + t * v_H + h_v]);
+                    // beta[b, t, h_v]
+                    float bt = beta_data[b * S * v_H + t * v_H + h_v];
+
+                    // Decay state: state *= g
+                    for (size_t d = 0; d < D; d++) {
+                        state_ptr[d * Dv] *= g;
+                    }
+
+                    // h_k = dot(state, k)
+                    float h_k = 0.0f;
+                    for (size_t d = 0; d < D; d++) {
+                        h_k += state_ptr[d * Dv] * k_vec[d];
+                    }
+
+                    // delta: v_val = value[b, t, h_v, d_v] - h_k
+                    // value layout: [B, S, v_H, Dv]
+                    float v_val = v_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] - h_k;
+
+                    // Update state: state += k * (v_val * beta)
+                    float update_scale = v_val * bt;
+                    for (size_t d = 0; d < D; d++) {
+                        state_ptr[d * Dv] += k_vec[d] * update_scale;
+                    }
+
+                    // Output: out[b, t, h_v, d_v] = dot(state, q)
+                    float out_val = 0.0f;
+                    for (size_t d = 0; d < D; d++) {
+                        out_val += state_ptr[d * Dv] * q_vec[d];
+                    }
+                    out_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] = out_val;
+                }
+            }
+        }
+    }
+    return true;
+}
+
 }  // namespace ov::op::internal
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
index b33c4d137ebf..d8c700a600b5 100644
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
@@ -2,12 +2,14 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "openvino/op/gated_delta_net.hpp"
+
+#include "common_test_utils/common_utils.hpp"
 #include "common_test_utils/ov_tensor_utils.hpp"
+#include "common_test_utils/ov_test_utils.hpp"
 #include "common_test_utils/test_common.hpp"
-#include "common_test_utils/common_utils.hpp"
 #include "openvino/op/parameter.hpp"
 #include "openvino/op/result.hpp"
-#include "openvino/op/gated_delta_net.hpp"
 #include "openvino/runtime/core.hpp"
 
 namespace {
@@ -75,20 +77,20 @@ class GatedDeltaNetStaticTest : public testing::WithParamInterface<GatedDeltaNet
     std::shared_ptr<ov::Model> model;
 };
 
-TEST_P(GatedDeltaNetStaticTest, CompareWithCPU) {
+TEST_P(GatedDeltaNetStaticTest, CompareWithTemplate) {
     auto inputs = generate_inputs();
 
-    ov::Core core;
-
-    // Run on CPU (reference)
-    auto compiled_cpu = core.compile_model(model, "CPU");
-    auto req_cpu = compiled_cpu.create_infer_request();
-    for (const auto& [param, tensor] : inputs) {
-        req_cpu.set_tensor(param->output(0), tensor);
+    // Build input tensor vector for infer_on_template
+    ov::TensorVector input_tensors;
+    for (const auto& param : model->get_parameters()) {
+        input_tensors.push_back(inputs.at(param));
     }
-    req_cpu.infer();
+
+    // Run on TEMPLATE (reference)
+    auto ref_outputs = ov::test::utils::infer_on_template(model, input_tensors);
 
     // Run on GPU
+    ov::Core core;
     auto compiled_gpu = core.compile_model(model, "GPU");
     auto req_gpu = compiled_gpu.create_infer_request();
     for (const auto& [param, tensor] : inputs) {
@@ -98,9 +100,8 @@ TEST_P(GatedDeltaNetStaticTest, CompareWithCPU) {
 
     // Compare outputs
     for (size_t i = 0; i < model->get_output_size(); i++) {
-        auto out_cpu = req_cpu.get_output_tensor(i);
         auto out_gpu = req_gpu.get_output_tensor(i);
-        ov::test::utils::compare(out_cpu, out_gpu, 1e-2, 1e-2);
+        ov::test::utils::compare(ref_outputs[i], out_gpu, 1e-2, 1e-2);
     }
 }
 

From aed473bdefe8b1e63f25cbedc7caa68b4078f6a9 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Fri, 29 May 2026 14:42:39 +0800
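Subject: [PATCH 2/4] Address review comments

- has_evaluate() now requires every input and output to be f32 instead
  of checking only input 0.
- evaluate() asserts that v_H is a positive multiple of qk_H before
  computing group_size.
- evaluate() loads each state column from the input state tensor into a
  local buffer and writes it back to the output state after the
  sequence loop, replacing the up-front memcpy and in-place update.
- Hoist the per-batch strides into named constants and factor the dot
  product into a lambda.
- GPU test: spell out qk_H/v_H in the shape comments and add a GQA case
  (qk_H=2, v_H=8).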

---
 src/core/src/op/gated_delta_net.cpp           | 82 +++++++++++++------
 .../single_layer_tests/gated_delta_net.cpp    | 12 +--
 2 files changed, 62 insertions(+), 32 deletions(-)

diff --git a/src/core/src/op/gated_delta_net.cpp b/src/core/src/op/gated_delta_net.cpp
index 98ab0fe6c07f..59fd3d0d4bc3 100644
--- a/src/core/src/op/gated_delta_net.cpp
+++ b/src/core/src/op/gated_delta_net.cpp
@@ -5,6 +5,7 @@
 #include "openvino/op/gated_delta_net.hpp"
 
 #include <cmath>
+#include <cstddef>
 
 #include "dimension_util.hpp"
 #include "gated_delta_net_shape_inference.hpp"
@@ -118,7 +119,15 @@ std::shared_ptr<ov::Node> GatedDeltaNet::clone_with_new_inputs(const ov::OutputV
 }
 
 bool GatedDeltaNet::has_evaluate() const {
-    return get_input_element_type(0) == ov::element::f32;
+    for (size_t i = 0; i < get_input_size(); i++) {
+        if (get_input_element_type(i) != ov::element::f32)
+            return false;
+    }
+    for (size_t i = 0; i < get_output_size(); i++) {
+        if (get_output_element_type(i) != ov::element::f32)
+            return false;
+    }
+    return true;
 }
 
 bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
@@ -141,6 +150,13 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector&
     const size_t D = q_shape[3];
     const size_t v_H = v_shape[2];
     const size_t Dv = v_shape[3];
+
+    OPENVINO_ASSERT(qk_H > 0 && v_H >= qk_H && v_H % qk_H == 0,
+                    "GatedDeltaNet evaluate: v_H (",
+                    v_H,
+                    ") must be a positive multiple of qk_H (",
+                    qk_H,
+                    ")");
     const size_t group_size = v_H / qk_H;
 
     outputs[0].set_shape(v_shape);
@@ -152,26 +168,41 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector&
     const float* gate_data = static_cast<const float*>(gate_tensor.data());
     const float* beta_data = static_cast<const float*>(beta_tensor.data());
 
-    // Copy state input to output state (will be modified in-place)
     float* out_state = static_cast<float*>(outputs[1].data());
-    std::memcpy(out_state, state_tensor.data(), state_tensor.get_byte_size());
-
     float* out_data = static_cast<float*>(outputs[0].data());
     const float attn_scale = 1.0f / std::sqrt(static_cast<float>(D));
 
+    const size_t qk_stride_batch = S * qk_H * D;
+    const size_t v_stride_batch = S * v_H * Dv;
+    const size_t gate_beta_stride_batch = S * v_H;
+
+    auto dot_product = [](const float* a, const float* b, size_t n, size_t a_stride = 1) {
+        float result = 0.0f;
+        for (size_t i = 0; i < n; i++) {
+            result += a[i * a_stride] * b[i];
+        }
+        return result;
+    };
+
     for (size_t b = 0; b < B; b++) {
         for (size_t h_v = 0; h_v < v_H; h_v++) {
             const size_t h_qk = h_v / group_size;
             for (size_t d_v = 0; d_v < Dv; d_v++) {
-                // state slice: state[b, h_v, :, d_v] — D elements
+                // state slice: state[b, h_v, :, d_v] — D elements with stride Dv
                 // state layout: [B, v_H, D, Dv]
-                float* state_ptr = out_state + b * v_H * D * Dv + h_v * D * Dv + d_v;
+                const size_t state_offset = b * v_H * D * Dv + h_v * D * Dv + d_v;
+                float* state_ptr = out_state + state_offset;
+
+                // Load initial state from input
+                std::vector<float> local_state(D);
+                const float* src_state = static_cast<const float*>(state_tensor.data()) + state_offset;
+                for (size_t d = 0; d < D; d++) {
+                    local_state[d] = src_state[d * Dv];
+                }
 
                 for (size_t t = 0; t < S; t++) {
-                    // q[b, t, h_qk, :] — layout [B, S, qk_H, D]
-                    const float* q_ptr = q_data + b * S * qk_H * D + t * qk_H * D + h_qk * D;
-                    // k[b, t, h_qk, :] — layout [B, S, qk_H, D]
-                    const float* k_ptr = k_data + b * S * qk_H * D + t * qk_H * D + h_qk * D;
+                    const float* q_ptr = q_data + b * qk_stride_batch + t * qk_H * D + h_qk * D;
+                    const float* k_ptr = k_data + b * qk_stride_batch + t * qk_H * D + h_qk * D;
 
                     // L2-normalize q and k
                     std::vector<float> q_vec(q_ptr, q_ptr + D);
@@ -180,7 +211,7 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector&
                     if (m_fuse_qk_l2norm) {
                         auto l2norm = [](std::vector<float>& vec, float eps) {
                             float sum = 0.0f;
-                            for (auto v : vec)
+                            for (const auto v : vec)
                                 sum += v * v;
                             sum = 1.0f / std::sqrt(sum + eps);
                             for (auto& v : vec)
@@ -195,37 +226,34 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector&
                         v *= attn_scale;
 
                     // gate[b, t, h_v] — layout [B, S, v_H]
-                    float g = std::exp(gate_data[b * S * v_H + t * v_H + h_v]);
-                    // beta[b, t, h_v]
-                    float bt = beta_data[b * S * v_H + t * v_H + h_v];
+                    float g = std::exp(gate_data[b * gate_beta_stride_batch + t * v_H + h_v]);
+                    float bt = beta_data[b * gate_beta_stride_batch + t * v_H + h_v];
 
                     // Decay state: state *= g
                     for (size_t d = 0; d < D; d++) {
-                        state_ptr[d * Dv] *= g;
+                        local_state[d] *= g;
                     }
 
                     // h_k = dot(state, k)
-                    float h_k = 0.0f;
-                    for (size_t d = 0; d < D; d++) {
-                        h_k += state_ptr[d * Dv] * k_vec[d];
-                    }
+                    float h_k = dot_product(local_state.data(), k_vec.data(), D);
 
                     // delta: v_val = value[b, t, h_v, d_v] - h_k
-                    // value layout: [B, S, v_H, Dv]
-                    float v_val = v_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] - h_k;
+                    float v_val = v_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] - h_k;
 
                     // Update state: state += k * (v_val * beta)
                     float update_scale = v_val * bt;
                     for (size_t d = 0; d < D; d++) {
-                        state_ptr[d * Dv] += k_vec[d] * update_scale;
+                        local_state[d] += k_vec[d] * update_scale;
                     }
 
                     // Output: out[b, t, h_v, d_v] = dot(state, q)
-                    float out_val = 0.0f;
-                    for (size_t d = 0; d < D; d++) {
-                        out_val += state_ptr[d * Dv] * q_vec[d];
-                    }
-                    out_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] = out_val;
+                    out_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] =
+                        dot_product(local_state.data(), q_vec.data(), D);
+                }
+
+                // Write final state back
+                for (size_t d = 0; d < D; d++) {
+                    state_ptr[d * Dv] = local_state[d];
                 }
             }
         }
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
index d8c700a600b5..a99c3ffd05a0 100644
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
@@ -105,16 +105,18 @@ TEST_P(GatedDeltaNetStaticTest, CompareWithTemplate) {
     }
 }
 
-// Shapes: query[B,S,H,D], key[B,S,H,D], value[B,S,H,Dv], state[B,H,D,Dv], gate[B,S,H], beta[B,S,H]
+// Shapes: query[B,S,qk_H,D], key[B,S,qk_H,D], value[B,S,v_H,Dv], state[B,v_H,D,Dv], gate[B,S,v_H], beta[B,S,v_H]
 const std::vector<std::vector<ov::Shape>> static_shapes = {
-    // B=1, S=1, H=4, D=16, Dv=16 (minimal)
+    // B=1, S=1, qk_H=4, v_H=4, D=16, Dv=16 (minimal)
     {{1, 1, 4, 16}, {1, 1, 4, 16}, {1, 1, 4, 16}, {1, 4, 16, 16}, {1, 1, 4}, {1, 1, 4}},
-    // B=1, S=1, H=32, D=128, Dv=128 (typical LLM decode)
+    // B=1, S=1, qk_H=32, v_H=32, D=128, Dv=128 (typical LLM decode)
     {{1, 1, 32, 128}, {1, 1, 32, 128}, {1, 1, 32, 128}, {1, 32, 128, 128}, {1, 1, 32}, {1, 1, 32}},
-    // B=1, S=16, H=2, D=16, Dv=32 (seq_len > 1, different D and Dv)
+    // B=1, S=16, qk_H=2, v_H=2, D=16, Dv=32 (seq_len > 1, different D and Dv)
     {{1, 16, 2, 16}, {1, 16, 2, 16}, {1, 16, 2, 32}, {1, 2, 16, 32}, {1, 16, 2}, {1, 16, 2}},
-    // B=2, S=1, H=8, D=64, Dv=64 (batch > 1)
+    // B=2, S=1, qk_H=8, v_H=8, D=64, Dv=64 (batch > 1)
     {{2, 1, 8, 64}, {2, 1, 8, 64}, {2, 1, 8, 64}, {2, 8, 64, 64}, {2, 1, 8}, {2, 1, 8}},
+    // B=1, S=4, qk_H=2, v_H=8, D=16, Dv=16 (GQA: v_H is multiple of qk_H)
+    {{1, 4, 2, 16}, {1, 4, 2, 16}, {1, 4, 8, 16}, {1, 8, 16, 16}, {1, 4, 8}, {1, 4, 8}},
 };
 
 INSTANTIATE_TEST_SUITE_P(

From 153c6f87cd6156dc54aaf7b16f902e3421b7c491 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Wed, 3 Jun 2026 13:33:51 +0800
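Subject: [PATCH 3/4] Move GatedDeltaNet evaluate to Template plugin

- Remove evaluate() and has_evaluate() from the core GatedDeltaNet op.
- Add an evaluate_node<GatedDeltaNet> specialization to the TEMPLATE
  backend; it dispatches on the element type of input 0, handles f32,
  and throws for any other type.
- Declare the specialization in ops_evaluates.hpp and register
  GatedDeltaNet in the TEMPLATE backend's opset_int_tbl.hpp.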

---
 .../dev_api/openvino/op/gated_delta_net.hpp   |   2 -
 src/core/src/op/gated_delta_net.cpp           | 146 -----------------
 .../template/backend/ops/gated_delta_net.cpp  | 155 ++++++++++++++++++
 .../template/backend/ops/ops_evaluates.hpp    |   5 +
 .../template/backend/opset_int_tbl.hpp        |   1 +
 5 files changed, 161 insertions(+), 148 deletions(-)
 create mode 100644 src/plugins/template/backend/ops/gated_delta_net.cpp

diff --git a/src/core/dev_api/openvino/op/gated_delta_net.hpp b/src/core/dev_api/openvino/op/gated_delta_net.hpp
index a48702e06fa7..4b6b464606f3 100644
--- a/src/core/dev_api/openvino/op/gated_delta_net.hpp
+++ b/src/core/dev_api/openvino/op/gated_delta_net.hpp
@@ -49,8 +49,6 @@ class OPENVINO_API GatedDeltaNet : public ov::op::Op {
     void validate_and_infer_types() override;
     bool visit_attributes(AttributeVisitor& visitor) override;
     std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
-    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
-    bool has_evaluate() const override;
     bool get_fuse_qk_l2norm() const {
         return m_fuse_qk_l2norm;
     }
diff --git a/src/core/src/op/gated_delta_net.cpp b/src/core/src/op/gated_delta_net.cpp
index 59fd3d0d4bc3..32d19b176df3 100644
--- a/src/core/src/op/gated_delta_net.cpp
+++ b/src/core/src/op/gated_delta_net.cpp
@@ -4,9 +4,6 @@
 
 #include "openvino/op/gated_delta_net.hpp"
 
-#include <cmath>
-#include <cstddef>
-
 #include "dimension_util.hpp"
 #include "gated_delta_net_shape_inference.hpp"
 #include "itt.hpp"
@@ -118,147 +115,4 @@ std::shared_ptr<ov::Node> GatedDeltaNet::clone_with_new_inputs(const ov::OutputV
     return cloned;
 }
 
-bool GatedDeltaNet::has_evaluate() const {
-    for (size_t i = 0; i < get_input_size(); i++) {
-        if (get_input_element_type(i) != ov::element::f32)
-            return false;
-    }
-    for (size_t i = 0; i < get_output_size(); i++) {
-        if (get_output_element_type(i) != ov::element::f32)
-            return false;
-    }
-    return true;
-}
-
-bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
-    OV_OP_SCOPE(GatedDeltaNet_evaluate);
-
-    const auto& q_tensor = inputs[0];
-    const auto& k_tensor = inputs[1];
-    const auto& v_tensor = inputs[2];
-    const auto& state_tensor = inputs[3];
-    const auto& gate_tensor = inputs[4];
-    const auto& beta_tensor = inputs[5];
-
-    const auto& q_shape = q_tensor.get_shape();
-    const auto& v_shape = v_tensor.get_shape();
-    const auto& state_shape = state_tensor.get_shape();
-
-    const size_t B = q_shape[0];
-    const size_t S = q_shape[1];
-    const size_t qk_H = q_shape[2];
-    const size_t D = q_shape[3];
-    const size_t v_H = v_shape[2];
-    const size_t Dv = v_shape[3];
-
-    OPENVINO_ASSERT(qk_H > 0 && v_H >= qk_H && v_H % qk_H == 0,
-                    "GatedDeltaNet evaluate: v_H (",
-                    v_H,
-                    ") must be a positive multiple of qk_H (",
-                    qk_H,
-                    ")");
-    const size_t group_size = v_H / qk_H;
-
-    outputs[0].set_shape(v_shape);
-    outputs[1].set_shape(state_shape);
-
-    const float* q_data = static_cast<const float*>(q_tensor.data());
-    const float* k_data = static_cast<const float*>(k_tensor.data());
-    const float* v_data = static_cast<const float*>(v_tensor.data());
-    const float* gate_data = static_cast<const float*>(gate_tensor.data());
-    const float* beta_data = static_cast<const float*>(beta_tensor.data());
-
-    float* out_state = static_cast<float*>(outputs[1].data());
-    float* out_data = static_cast<float*>(outputs[0].data());
-    const float attn_scale = 1.0f / std::sqrt(static_cast<float>(D));
-
-    const size_t qk_stride_batch = S * qk_H * D;
-    const size_t v_stride_batch = S * v_H * Dv;
-    const size_t gate_beta_stride_batch = S * v_H;
-
-    auto dot_product = [](const float* a, const float* b, size_t n, size_t a_stride = 1) {
-        float result = 0.0f;
-        for (size_t i = 0; i < n; i++) {
-            result += a[i * a_stride] * b[i];
-        }
-        return result;
-    };
-
-    for (size_t b = 0; b < B; b++) {
-        for (size_t h_v = 0; h_v < v_H; h_v++) {
-            const size_t h_qk = h_v / group_size;
-            for (size_t d_v = 0; d_v < Dv; d_v++) {
-                // state slice: state[b, h_v, :, d_v] — D elements with stride Dv
-                // state layout: [B, v_H, D, Dv]
-                const size_t state_offset = b * v_H * D * Dv + h_v * D * Dv + d_v;
-                float* state_ptr = out_state + state_offset;
-
-                // Load initial state from input
-                std::vector<float> local_state(D);
-                const float* src_state = static_cast<const float*>(state_tensor.data()) + state_offset;
-                for (size_t d = 0; d < D; d++) {
-                    local_state[d] = src_state[d * Dv];
-                }
-
-                for (size_t t = 0; t < S; t++) {
-                    const float* q_ptr = q_data + b * qk_stride_batch + t * qk_H * D + h_qk * D;
-                    const float* k_ptr = k_data + b * qk_stride_batch + t * qk_H * D + h_qk * D;
-
-                    // L2-normalize q and k
-                    std::vector<float> q_vec(q_ptr, q_ptr + D);
-                    std::vector<float> k_vec(k_ptr, k_ptr + D);
-
-                    if (m_fuse_qk_l2norm) {
-                        auto l2norm = [](std::vector<float>& vec, float eps) {
-                            float sum = 0.0f;
-                            for (const auto v : vec)
-                                sum += v * v;
-                            sum = 1.0f / std::sqrt(sum + eps);
-                            for (auto& v : vec)
-                                v *= sum;
-                        };
-                        l2norm(q_vec, m_q_l2_norm_eps);
-                        l2norm(k_vec, m_k_l2_norm_eps);
-                    }
-
-                    // Scale q
-                    for (auto& v : q_vec)
-                        v *= attn_scale;
-
-                    // gate[b, t, h_v] — layout [B, S, v_H]
-                    float g = std::exp(gate_data[b * gate_beta_stride_batch + t * v_H + h_v]);
-                    float bt = beta_data[b * gate_beta_stride_batch + t * v_H + h_v];
-
-                    // Decay state: state *= g
-                    for (size_t d = 0; d < D; d++) {
-                        local_state[d] *= g;
-                    }
-
-                    // h_k = dot(state, k)
-                    float h_k = dot_product(local_state.data(), k_vec.data(), D);
-
-                    // delta: v_val = value[b, t, h_v, d_v] - h_k
-                    float v_val = v_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] - h_k;
-
-                    // Update state: state += k * (v_val * beta)
-                    float update_scale = v_val * bt;
-                    for (size_t d = 0; d < D; d++) {
-                        local_state[d] += k_vec[d] * update_scale;
-                    }
-
-                    // Output: out[b, t, h_v, d_v] = dot(state, q)
-                    out_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] =
-                        dot_product(local_state.data(), q_vec.data(), D);
-                }
-
-                // Write final state back
-                for (size_t d = 0; d < D; d++) {
-                    state_ptr[d * Dv] = local_state[d];
-                }
-            }
-        }
-    }
-    return true;
-}
-
 }  // namespace ov::op::internal
diff --git a/src/plugins/template/backend/ops/gated_delta_net.cpp b/src/plugins/template/backend/ops/gated_delta_net.cpp
new file mode 100644
index 000000000000..3dfecdde5d96
--- /dev/null
+++ b/src/plugins/template/backend/ops/gated_delta_net.cpp
@@ -0,0 +1,155 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cmath>
+#include <vector>
+
+#include "evaluate_node.hpp"
+#include "openvino/core/type/element_type_traits.hpp"
+#include "openvino/op/gated_delta_net.hpp"
+
+template <ov::element::Type_t ET>
+bool evaluate(const std::shared_ptr<ov::op::internal::GatedDeltaNet>& op,
+              ov::TensorVector& outputs,
+              const ov::TensorVector& inputs) {
+    using T = typename ov::element_type_traits<ET>::value_type;
+
+    const auto& q_shape = inputs[0].get_shape();
+    const auto& v_shape = inputs[2].get_shape();
+    const auto& state_shape = inputs[3].get_shape();
+
+    const size_t B = q_shape[0];
+    const size_t S = q_shape[1];
+    const size_t qk_H = q_shape[2];
+    const size_t D = q_shape[3];
+    const size_t v_H = v_shape[2];
+    const size_t Dv = v_shape[3];
+
+    OPENVINO_ASSERT(qk_H > 0 && v_H >= qk_H && v_H % qk_H == 0,
+                    "GatedDeltaNet evaluate: v_H (",
+                    v_H,
+                    ") must be a positive multiple of qk_H (",
+                    qk_H,
+                    ")");
+    const size_t group_size = v_H / qk_H;
+
+    outputs[0].set_shape(v_shape);
+    outputs[1].set_shape(state_shape);
+
+    const T* q_data = inputs[0].data<const T>();
+    const T* k_data = inputs[1].data<const T>();
+    const T* v_data = inputs[2].data<const T>();
+    const T* state_data = inputs[3].data<const T>();
+    const T* gate_data = inputs[4].data<const T>();
+    const T* beta_data = inputs[5].data<const T>();
+
+    T* out_state = outputs[1].data<T>();
+    T* out_data = outputs[0].data<T>();
+    const T attn_scale = static_cast<T>(1) / std::sqrt(static_cast<T>(D));
+
+    const size_t qk_stride_batch = S * qk_H * D;
+    const size_t v_stride_batch = S * v_H * Dv;
+    const size_t gate_beta_stride_batch = S * v_H;
+
+    const bool fuse_qk_l2norm = op->get_fuse_qk_l2norm();
+    const T q_l2_norm_eps = static_cast<T>(op->get_q_l2_norm_eps());
+    const T k_l2_norm_eps = static_cast<T>(op->get_k_l2_norm_eps());
+
+    auto dot_product = [](const T* a, const T* b, size_t n) {
+        T result = static_cast<T>(0);
+        for (size_t i = 0; i < n; i++) {
+            result += a[i] * b[i];
+        }
+        return result;
+    };
+
+    auto l2norm = [](std::vector<T>& vec, T eps) {
+        T sum = static_cast<T>(0);
+        for (size_t i = 0; i < vec.size(); i++)
+            sum += vec[i] * vec[i];
+        sum = static_cast<T>(1) / std::sqrt(sum + eps);
+        for (size_t i = 0; i < vec.size(); i++)
+            vec[i] *= sum;
+    };
+
+    for (size_t b = 0; b < B; b++) {
+        for (size_t h_v = 0; h_v < v_H; h_v++) {
+            const size_t h_qk = h_v / group_size;
+            for (size_t d_v = 0; d_v < Dv; d_v++) {
+                // state layout: [B, v_H, D, Dv]
+                const size_t state_offset = b * v_H * D * Dv + h_v * D * Dv + d_v;
+                T* state_ptr = out_state + state_offset;
+
+                // Load initial state from input
+                std::vector<T> local_state(D);
+                const T* src_state = state_data + state_offset;
+                for (size_t d = 0; d < D; d++) {
+                    local_state[d] = src_state[d * Dv];
+                }
+
+                for (size_t t = 0; t < S; t++) {
+                    const T* q_ptr = q_data + b * qk_stride_batch + t * qk_H * D + h_qk * D;
+                    const T* k_ptr = k_data + b * qk_stride_batch + t * qk_H * D + h_qk * D;
+
+                    std::vector<T> q_vec(q_ptr, q_ptr + D);
+                    std::vector<T> k_vec(k_ptr, k_ptr + D);
+
+                    if (fuse_qk_l2norm) {
+                        l2norm(q_vec, q_l2_norm_eps);
+                        l2norm(k_vec, k_l2_norm_eps);
+                    }
+
+                    // Scale q
+                    for (size_t i = 0; i < D; i++)
+                        q_vec[i] *= attn_scale;
+
+                    // gate[b, t, h_v] — layout [B, S, v_H]
+                    T g = std::exp(gate_data[b * gate_beta_stride_batch + t * v_H + h_v]);
+                    T bt = beta_data[b * gate_beta_stride_batch + t * v_H + h_v];
+
+                    // Decay state: state *= g
+                    for (size_t d = 0; d < D; d++) {
+                        local_state[d] *= g;
+                    }
+
+                    // h_k = dot(state, k)
+                    T h_k = dot_product(local_state.data(), k_vec.data(), D);
+
+                    // delta: v_val = value[b, t, h_v, d_v] - h_k
+                    T v_val = v_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] - h_k;
+
+                    // Update state: state += k * (v_val * beta)
+                    T update_scale = v_val * bt;
+                    for (size_t d = 0; d < D; d++) {
+                        local_state[d] += k_vec[d] * update_scale;
+                    }
+
+                    // Output: out[b, t, h_v, d_v] = dot(state, q)
+                    out_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] =
+                        dot_product(local_state.data(), q_vec.data(), D);
+                }
+
+                // Write final state back
+                for (size_t d = 0; d < D; d++) {
+                    state_ptr[d * Dv] = local_state[d];
+                }
+            }
+        }
+    }
+    return true;
+}
+
+template <>
+bool evaluate_node<ov::op::internal::GatedDeltaNet>(std::shared_ptr<ov::Node> node,
+                                                    ov::TensorVector& outputs,
+                                                    const ov::TensorVector& inputs) {
+    const auto& element_type = node->get_input_element_type(0);
+
+    switch (element_type) {
+    case ov::element::f32:
+        return evaluate<ov::element::f32>(ov::as_type_ptr<ov::op::internal::GatedDeltaNet>(node), outputs, inputs);
+    default:
+        OPENVINO_THROW("Unhandled data type ", element_type, " in evaluate_node<GatedDeltaNet>()");
+    }
+}
diff --git a/src/plugins/template/backend/ops/ops_evaluates.hpp b/src/plugins/template/backend/ops/ops_evaluates.hpp
index a186fc4a6d9e..d0a7de9be754 100644
--- a/src/plugins/template/backend/ops/ops_evaluates.hpp
+++ b/src/plugins/template/backend/ops/ops_evaluates.hpp
@@ -4,6 +4,7 @@
 
 #pragma once
 #include "evaluate_node.hpp"
+#include "openvino/op/gated_delta_net.hpp"
 #include "openvino/op/ops.hpp"
 #include "openvino/op/paged_attention.hpp"
 #include "openvino/op/rms_norm.hpp"
@@ -549,6 +550,10 @@ extern template bool evaluate_node<ov::op::internal::AUGRUSequence>(std::shared_
                                                                     ov::TensorVector& outputs,
                                                                     const ov::TensorVector& inputs);
 
+extern template bool evaluate_node<ov::op::internal::GatedDeltaNet>(std::shared_ptr<ov::Node> node,
+                                                                    ov::TensorVector& outputs,
+                                                                    const ov::TensorVector& inputs);
+
 extern template bool evaluate_node<ov::op::internal::RMS>(std::shared_ptr<ov::Node> node,
                                                           ov::TensorVector& outputs,
                                                           const ov::TensorVector& inputs);
diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp
index c10ac0808e67..ebc765a67bd2 100644
--- a/src/plugins/template/backend/opset_int_tbl.hpp
+++ b/src/plugins/template/backend/opset_int_tbl.hpp
@@ -188,6 +188,7 @@ _OPENVINO_OP_REG(OneHot, ov::op::v16)
 
 _OPENVINO_OP_REG(AUGRUCell, ov::op::internal)
 _OPENVINO_OP_REG(AUGRUSequence, ov::op::internal)
+_OPENVINO_OP_REG(GatedDeltaNet, ov::op::internal)
 _OPENVINO_OP_REG(RMS, ov::op::internal)
 _OPENVINO_OP_REG(RMSNorm, ov::op::internal)
 _OPENVINO_OP_REG(PagedAttentionExtension, ov::op)

From a95681cbb52ea985b72e3cedda606890c7223216 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Wed, 3 Jun 2026 16:57:36 +0800
Subject: [PATCH 4/4] Apply clang-format

---
 .../single_layer_tests/gated_delta_net.cpp    | 32 +++++++------------
 .../template/backend/ops/gated_delta_net.cpp  |  3 +-
 2 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
index a99c3ffd05a0..74a87211ae16 100644
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp
@@ -14,13 +14,11 @@
 
 namespace {
 
-using GatedDeltaNetParams = std::tuple<
-    std::vector<ov::Shape>,  // Input shapes: query, key, value, state, gate, beta
-    ov::element::Type,       // Input precision
-    bool>;                   // fuse_qk_l2norm
+using GatedDeltaNetParams = std::tuple<std::vector<ov::Shape>,  // Input shapes: query, key, value, state, gate, beta
+                                       ov::element::Type,       // Input precision
+                                       bool>;                   // fuse_qk_l2norm
 
-class GatedDeltaNetStaticTest : public testing::WithParamInterface<GatedDeltaNetParams>,
-                                public ov::test::TestsCommon {
+class GatedDeltaNetStaticTest : public testing::WithParamInterface<GatedDeltaNetParams>, public ov::test::TestsCommon {
 public:
     static std::string getTestCaseName(const testing::TestParamInfo<GatedDeltaNetParams>& obj) {
         const auto& [input_shapes, precision, fuse_qk_l2norm] = obj.param;
@@ -48,16 +46,12 @@ class GatedDeltaNetStaticTest : public testing::WithParamInterface<GatedDeltaNet
         auto gate = std::make_shared<ov::op::v0::Parameter>(precision, input_shapes[4]);
         auto beta = std::make_shared<ov::op::v0::Parameter>(precision, input_shapes[5]);
 
-        auto gdn = std::make_shared<ov::op::internal::GatedDeltaNet>(
-            query, key, value, state, gate, beta, fuse_qk_l2norm);
+        auto gdn = std::make_shared<ov::op::internal::GatedDeltaNet>(query, key, value, state, gate, beta, fuse_qk_l2norm);
 
         auto result0 = std::make_shared<ov::op::v0::Result>(gdn->output(0));
         auto result1 = std::make_shared<ov::op::v0::Result>(gdn->output(1));
 
-        model = std::make_shared<ov::Model>(
-            ov::ResultVector{result0, result1},
-            ov::ParameterVector{query, key, value, state, gate, beta},
-            "GatedDeltaNetTest");
+        model = std::make_shared<ov::Model>(ov::ResultVector{result0, result1}, ov::ParameterVector{query, key, value, state, gate, beta}, "GatedDeltaNetTest");
     }
 
     std::map<std::shared_ptr<ov::op::v0::Parameter>, ov::Tensor> generate_inputs() {
@@ -68,8 +62,7 @@ class GatedDeltaNetStaticTest : public testing::WithParamInterface<GatedDeltaNet
             if (i == 4) {
                 in_data = ov::test::utils::InputGenerateData(-1, 1, 1000, 1);
             }
-            inputs[params[i]] = ov::test::utils::create_and_fill_tensor(
-                params[i]->get_element_type(), params[i]->get_shape(), in_data);
+            inputs[params[i]] = ov::test::utils::create_and_fill_tensor(params[i]->get_element_type(), params[i]->get_shape(), in_data);
         }
         return inputs;
     }
@@ -119,12 +112,9 @@ const std::vector<std::vector<ov::Shape>> static_shapes = {
     {{1, 4, 2, 16}, {1, 4, 2, 16}, {1, 4, 8, 16}, {1, 8, 16, 16}, {1, 4, 8}, {1, 4, 8}},
 };
 
-INSTANTIATE_TEST_SUITE_P(
-    smoke_GatedDeltaNetStatic,
-    GatedDeltaNetStaticTest,
-    ::testing::Combine(::testing::ValuesIn(static_shapes),
-                       ::testing::Values(ov::element::f32),
-                       ::testing::Values(false, true)),
-    GatedDeltaNetStaticTest::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_GatedDeltaNetStatic,
+                         GatedDeltaNetStaticTest,
+                         ::testing::Combine(::testing::ValuesIn(static_shapes), ::testing::Values(ov::element::f32), ::testing::Values(false, true)),
+                         GatedDeltaNetStaticTest::getTestCaseName);
 
 }  // namespace
diff --git a/src/plugins/template/backend/ops/gated_delta_net.cpp b/src/plugins/template/backend/ops/gated_delta_net.cpp
index 3dfecdde5d96..6565bf4e1a3f 100644
--- a/src/plugins/template/backend/ops/gated_delta_net.cpp
+++ b/src/plugins/template/backend/ops/gated_delta_net.cpp
@@ -2,12 +2,13 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "openvino/op/gated_delta_net.hpp"
+
 #include <cmath>
 #include <vector>
 
 #include "evaluate_node.hpp"
 #include "openvino/core/type/element_type_traits.hpp"
-#include "openvino/op/gated_delta_net.hpp"
 
 template <ov::element::Type_t ET>
 bool evaluate(const std::shared_ptr<ov::op::internal::GatedDeltaNet>& op,