From 98118b86e741b5dd7176a9c148e73a205eab8b5e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 27 May 2026 13:57:23 +0800 Subject: [PATCH 1/4] [GPU] Fix GDN func test to use TEMPLATE plugin as reference The GPU functional test for GatedDeltaNet was failing on A770 because it used the CPU plugin as reference, which is not available on that test platform. Changes: - Add evaluate() method to GatedDeltaNet op, enabling the TEMPLATE plugin to evaluate it directly as a reference implementation. - Update the GPU functional test to use infer_on_template() instead of compiling on CPU. CVS-187512 --- .../dev_api/openvino/op/gated_delta_net.hpp | 2 + src/core/src/op/gated_delta_net.cpp | 118 ++++++++++++++++++ .../single_layer_tests/gated_delta_net.cpp | 27 ++-- 3 files changed, 134 insertions(+), 13 deletions(-) diff --git a/src/core/dev_api/openvino/op/gated_delta_net.hpp b/src/core/dev_api/openvino/op/gated_delta_net.hpp index 4b6b464606f3..a48702e06fa7 100644 --- a/src/core/dev_api/openvino/op/gated_delta_net.hpp +++ b/src/core/dev_api/openvino/op/gated_delta_net.hpp @@ -49,6 +49,8 @@ class OPENVINO_API GatedDeltaNet : public ov::op::Op { void validate_and_infer_types() override; bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; + bool has_evaluate() const override; bool get_fuse_qk_l2norm() const { return m_fuse_qk_l2norm; } diff --git a/src/core/src/op/gated_delta_net.cpp b/src/core/src/op/gated_delta_net.cpp index 32d19b176df3..98ab0fe6c07f 100644 --- a/src/core/src/op/gated_delta_net.cpp +++ b/src/core/src/op/gated_delta_net.cpp @@ -4,6 +4,8 @@ #include "openvino/op/gated_delta_net.hpp" +#include + #include "dimension_util.hpp" #include "gated_delta_net_shape_inference.hpp" #include "itt.hpp" @@ -115,4 +117,120 @@ std::shared_ptr GatedDeltaNet::clone_with_new_inputs(const ov::OutputV return cloned; } +bool GatedDeltaNet::has_evaluate() const { + return get_input_element_type(0) == ov::element::f32; +} + +bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { + OV_OP_SCOPE(GatedDeltaNet_evaluate); + + const auto& q_tensor = inputs[0]; + const auto& k_tensor = inputs[1]; + const auto& v_tensor = inputs[2]; + const auto& state_tensor = inputs[3]; + const auto& gate_tensor = inputs[4]; + const auto& beta_tensor = inputs[5]; + + const auto& q_shape = q_tensor.get_shape(); + const auto& v_shape = v_tensor.get_shape(); + const auto& state_shape = state_tensor.get_shape(); + + const size_t B = q_shape[0]; + const size_t S = q_shape[1]; + const size_t qk_H = q_shape[2]; + const size_t D = q_shape[3]; + const size_t v_H = v_shape[2]; + const size_t Dv = v_shape[3]; + const size_t group_size = v_H / qk_H; + + outputs[0].set_shape(v_shape); + outputs[1].set_shape(state_shape); + + const float* q_data = static_cast(q_tensor.data()); + const float* k_data = static_cast(k_tensor.data()); + const float* v_data = static_cast(v_tensor.data()); + const float* gate_data = static_cast(gate_tensor.data()); + const float* beta_data = static_cast(beta_tensor.data()); + + // Copy state input to output state (will be modified in-place) + float* out_state = static_cast(outputs[1].data()); + std::memcpy(out_state, state_tensor.data(), state_tensor.get_byte_size()); + + float* out_data = static_cast(outputs[0].data()); + const float attn_scale = 1.0f / std::sqrt(static_cast(D)); + + for (size_t b = 0; b < B; b++) { + for (size_t h_v = 0; h_v < v_H; h_v++) { + const size_t h_qk = h_v / group_size; + for (size_t d_v = 0; d_v < Dv; d_v++) { + // state slice: state[b, h_v, :, d_v] — D elements + // state layout: [B, v_H, D, Dv] + float* state_ptr = out_state + b * v_H * D * Dv + h_v * D * Dv + d_v; + + for (size_t t = 0; t < S; t++) { + // q[b, t, h_qk, :] — layout [B, S, qk_H, D] + const float* q_ptr = q_data + b * S * qk_H * D + t * qk_H * D + h_qk * D; + // k[b, t, h_qk, :] — layout [B, S, qk_H, D] + const float* k_ptr = k_data + b * S * qk_H * D + t * qk_H * D + h_qk * D; + + // L2-normalize q and k + std::vector q_vec(q_ptr, q_ptr + D); + std::vector k_vec(k_ptr, k_ptr + D); + + if (m_fuse_qk_l2norm) { + auto l2norm = [](std::vector& vec, float eps) { + float sum = 0.0f; + for (auto v : vec) + sum += v * v; + sum = 1.0f / std::sqrt(sum + eps); + for (auto& v : vec) + v *= sum; + }; + l2norm(q_vec, m_q_l2_norm_eps); + l2norm(k_vec, m_k_l2_norm_eps); + } + + // Scale q + for (auto& v : q_vec) + v *= attn_scale; + + // gate[b, t, h_v] — layout [B, S, v_H] + float g = std::exp(gate_data[b * S * v_H + t * v_H + h_v]); + // beta[b, t, h_v] + float bt = beta_data[b * S * v_H + t * v_H + h_v]; + + // Decay state: state *= g + for (size_t d = 0; d < D; d++) { + state_ptr[d * Dv] *= g; + } + + // h_k = dot(state, k) + float h_k = 0.0f; + for (size_t d = 0; d < D; d++) { + h_k += state_ptr[d * Dv] * k_vec[d]; + } + + // delta: v_val = value[b, t, h_v, d_v] - h_k + // value layout: [B, S, v_H, Dv] + float v_val = v_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] - h_k; + + // Update state: state += k * (v_val * beta) + float update_scale = v_val * bt; + for (size_t d = 0; d < D; d++) { + state_ptr[d * Dv] += k_vec[d] * update_scale; + } + + // Output: out[b, t, h_v, d_v] = dot(state, q) + float out_val = 0.0f; + for (size_t d = 0; d < D; d++) { + out_val += state_ptr[d * Dv] * q_vec[d]; + } + out_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] = out_val; + } + } + } + } + return true; +} + } // namespace ov::op::internal diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp index b33c4d137ebf..d8c700a600b5 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/gated_delta_net.hpp" + +#include "common_test_utils/common_utils.hpp" #include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/ov_test_utils.hpp" #include "common_test_utils/test_common.hpp" -#include "common_test_utils/common_utils.hpp" #include "openvino/op/parameter.hpp" #include "openvino/op/result.hpp" -#include "openvino/op/gated_delta_net.hpp" #include "openvino/runtime/core.hpp" namespace { @@ -75,20 +77,20 @@ class GatedDeltaNetStaticTest : public testing::WithParamInterface model; }; -TEST_P(GatedDeltaNetStaticTest, CompareWithCPU) { +TEST_P(GatedDeltaNetStaticTest, CompareWithTemplate) { auto inputs = generate_inputs(); - ov::Core core; - - // Run on CPU (reference) - auto compiled_cpu = core.compile_model(model, "CPU"); - auto req_cpu = compiled_cpu.create_infer_request(); - for (const auto& [param, tensor] : inputs) { - req_cpu.set_tensor(param->output(0), tensor); + // Build input tensor vector for infer_on_template + ov::TensorVector input_tensors; + for (const auto& param : model->get_parameters()) { + input_tensors.push_back(inputs.at(param)); } - req_cpu.infer(); + + // Run on TEMPLATE (reference) + auto ref_outputs = ov::test::utils::infer_on_template(model, input_tensors); // Run on GPU + ov::Core core; auto compiled_gpu = core.compile_model(model, "GPU"); auto req_gpu = compiled_gpu.create_infer_request(); for (const auto& [param, tensor] : inputs) { @@ -98,9 +100,8 @@ TEST_P(GatedDeltaNetStaticTest, CompareWithCPU) { // Compare outputs for (size_t i = 0; i < model->get_output_size(); i++) { - auto out_cpu = req_cpu.get_output_tensor(i); auto out_gpu = req_gpu.get_output_tensor(i); - ov::test::utils::compare(out_cpu, out_gpu, 1e-2, 1e-2); + ov::test::utils::compare(ref_outputs[i], out_gpu, 1e-2, 1e-2); } } From aed473bdefe8b1e63f25cbedc7caa68b4078f6a9 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 29 May 2026 14:42:39 +0800 Subject: [PATCH 2/4] Address review comments --- src/core/src/op/gated_delta_net.cpp | 82 +++++++++++++------ .../single_layer_tests/gated_delta_net.cpp | 12 +-- 2 files changed, 62 insertions(+), 32 deletions(-) diff --git a/src/core/src/op/gated_delta_net.cpp b/src/core/src/op/gated_delta_net.cpp index 98ab0fe6c07f..59fd3d0d4bc3 100644 --- a/src/core/src/op/gated_delta_net.cpp +++ b/src/core/src/op/gated_delta_net.cpp @@ -5,6 +5,7 @@ #include "openvino/op/gated_delta_net.hpp" #include +#include #include "dimension_util.hpp" #include "gated_delta_net_shape_inference.hpp" @@ -118,7 +119,15 @@ std::shared_ptr GatedDeltaNet::clone_with_new_inputs(const ov::OutputV } bool GatedDeltaNet::has_evaluate() const { - return get_input_element_type(0) == ov::element::f32; + for (size_t i = 0; i < get_input_size(); i++) { + if (get_input_element_type(i) != ov::element::f32) + return false; + } + for (size_t i = 0; i < get_output_size(); i++) { + if (get_output_element_type(i) != ov::element::f32) + return false; + } + return true; } bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { @@ -141,6 +150,13 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& const size_t D = q_shape[3]; const size_t v_H = v_shape[2]; const size_t Dv = v_shape[3]; + + OPENVINO_ASSERT(qk_H > 0 && v_H >= qk_H && v_H % qk_H == 0, + "GatedDeltaNet evaluate: v_H (", + v_H, + ") must be a positive multiple of qk_H (", + qk_H, + ")"); const size_t group_size = v_H / qk_H; outputs[0].set_shape(v_shape); @@ -152,26 +168,41 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& const float* gate_data = static_cast(gate_tensor.data()); const float* beta_data = static_cast(beta_tensor.data()); - // Copy state input to output state (will be modified in-place) float* out_state = static_cast(outputs[1].data()); - std::memcpy(out_state, state_tensor.data(), state_tensor.get_byte_size()); - float* out_data = static_cast(outputs[0].data()); const float attn_scale = 1.0f / std::sqrt(static_cast(D)); + const size_t qk_stride_batch = S * qk_H * D; + const size_t v_stride_batch = S * v_H * Dv; + const size_t gate_beta_stride_batch = S * v_H; + + auto dot_product = [](const float* a, const float* b, size_t n, size_t a_stride = 1) { + float result = 0.0f; + for (size_t i = 0; i < n; i++) { + result += a[i * a_stride] * b[i]; + } + return result; + }; + for (size_t b = 0; b < B; b++) { for (size_t h_v = 0; h_v < v_H; h_v++) { const size_t h_qk = h_v / group_size; for (size_t d_v = 0; d_v < Dv; d_v++) { - // state slice: state[b, h_v, :, d_v] — D elements + // state slice: state[b, h_v, :, d_v] — D elements with stride Dv // state layout: [B, v_H, D, Dv] - float* state_ptr = out_state + b * v_H * D * Dv + h_v * D * Dv + d_v; + const size_t state_offset = b * v_H * D * Dv + h_v * D * Dv + d_v; + float* state_ptr = out_state + state_offset; + + // Load initial state from input + std::vector local_state(D); + const float* src_state = static_cast(state_tensor.data()) + state_offset; + for (size_t d = 0; d < D; d++) { + local_state[d] = src_state[d * Dv]; + } for (size_t t = 0; t < S; t++) { - // q[b, t, h_qk, :] — layout [B, S, qk_H, D] - const float* q_ptr = q_data + b * S * qk_H * D + t * qk_H * D + h_qk * D; - // k[b, t, h_qk, :] — layout [B, S, qk_H, D] - const float* k_ptr = k_data + b * S * qk_H * D + t * qk_H * D + h_qk * D; + const float* q_ptr = q_data + b * qk_stride_batch + t * qk_H * D + h_qk * D; + const float* k_ptr = k_data + b * qk_stride_batch + t * qk_H * D + h_qk * D; // L2-normalize q and k std::vector q_vec(q_ptr, q_ptr + D); @@ -180,7 +211,7 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& if (m_fuse_qk_l2norm) { auto l2norm = [](std::vector& vec, float eps) { float sum = 0.0f; - for (auto v : vec) + for (const auto v : vec) sum += v * v; sum = 1.0f / std::sqrt(sum + eps); for (auto& v : vec) @@ -195,37 +226,34 @@ bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& v *= attn_scale; // gate[b, t, h_v] — layout [B, S, v_H] - float g = std::exp(gate_data[b * S * v_H + t * v_H + h_v]); - // beta[b, t, h_v] - float bt = beta_data[b * S * v_H + t * v_H + h_v]; + float g = std::exp(gate_data[b * gate_beta_stride_batch + t * v_H + h_v]); + float bt = beta_data[b * gate_beta_stride_batch + t * v_H + h_v]; // Decay state: state *= g for (size_t d = 0; d < D; d++) { - state_ptr[d * Dv] *= g; + local_state[d] *= g; } // h_k = dot(state, k) - float h_k = 0.0f; - for (size_t d = 0; d < D; d++) { - h_k += state_ptr[d * Dv] * k_vec[d]; - } + float h_k = dot_product(local_state.data(), k_vec.data(), D); // delta: v_val = value[b, t, h_v, d_v] - h_k - // value layout: [B, S, v_H, Dv] - float v_val = v_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] - h_k; + float v_val = v_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] - h_k; // Update state: state += k * (v_val * beta) float update_scale = v_val * bt; for (size_t d = 0; d < D; d++) { - state_ptr[d * Dv] += k_vec[d] * update_scale; + local_state[d] += k_vec[d] * update_scale; } // Output: out[b, t, h_v, d_v] = dot(state, q) - float out_val = 0.0f; - for (size_t d = 0; d < D; d++) { - out_val += state_ptr[d * Dv] * q_vec[d]; - } - out_data[b * S * v_H * Dv + t * v_H * Dv + h_v * Dv + d_v] = out_val; + out_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] = + dot_product(local_state.data(), q_vec.data(), D); + } + + // Write final state back + for (size_t d = 0; d < D; d++) { + state_ptr[d * Dv] = local_state[d]; } } } diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp index d8c700a600b5..a99c3ffd05a0 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp @@ -105,16 +105,18 @@ TEST_P(GatedDeltaNetStaticTest, CompareWithTemplate) { } } -// Shapes: query[B,S,H,D], key[B,S,H,D], value[B,S,H,Dv], state[B,H,D,Dv], gate[B,S,H], beta[B,S,H] +// Shapes: query[B,S,qk_H,D], key[B,S,qk_H,D], value[B,S,v_H,Dv], state[B,v_H,D,Dv], gate[B,S,v_H], beta[B,S,v_H] const std::vector> static_shapes = { - // B=1, S=1, H=4, D=16, Dv=16 (minimal) + // B=1, S=1, qk_H=4, v_H=4, D=16, Dv=16 (minimal) {{1, 1, 4, 16}, {1, 1, 4, 16}, {1, 1, 4, 16}, {1, 4, 16, 16}, {1, 1, 4}, {1, 1, 4}}, - // B=1, S=1, H=32, D=128, Dv=128 (typical LLM decode) + // B=1, S=1, qk_H=32, v_H=32, D=128, Dv=128 (typical LLM decode) {{1, 1, 32, 128}, {1, 1, 32, 128}, {1, 1, 32, 128}, {1, 32, 128, 128}, {1, 1, 32}, {1, 1, 32}}, - // B=1, S=16, H=2, D=16, Dv=32 (seq_len > 1, different D and Dv) + // B=1, S=16, qk_H=2, v_H=2, D=16, Dv=32 (seq_len > 1, different D and Dv) {{1, 16, 2, 16}, {1, 16, 2, 16}, {1, 16, 2, 32}, {1, 2, 16, 32}, {1, 16, 2}, {1, 16, 2}}, - // B=2, S=1, H=8, D=64, Dv=64 (batch > 1) + // B=2, S=1, qk_H=8, v_H=8, D=64, Dv=64 (batch > 1) {{2, 1, 8, 64}, {2, 1, 8, 64}, {2, 1, 8, 64}, {2, 8, 64, 64}, {2, 1, 8}, {2, 1, 8}}, + // B=1, S=4, qk_H=2, v_H=8, D=16, Dv=16 (GQA: v_H is multiple of qk_H) + {{1, 4, 2, 16}, {1, 4, 2, 16}, {1, 4, 8, 16}, {1, 8, 16, 16}, {1, 4, 8}, {1, 4, 8}}, }; INSTANTIATE_TEST_SUITE_P( From 153c6f87cd6156dc54aaf7b16f902e3421b7c491 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 3 Jun 2026 13:33:51 +0800 Subject: [PATCH 3/4] Move GatedDeltaNet evaluate to Template plugin --- .../dev_api/openvino/op/gated_delta_net.hpp | 2 - src/core/src/op/gated_delta_net.cpp | 146 ----------------- .../template/backend/ops/gated_delta_net.cpp | 155 ++++++++++++++++++ .../template/backend/ops/ops_evaluates.hpp | 5 + .../template/backend/opset_int_tbl.hpp | 1 + 5 files changed, 161 insertions(+), 148 deletions(-) create mode 100644 src/plugins/template/backend/ops/gated_delta_net.cpp diff --git a/src/core/dev_api/openvino/op/gated_delta_net.hpp b/src/core/dev_api/openvino/op/gated_delta_net.hpp index a48702e06fa7..4b6b464606f3 100644 --- a/src/core/dev_api/openvino/op/gated_delta_net.hpp +++ b/src/core/dev_api/openvino/op/gated_delta_net.hpp @@ -49,8 +49,6 @@ class OPENVINO_API GatedDeltaNet : public ov::op::Op { void validate_and_infer_types() override; bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; - bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override; - bool has_evaluate() const override; bool get_fuse_qk_l2norm() const { return m_fuse_qk_l2norm; } diff --git a/src/core/src/op/gated_delta_net.cpp b/src/core/src/op/gated_delta_net.cpp index 59fd3d0d4bc3..32d19b176df3 100644 --- a/src/core/src/op/gated_delta_net.cpp +++ b/src/core/src/op/gated_delta_net.cpp @@ -4,9 +4,6 @@ #include "openvino/op/gated_delta_net.hpp" -#include -#include - #include "dimension_util.hpp" #include "gated_delta_net_shape_inference.hpp" #include "itt.hpp" @@ -118,147 +115,4 @@ std::shared_ptr GatedDeltaNet::clone_with_new_inputs(const ov::OutputV return cloned; } -bool GatedDeltaNet::has_evaluate() const { - for (size_t i = 0; i < get_input_size(); i++) { - if (get_input_element_type(i) != ov::element::f32) - return false; - } - for (size_t i = 0; i < get_output_size(); i++) { - if (get_output_element_type(i) != ov::element::f32) - return false; - } - return true; -} - -bool GatedDeltaNet::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const { - OV_OP_SCOPE(GatedDeltaNet_evaluate); - - const auto& q_tensor = inputs[0]; - const auto& k_tensor = inputs[1]; - const auto& v_tensor = inputs[2]; - const auto& state_tensor = inputs[3]; - const auto& gate_tensor = inputs[4]; - const auto& beta_tensor = inputs[5]; - - const auto& q_shape = q_tensor.get_shape(); - const auto& v_shape = v_tensor.get_shape(); - const auto& state_shape = state_tensor.get_shape(); - - const size_t B = q_shape[0]; - const size_t S = q_shape[1]; - const size_t qk_H = q_shape[2]; - const size_t D = q_shape[3]; - const size_t v_H = v_shape[2]; - const size_t Dv = v_shape[3]; - - OPENVINO_ASSERT(qk_H > 0 && v_H >= qk_H && v_H % qk_H == 0, - "GatedDeltaNet evaluate: v_H (", - v_H, - ") must be a positive multiple of qk_H (", - qk_H, - ")"); - const size_t group_size = v_H / qk_H; - - outputs[0].set_shape(v_shape); - outputs[1].set_shape(state_shape); - - const float* q_data = static_cast(q_tensor.data()); - const float* k_data = static_cast(k_tensor.data()); - const float* v_data = static_cast(v_tensor.data()); - const float* gate_data = static_cast(gate_tensor.data()); - const float* beta_data = static_cast(beta_tensor.data()); - - float* out_state = static_cast(outputs[1].data()); - float* out_data = static_cast(outputs[0].data()); - const float attn_scale = 1.0f / std::sqrt(static_cast(D)); - - const size_t qk_stride_batch = S * qk_H * D; - const size_t v_stride_batch = S * v_H * Dv; - const size_t gate_beta_stride_batch = S * v_H; - - auto dot_product = [](const float* a, const float* b, size_t n, size_t a_stride = 1) { - float result = 0.0f; - for (size_t i = 0; i < n; i++) { - result += a[i * a_stride] * b[i]; - } - return result; - }; - - for (size_t b = 0; b < B; b++) { - for (size_t h_v = 0; h_v < v_H; h_v++) { - const size_t h_qk = h_v / group_size; - for (size_t d_v = 0; d_v < Dv; d_v++) { - // state slice: state[b, h_v, :, d_v] — D elements with stride Dv - // state layout: [B, v_H, D, Dv] - const size_t state_offset = b * v_H * D * Dv + h_v * D * Dv + d_v; - float* state_ptr = out_state + state_offset; - - // Load initial state from input - std::vector local_state(D); - const float* src_state = static_cast(state_tensor.data()) + state_offset; - for (size_t d = 0; d < D; d++) { - local_state[d] = src_state[d * Dv]; - } - - for (size_t t = 0; t < S; t++) { - const float* q_ptr = q_data + b * qk_stride_batch + t * qk_H * D + h_qk * D; - const float* k_ptr = k_data + b * qk_stride_batch + t * qk_H * D + h_qk * D; - - // L2-normalize q and k - std::vector q_vec(q_ptr, q_ptr + D); - std::vector k_vec(k_ptr, k_ptr + D); - - if (m_fuse_qk_l2norm) { - auto l2norm = [](std::vector& vec, float eps) { - float sum = 0.0f; - for (const auto v : vec) - sum += v * v; - sum = 1.0f / std::sqrt(sum + eps); - for (auto& v : vec) - v *= sum; - }; - l2norm(q_vec, m_q_l2_norm_eps); - l2norm(k_vec, m_k_l2_norm_eps); - } - - // Scale q - for (auto& v : q_vec) - v *= attn_scale; - - // gate[b, t, h_v] — layout [B, S, v_H] - float g = std::exp(gate_data[b * gate_beta_stride_batch + t * v_H + h_v]); - float bt = beta_data[b * gate_beta_stride_batch + t * v_H + h_v]; - - // Decay state: state *= g - for (size_t d = 0; d < D; d++) { - local_state[d] *= g; - } - - // h_k = dot(state, k) - float h_k = dot_product(local_state.data(), k_vec.data(), D); - - // delta: v_val = value[b, t, h_v, d_v] - h_k - float v_val = v_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] - h_k; - - // Update state: state += k * (v_val * beta) - float update_scale = v_val * bt; - for (size_t d = 0; d < D; d++) { - local_state[d] += k_vec[d] * update_scale; - } - - // Output: out[b, t, h_v, d_v] = dot(state, q) - out_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] = - dot_product(local_state.data(), q_vec.data(), D); - } - - // Write final state back - for (size_t d = 0; d < D; d++) { - state_ptr[d * Dv] = local_state[d]; - } - } - } - } - return true; -} - } // namespace ov::op::internal diff --git a/src/plugins/template/backend/ops/gated_delta_net.cpp b/src/plugins/template/backend/ops/gated_delta_net.cpp new file mode 100644 index 000000000000..3dfecdde5d96 --- /dev/null +++ b/src/plugins/template/backend/ops/gated_delta_net.cpp @@ -0,0 +1,155 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "evaluate_node.hpp" +#include "openvino/core/type/element_type_traits.hpp" +#include "openvino/op/gated_delta_net.hpp" + +template +bool evaluate(const std::shared_ptr& op, + ov::TensorVector& outputs, + const ov::TensorVector& inputs) { + using T = typename ov::element_type_traits::value_type; + + const auto& q_shape = inputs[0].get_shape(); + const auto& v_shape = inputs[2].get_shape(); + const auto& state_shape = inputs[3].get_shape(); + + const size_t B = q_shape[0]; + const size_t S = q_shape[1]; + const size_t qk_H = q_shape[2]; + const size_t D = q_shape[3]; + const size_t v_H = v_shape[2]; + const size_t Dv = v_shape[3]; + + OPENVINO_ASSERT(qk_H > 0 && v_H >= qk_H && v_H % qk_H == 0, + "GatedDeltaNet evaluate: v_H (", + v_H, + ") must be a positive multiple of qk_H (", + qk_H, + ")"); + const size_t group_size = v_H / qk_H; + + outputs[0].set_shape(v_shape); + outputs[1].set_shape(state_shape); + + const T* q_data = inputs[0].data(); + const T* k_data = inputs[1].data(); + const T* v_data = inputs[2].data(); + const T* state_data = inputs[3].data(); + const T* gate_data = inputs[4].data(); + const T* beta_data = inputs[5].data(); + + T* out_state = outputs[1].data(); + T* out_data = outputs[0].data(); + const T attn_scale = static_cast(1) / std::sqrt(static_cast(D)); + + const size_t qk_stride_batch = S * qk_H * D; + const size_t v_stride_batch = S * v_H * Dv; + const size_t gate_beta_stride_batch = S * v_H; + + const bool fuse_qk_l2norm = op->get_fuse_qk_l2norm(); + const T q_l2_norm_eps = static_cast(op->get_q_l2_norm_eps()); + const T k_l2_norm_eps = static_cast(op->get_k_l2_norm_eps()); + + auto dot_product = [](const T* a, const T* b, size_t n) { + T result = static_cast(0); + for (size_t i = 0; i < n; i++) { + result += a[i] * b[i]; + } + return result; + }; + + auto l2norm = [](std::vector& vec, T eps) { + T sum = static_cast(0); + for (size_t i = 0; i < vec.size(); i++) + sum += vec[i] * vec[i]; + sum = static_cast(1) / std::sqrt(sum + eps); + for (size_t i = 0; i < vec.size(); i++) + vec[i] *= sum; + }; + + for (size_t b = 0; b < B; b++) { + for (size_t h_v = 0; h_v < v_H; h_v++) { + const size_t h_qk = h_v / group_size; + for (size_t d_v = 0; d_v < Dv; d_v++) { + // state layout: [B, v_H, D, Dv] + const size_t state_offset = b * v_H * D * Dv + h_v * D * Dv + d_v; + T* state_ptr = out_state + state_offset; + + // Load initial state from input + std::vector local_state(D); + const T* src_state = state_data + state_offset; + for (size_t d = 0; d < D; d++) { + local_state[d] = src_state[d * Dv]; + } + + for (size_t t = 0; t < S; t++) { + const T* q_ptr = q_data + b * qk_stride_batch + t * qk_H * D + h_qk * D; + const T* k_ptr = k_data + b * qk_stride_batch + t * qk_H * D + h_qk * D; + + std::vector q_vec(q_ptr, q_ptr + D); + std::vector k_vec(k_ptr, k_ptr + D); + + if (fuse_qk_l2norm) { + l2norm(q_vec, q_l2_norm_eps); + l2norm(k_vec, k_l2_norm_eps); + } + + // Scale q + for (size_t i = 0; i < D; i++) + q_vec[i] *= attn_scale; + + // gate[b, t, h_v] — layout [B, S, v_H] + T g = std::exp(gate_data[b * gate_beta_stride_batch + t * v_H + h_v]); + T bt = beta_data[b * gate_beta_stride_batch + t * v_H + h_v]; + + // Decay state: state *= g + for (size_t d = 0; d < D; d++) { + local_state[d] *= g; + } + + // h_k = dot(state, k) + T h_k = dot_product(local_state.data(), k_vec.data(), D); + + // delta: v_val = value[b, t, h_v, d_v] - h_k + T v_val = v_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] - h_k; + + // Update state: state += k * (v_val * beta) + T update_scale = v_val * bt; + for (size_t d = 0; d < D; d++) { + local_state[d] += k_vec[d] * update_scale; + } + + // Output: out[b, t, h_v, d_v] = dot(state, q) + out_data[b * v_stride_batch + t * v_H * Dv + h_v * Dv + d_v] = + dot_product(local_state.data(), q_vec.data(), D); + } + + // Write final state back + for (size_t d = 0; d < D; d++) { + state_ptr[d * Dv] = local_state[d]; + } + } + } + } + return true; +} + +template <> +bool evaluate_node(std::shared_ptr node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs) { + const auto& element_type = node->get_input_element_type(0); + + switch (element_type) { + case ov::element::f32: + return evaluate(ov::as_type_ptr(node), outputs, inputs); + default: + OPENVINO_THROW("Unhandled data type ", element_type, " in evaluate_node()"); + } +} diff --git a/src/plugins/template/backend/ops/ops_evaluates.hpp b/src/plugins/template/backend/ops/ops_evaluates.hpp index a186fc4a6d9e..d0a7de9be754 100644 --- a/src/plugins/template/backend/ops/ops_evaluates.hpp +++ b/src/plugins/template/backend/ops/ops_evaluates.hpp @@ -4,6 +4,7 @@ #pragma once #include "evaluate_node.hpp" +#include "openvino/op/gated_delta_net.hpp" #include "openvino/op/ops.hpp" #include "openvino/op/paged_attention.hpp" #include "openvino/op/rms_norm.hpp" @@ -549,6 +550,10 @@ extern template bool evaluate_node(std::shared_ ov::TensorVector& outputs, const ov::TensorVector& inputs); +extern template bool evaluate_node(std::shared_ptr node, + ov::TensorVector& outputs, + const ov::TensorVector& inputs); + extern template bool evaluate_node(std::shared_ptr node, ov::TensorVector& outputs, const ov::TensorVector& inputs); diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp index c10ac0808e67..ebc765a67bd2 100644 --- a/src/plugins/template/backend/opset_int_tbl.hpp +++ b/src/plugins/template/backend/opset_int_tbl.hpp @@ -188,6 +188,7 @@ _OPENVINO_OP_REG(OneHot, ov::op::v16) _OPENVINO_OP_REG(AUGRUCell, ov::op::internal) _OPENVINO_OP_REG(AUGRUSequence, ov::op::internal) +_OPENVINO_OP_REG(GatedDeltaNet, ov::op::internal) _OPENVINO_OP_REG(RMS, ov::op::internal) _OPENVINO_OP_REG(RMSNorm, ov::op::internal) _OPENVINO_OP_REG(PagedAttentionExtension, ov::op) From a95681cbb52ea985b72e3cedda606890c7223216 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 3 Jun 2026 16:57:36 +0800 Subject: [PATCH 4/4] Apply clang-format --- .../single_layer_tests/gated_delta_net.cpp | 32 +++++++------------ .../template/backend/ops/gated_delta_net.cpp | 3 +- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp index a99c3ffd05a0..74a87211ae16 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/gated_delta_net.cpp @@ -14,13 +14,11 @@ namespace { -using GatedDeltaNetParams = std::tuple< - std::vector, // Input shapes: query, key, value, state, gate, beta - ov::element::Type, // Input precision - bool>; // fuse_qk_l2norm +using GatedDeltaNetParams = std::tuple, // Input shapes: query, key, value, state, gate, beta + ov::element::Type, // Input precision + bool>; // fuse_qk_l2norm -class GatedDeltaNetStaticTest : public testing::WithParamInterface, - public ov::test::TestsCommon { +class GatedDeltaNetStaticTest : public testing::WithParamInterface, public ov::test::TestsCommon { public: static std::string getTestCaseName(const testing::TestParamInfo& obj) { const auto& [input_shapes, precision, fuse_qk_l2norm] = obj.param; @@ -48,16 +46,12 @@ class GatedDeltaNetStaticTest : public testing::WithParamInterface(precision, input_shapes[4]); auto beta = std::make_shared(precision, input_shapes[5]); - auto gdn = std::make_shared( - query, key, value, state, gate, beta, fuse_qk_l2norm); + auto gdn = std::make_shared(query, key, value, state, gate, beta, fuse_qk_l2norm); auto result0 = std::make_shared(gdn->output(0)); auto result1 = std::make_shared(gdn->output(1)); - model = std::make_shared( - ov::ResultVector{result0, result1}, - ov::ParameterVector{query, key, value, state, gate, beta}, - "GatedDeltaNetTest"); + model = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{query, key, value, state, gate, beta}, "GatedDeltaNetTest"); } std::map, ov::Tensor> generate_inputs() { @@ -68,8 +62,7 @@ class GatedDeltaNetStaticTest : public testing::WithParamInterfaceget_element_type(), params[i]->get_shape(), in_data); + inputs[params[i]] = ov::test::utils::create_and_fill_tensor(params[i]->get_element_type(), params[i]->get_shape(), in_data); } return inputs; } @@ -119,12 +112,9 @@ const std::vector> static_shapes = { {{1, 4, 2, 16}, {1, 4, 2, 16}, {1, 4, 8, 16}, {1, 8, 16, 16}, {1, 4, 8}, {1, 4, 8}}, }; -INSTANTIATE_TEST_SUITE_P( - smoke_GatedDeltaNetStatic, - GatedDeltaNetStaticTest, - ::testing::Combine(::testing::ValuesIn(static_shapes), - ::testing::Values(ov::element::f32), - ::testing::Values(false, true)), - GatedDeltaNetStaticTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_GatedDeltaNetStatic, + GatedDeltaNetStaticTest, + ::testing::Combine(::testing::ValuesIn(static_shapes), ::testing::Values(ov::element::f32), ::testing::Values(false, true)), + GatedDeltaNetStaticTest::getTestCaseName); } // namespace diff --git a/src/plugins/template/backend/ops/gated_delta_net.cpp b/src/plugins/template/backend/ops/gated_delta_net.cpp index 3dfecdde5d96..6565bf4e1a3f 100644 --- a/src/plugins/template/backend/ops/gated_delta_net.cpp +++ b/src/plugins/template/backend/ops/gated_delta_net.cpp @@ -2,12 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/gated_delta_net.hpp" + #include #include #include "evaluate_node.hpp" #include "openvino/core/type/element_type_traits.hpp" -#include "openvino/op/gated_delta_net.hpp" template bool evaluate(const std::shared_ptr& op,