openvinotoolkit · WeldonWangwang · May 26, 2026 · May 29, 2026
@@ -0,0 +1,68 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/op/op.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+/// \brief Linear (MatMul-like) operation whose weight matrix is supplied as an IQ3_XXS compressed blob.
+///
+/// Intended computation: Y = X @ W^T, where W has logical shape [N, K] and is kept in compressed form.
+/// The weights arrive as an opaque one-dimensional u8 input; K must be a multiple of block_size.
+/// Per the design of this op, a plugin is expected to dequantize on the fly while computing.
+///
+/// Inputs:
+///   0: activation [..., M, K] - float16/float32 expected; shape inference rewrites only the last dim
+///   1: compressed_weights [compressed_bytes] - u8 blob, N * (K / block_size) * bytes_per_block bytes
+///
+/// Attributes:
+///   weight_shape: logical (decompressed) shape of the weight matrix, [N, K]
+///   block_size: number of weights per super-block (defaults to 256, the IQ3_XXS value)
+///   bytes_per_block: encoded size of one super-block in bytes (defaults to 98, the IQ3_XXS value)
+///
+/// Output:
+///   0: result [..., M, N] - same element type as the activation input
+/// NOTE(review): evaluate() is declared and has_evaluate() returns true - confirm a definition is linked in.
+class OPENVINO_API IQ3XXSLinear : public ov::op::Op {
+public:
+    OPENVINO_OP("IQ3XXSLinear");
+
+    IQ3XXSLinear() = default;  // leaves weight_shape empty and keeps the in-class attribute defaults
+
+    /// \brief Builds the node and immediately runs validation and shape inference.
+    ///
+    /// \param activation Activation tensor [..., M, K]
+    /// \param compressed_weights Opaque 1D u8 blob holding the IQ3_XXS encoded weights
+    /// \param weight_shape Logical weight shape [N, K]; K must be a multiple of block_size
+    /// \param block_size Number of weights per super-block (default 256)
+    /// \param bytes_per_block Encoded bytes per super-block (default 98)
+    IQ3XXSLinear(const Output<Node>& activation,
+                 const Output<Node>& compressed_weights,
+                 const ov::Shape& weight_shape,
+                 int64_t block_size = 256,
+                 int64_t bytes_per_block = 98);
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override;  // visits the three attributes by name
+    void validate_and_infer_types() override;  // checks inputs/attributes, sets the output type and shape
+    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+    bool has_evaluate() const override { return true; }  // advertises evaluate() support unconditionally
+
+    const ov::Shape& get_weight_shape() const { return m_weight_shape; }  // logical [N, K] shape
+    int64_t get_block_size() const { return m_block_size; }  // weights per super-block
+    int64_t get_bytes_per_block() const { return m_bytes_per_block; }  // encoded bytes per super-block
+
+private:
+    ov::Shape m_weight_shape;  // logical [N, K] weight shape; empty until set by the constructor/visitor
+    int64_t m_block_size{256};  // weights per super-block
+    int64_t m_bytes_per_block{98};  // encoded bytes per super-block
+};
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
@@ -0,0 +1,95 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/op/iq3_xxs_linear.hpp"
+
+#include "openvino/core/validation_util.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+/// \brief Creates the node from its two inputs and the IQ3_XXS layout attributes.
+///
+/// The attributes are stored as given; validation and shape inference run before the constructor returns.
+///
+/// \param activation Activation input (input 0)
+/// \param compressed_weights Compressed weight blob (input 1)
+/// \param weight_shape Logical weight shape [N, K]
+/// \param block_size Number of weights per super-block
+/// \param bytes_per_block Encoded bytes per super-block
+IQ3XXSLinear::IQ3XXSLinear(const Output<Node>& activation,
+                           const Output<Node>& compressed_weights,
+                           const ov::Shape& weight_shape,
+                           int64_t block_size,
+                           int64_t bytes_per_block)
+    : Op(ov::OutputVector{activation, compressed_weights}),
+      m_weight_shape(weight_shape),
+      m_block_size(block_size),
+      m_bytes_per_block(bytes_per_block) {
+    // All attributes are already stored, so the checks see their final values.
+    constructor_validate_and_infer_types();
+}
+
+bool IQ3XXSLinear::visit_attributes(ov::AttributeVisitor& visitor) {  // hands every attribute to the visitor
+    visitor.on_attribute("weight_shape", m_weight_shape);        // logical [N, K] weight shape
+    visitor.on_attribute("block_size", m_block_size);            // weights per super-block
+    visitor.on_attribute("bytes_per_block", m_bytes_per_block);  // encoded bytes per super-block
+    return true;  // unconditional: the on_attribute calls are not checked for failure here
+}
+
+/// \brief Validates inputs/attributes and derives the output element type and shape.
+///
+/// Checks performed (each failure raises a node validation error):
+///   - input 1 has element type u8;
+///   - weight_shape is 2D [N, K];
+///   - block_size and bytes_per_block are positive;
+///   - K is a multiple of block_size;
+///   - when input 1 has a static shape, it is 1D with N * (K / block_size) * bytes_per_block elements;
+///   - when the activation rank is static, it is at least 1 and the last dimension is compatible with K.
+///
+/// The output keeps the activation element type. Its shape is the activation shape with the last
+/// dimension replaced by N, or fully dynamic when the activation rank is dynamic.
+void IQ3XXSLinear::validate_and_infer_types() {
+    // Input 0: activation [..., M, K]
+    const auto& activation_type = get_input_element_type(0);
+    const auto& activation_pshape = get_input_partial_shape(0);
+
+    // Input 1: compressed weights blob [total_bytes] - must be u8
+    const auto& weights_type = get_input_element_type(1);
+    NODE_VALIDATION_CHECK(this,
+                          weights_type == element::u8,
+                          "Compressed weights must be u8 type, got: ",
+                          weights_type);
+
+    // Validate weight_shape: [N, K]
+    NODE_VALIDATION_CHECK(this,
+                          m_weight_shape.size() == 2,
+                          "weight_shape must be 2D [N, K], got rank: ",
+                          m_weight_shape.size());
+
+    // Guard the modulo/division below: a zero block_size would divide by zero, and non-positive
+    // values have no meaning for a block layout.
+    NODE_VALIDATION_CHECK(this, m_block_size > 0, "block_size must be positive, got: ", m_block_size);
+    NODE_VALIDATION_CHECK(this,
+                          m_bytes_per_block > 0,
+                          "bytes_per_block must be positive, got: ",
+                          m_bytes_per_block);
+
+    const int64_t N = static_cast<int64_t>(m_weight_shape[0]);
+    const int64_t K = static_cast<int64_t>(m_weight_shape[1]);
+
+    // Validate K is compatible with block_size
+    NODE_VALIDATION_CHECK(this,
+                          K % m_block_size == 0,
+                          "K (", K, ") must be divisible by block_size (", m_block_size, ")");
+
+    // Validate compressed data size (only possible once the blob shape is fully known)
+    const int64_t blocks_per_row = K / m_block_size;
+    const int64_t expected_bytes = N * blocks_per_row * m_bytes_per_block;
+    const auto& weights_pshape = get_input_partial_shape(1);
+    if (weights_pshape.is_static()) {
+        const auto weights_shape = weights_pshape.to_shape();
+        NODE_VALIDATION_CHECK(this,
+                              weights_shape.size() == 1,
+                              "Compressed weights must be 1D blob");
+        NODE_VALIDATION_CHECK(this,
+                              static_cast<int64_t>(weights_shape[0]) == expected_bytes,
+                              "Compressed weights size mismatch: expected ",
+                              expected_bytes, " bytes, got ", weights_shape[0]);
+    }
+
+    // Nothing can be said about the output shape without a known activation rank.
+    if (activation_pshape.rank().is_dynamic()) {
+        set_output_type(0, activation_type, ov::PartialShape::dynamic());
+        return;
+    }
+
+    // A rank-0 activation has no last dimension to replace; reject it instead of indexing [-1].
+    const auto activation_rank = activation_pshape.rank().get_length();
+    NODE_VALIDATION_CHECK(this,
+                          activation_rank >= 1,
+                          "Activation must have at least 1 dimension, got rank: ",
+                          activation_rank);
+
+    // The contraction dimension of the activation has to agree with K of the weights
+    // (a dynamic dimension is accepted as compatible).
+    const auto& activation_k = activation_pshape[activation_rank - 1];
+    NODE_VALIDATION_CHECK(this,
+                          activation_k.compatible(ov::Dimension(K)),
+                          "Activation last dimension (", activation_k,
+                          ") must match K (", K, ") from weight_shape");
+
+    // Output shape: activation [..., M, K] -> output [..., M, N]
+    auto output_pshape = activation_pshape;
+    output_pshape[activation_rank - 1] = N;
+    set_output_type(0, activation_type, output_pshape);
+}
+
+/// \brief Produces a copy of this node wired to \p new_args, carrying over all three attributes.
+///
+/// \param new_args Replacement inputs: [0] activation, [1] compressed weights
+/// \return Newly constructed IQ3XXSLinear node
+std::shared_ptr<Node> IQ3XXSLinear::clone_with_new_inputs(const ov::OutputVector& new_args) const {
+    // Verify the input count before indexing into new_args.
+    check_new_args_count(this, new_args);
+
+    const auto& activation = new_args[0];
+    const auto& compressed_weights = new_args[1];
+    return std::make_shared<IQ3XXSLinear>(activation, compressed_weights, m_weight_shape, m_block_size, m_bytes_per_block);
+}
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov