Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions src/core/dev_api/openvino/op/iq3_xxs_linear.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace op {
namespace internal {

/// \brief Linear (MatMul) operation whose weight matrix is kept in IQ3_XXS compressed form.
///
/// Computes Y = X @ W^T, where W is never materialized in the graph: it is supplied as an
/// opaque u8 Constant blob holding the IQ3_XXS encoding. The plugin is responsible for
/// fused on-the-fly dequantization during compute.
///
/// Inputs:
///   0: activation          [..., M, K]          - float16/float32
///   1: compressed_weights  [compressed_bytes]   - u8 opaque blob
///
/// Attributes:
///   weight_shape:     logical (decompressed) shape of the weight matrix, [N, K]
///   block_size:       number of weights per super-block (256 for IQ3_XXS)
///   bytes_per_block:  encoded size of one super-block in bytes (98 for IQ3_XXS)
///
/// Output:
///   0: result [..., M, N] - same element type as the activation; shape inference replaces
///      the last activation dimension with N.
///
class OPENVINO_API IQ3XXSLinear : public ov::op::Op {
public:
    OPENVINO_OP("IQ3XXSLinear");

    /// \brief Creates an unconnected node; attributes keep their in-class defaults.
    IQ3XXSLinear() = default;

    /// \brief Creates an IQ3XXSLinear node wired to its two inputs.
    ///
    /// \param activation          Activation tensor; its last dimension is the reduction axis K
    /// \param compressed_weights  Opaque u8 blob with the IQ3_XXS encoded weights
    /// \param weight_shape        Logical weight shape [N, K]
    /// \param block_size          Weights per block (defaults to 256)
    /// \param bytes_per_block     Encoded bytes per block (defaults to 98)
    IQ3XXSLinear(const Output<Node>& activation,
                 const Output<Node>& compressed_weights,
                 const ov::Shape& weight_shape,
                 int64_t block_size = 256,
                 int64_t bytes_per_block = 98);

    // ---- ov::Node overrides ----

    /// \brief Exposes weight_shape, block_size and bytes_per_block to the attribute visitor.
    bool visit_attributes(ov::AttributeVisitor& visitor) override;

    /// \brief Checks inputs and attributes, then sets the output element type and shape.
    void validate_and_infer_types() override;

    /// \brief Creates a copy of this node attached to \p new_args with identical attributes.
    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    // NOTE(review): has_evaluate() advertises an evaluate() implementation, but no definition
    // of evaluate() appears in iq3_xxs_linear.cpp as visible here - confirm it is provided
    // elsewhere, otherwise this will fail to link.
    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
    bool has_evaluate() const override {
        return true;
    }

    // ---- attribute accessors ----

    /// \return Logical weight shape [N, K].
    const ov::Shape& get_weight_shape() const {
        return m_weight_shape;
    }

    /// \return Number of weights per block.
    int64_t get_block_size() const {
        return m_block_size;
    }

    /// \return Number of encoded bytes per block.
    int64_t get_bytes_per_block() const {
        return m_bytes_per_block;
    }

private:
    ov::Shape m_weight_shape;        // logical [N, K] shape of the decompressed weights
    int64_t m_block_size{256};       // weights per block
    int64_t m_bytes_per_block{98};   // encoded bytes per block
};

} // namespace internal
} // namespace op
} // namespace ov
95 changes: 95 additions & 0 deletions src/core/src/op/iq3_xxs_linear.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/op/iq3_xxs_linear.hpp"

#include "openvino/core/validation_util.hpp"

namespace ov {
namespace op {
namespace internal {

IQ3XXSLinear::IQ3XXSLinear(const Output<Node>& activation,
const Output<Node>& compressed_weights,
const ov::Shape& weight_shape,
int64_t block_size,
int64_t bytes_per_block)
: Op({activation, compressed_weights}),
m_weight_shape(weight_shape),
m_block_size(block_size),
m_bytes_per_block(bytes_per_block) {
constructor_validate_and_infer_types();
}

/// Hands every attribute of the op to the visitor under its attribute name.
/// Always reports success.
bool IQ3XXSLinear::visit_attributes(ov::AttributeVisitor& visitor) {
    // Logical [N, K] shape of the weight matrix.
    visitor.on_attribute("weight_shape", m_weight_shape);

    // Block encoding parameters.
    visitor.on_attribute("block_size", m_block_size);
    visitor.on_attribute("bytes_per_block", m_bytes_per_block);

    return true;
}

/// Validates the inputs and attributes of the op and infers the output type and shape.
///
/// Checks performed:
///   - compressed weights are u8 and (when the rank is known) a 1D blob;
///   - weight_shape is 2D [N, K];
///   - block_size and bytes_per_block are positive;
///   - K is a multiple of block_size;
///   - a static weights blob holds exactly N * (K / block_size) * bytes_per_block bytes;
///   - an activation of known rank has rank >= 1 and a last dimension compatible with K.
///
/// Output 0 takes the activation element type. Its shape is the activation shape with the
/// last dimension replaced by N, or fully dynamic when the activation rank is dynamic.
void IQ3XXSLinear::validate_and_infer_types() {
    // Input 0: activation [..., M, K]
    const auto& activation_type = get_input_element_type(0);
    const auto& activation_pshape = get_input_partial_shape(0);

    // Input 1: compressed weights blob [total_bytes] - must be u8
    const auto& weights_type = get_input_element_type(1);
    const auto& weights_pshape = get_input_partial_shape(1);
    NODE_VALIDATION_CHECK(this,
                          weights_type == element::u8,
                          "Compressed weights must be u8 type, got: ",
                          weights_type);

    // Validate weight_shape: [N, K]
    NODE_VALIDATION_CHECK(this,
                          m_weight_shape.size() == 2,
                          "weight_shape must be 2D [N, K], got rank: ",
                          m_weight_shape.size());

    // The block attributes are used as divisor / multiplier below: a zero block_size would
    // be a division by zero, and non-positive values make the expected size meaningless.
    NODE_VALIDATION_CHECK(this, m_block_size > 0, "block_size must be positive, got: ", m_block_size);
    NODE_VALIDATION_CHECK(this,
                          m_bytes_per_block > 0,
                          "bytes_per_block must be positive, got: ",
                          m_bytes_per_block);

    const int64_t N = static_cast<int64_t>(m_weight_shape[0]);
    const int64_t K = static_cast<int64_t>(m_weight_shape[1]);

    // Validate K is compatible with block_size
    NODE_VALIDATION_CHECK(this,
                          K % m_block_size == 0,
                          "K (", K, ") must be divisible by block_size (", m_block_size, ")");

    // Validate compressed data layout: the rank as soon as it is known, the byte count once
    // the shape is fully static.
    const auto& weights_rank = weights_pshape.rank();
    NODE_VALIDATION_CHECK(this,
                          weights_rank.is_dynamic() || weights_rank.get_length() == 1,
                          "Compressed weights must be 1D blob");
    if (weights_pshape.is_static()) {
        const int64_t blocks_per_row = K / m_block_size;
        const int64_t expected_bytes = N * blocks_per_row * m_bytes_per_block;
        const auto weights_shape = weights_pshape.to_shape();
        NODE_VALIDATION_CHECK(this,
                              static_cast<int64_t>(weights_shape[0]) == expected_bytes,
                              "Compressed weights size mismatch: expected ",
                              expected_bytes, " bytes, got ", weights_shape[0]);
    }

    // Output shape: activation leading dims + N
    // activation: [..., M, K] -> output: [..., M, N]
    if (activation_pshape.rank().is_dynamic()) {
        set_output_type(0, activation_type, ov::PartialShape::dynamic());
        return;
    }

    // A rank-0 activation has no reduction axis; indexing "rank - 1" would be out of bounds.
    const auto activation_rank = activation_pshape.rank().get_length();
    NODE_VALIDATION_CHECK(this,
                          activation_rank >= 1,
                          "Activation must have rank >= 1, got rank: ",
                          activation_rank);

    // The reduction axis of the activation has to match K of the logical weight matrix.
    const auto k_axis = activation_rank - 1;
    NODE_VALIDATION_CHECK(this,
                          activation_pshape[k_axis].compatible(ov::Dimension(K)),
                          "Activation last dimension (", activation_pshape[k_axis],
                          ") is incompatible with K (", K, ") from weight_shape");

    // Last dim of activation (K) replaced by N (from weight_shape[0])
    auto output_pshape = activation_pshape;
    output_pshape[k_axis] = N;
    set_output_type(0, activation_type, output_pshape);
}

/// Produces a new IQ3XXSLinear node connected to `new_args`, carrying over the
/// weight_shape, block_size and bytes_per_block attributes of this node.
std::shared_ptr<Node> IQ3XXSLinear::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    check_new_args_count(this, new_args);

    // Input order matches the constructor: activation first, compressed blob second.
    const auto& activation = new_args[0];
    const auto& compressed_weights = new_args[1];

    return std::make_shared<IQ3XXSLinear>(activation,
                                          compressed_weights,
                                          m_weight_shape,
                                          m_block_size,
                                          m_bytes_per_block);
}

} // namespace internal
} // namespace op
} // namespace ov
Loading