Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 94 additions & 164 deletions onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,18 @@ Ort::Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
int64_t quant_axis = 0;
RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis));

if (is_per_axis_quant) {
const bool is_block_quant = input_1.quant_param.has_value() &&
input_1.quant_param->block_size.has_value() &&
input_1.quant_param->block_size.value() > 0;

if (is_block_quant) {
if (conv_type == OnnxConvType::kConvTranspose) {
RETURN_IF_NOT(quant_axis == 0,
"ConvTranspose's input[1] must be use axis == 0 for block quantization");
} else {
RETURN_IF_NOT(quant_axis == 1, "Conv's input[1] must be use axis == 1 for block quantization");
}
} else if (is_per_axis_quant) {
if (conv_type == OnnxConvType::kConvTranspose) {
RETURN_IF_NOT(quant_axis == 1,
"ConvTranspose's input[1] must be use axis == 1 for per-channel quantization");
Expand Down Expand Up @@ -220,8 +231,8 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
return MAKE_EP_FAIL(("QNN EP: Unexpected convolution op type: " + node_unit.OpType()).c_str());
}

// Transpose quantization parameter's axis if this is using per-channel quantization.
if (input_info.quant_param.IsPerChannel()) {
// Transpose quantization parameter's axis if this is using per-channel or LPBQ quantization.
if (input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ()) {
std::vector<size_t> perm;
if (is_3d) {
perm = conv_type == OnnxConvType::kConv ? nchw2hwcn_perm_3d : cnhw2hwcn_perm_3d;
Expand All @@ -234,7 +245,7 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
}
} else {
// Add transpose node above weight input.
RETURN_IF(input_info.quant_param.IsPerChannel(),
RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ(),
"Non-constant Conv inputs only support per-tensor quantization");
bool is_graph_input = qnn_model_wrapper.IsGraphInput(input1_name);
ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE, ("Add HWCN Transpose node after input: " + input1_name).c_str());
Expand Down Expand Up @@ -333,9 +344,12 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
TensorInfo bias_info = {};
RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(bias_input, bias_info));

// For static quantized bias, handle requantization if needed
if (bias_info.is_initializer && bias_info.quant_param.IsQuantized()) {
// Get activation and weight quantization parameters
bool bias_handled = false;

// For a static bias when activation and weight are both quantized, ensure
// bias_scale = activation_scale * weight_scale.
// This applies whether the bias is already quantized (requantize if needed) or float (quantize it).
if (bias_info.is_initializer) {
TensorInfo input0_info = {};
TensorInfo input1_info = {};
RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
Expand All @@ -351,186 +365,102 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
activation_scale = act_quant_params.bwScaleOffsetEncoding.scale;
}

// Get weight scales (per-tensor or per-channel)
std::vector<float> weights_scales;

if (input1_info.quant_param.IsPerTensor()) {
// Handle per-tensor quantization (encodings 0 and 2)
const auto& weight_quant_params = input1_info.quant_param.Get();

if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
weights_scales.push_back(weight_quant_params.scaleOffsetEncoding.scale);
} else if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET) {
weights_scales.push_back(weight_quant_params.bwScaleOffsetEncoding.scale);
}
} else {
// Handle per-channel quantization (encodings 1 and 3)
const auto& weight_quant_params = input1_info.quant_param.Get();

if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
if (weight_quant_params.axisScaleOffsetEncoding.scaleOffset != nullptr &&
weight_quant_params.axisScaleOffsetEncoding.numScaleOffsets > 0) {
for (size_t i = 0; i < weight_quant_params.axisScaleOffsetEncoding.numScaleOffsets; ++i) {
weights_scales.push_back(weight_quant_params.axisScaleOffsetEncoding.scaleOffset[i].scale);
}
}
} else if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
if (weight_quant_params.bwAxisScaleOffsetEncoding.scales != nullptr &&
weight_quant_params.bwAxisScaleOffsetEncoding.numElements > 0) {
for (size_t i = 0; i < weight_quant_params.bwAxisScaleOffsetEncoding.numElements; ++i) {
weights_scales.push_back(weight_quant_params.bwAxisScaleOffsetEncoding.scales[i]);
}
RETURN_IF_ERROR(utils::GetWeightQuantScales(input1_info.quant_param, bias_info.shape[0], weights_scales));
RETURN_IF_NOT(!weights_scales.empty(), "No weight scales found for bias quantization");

if (bias_info.quant_param.IsQuantized()) {
// Bias is already quantized: check if scales match, requantize if needed.
std::vector<float> current_scales;
std::vector<int32_t> current_offsets;
int32_t quant_axis = 0;
RETURN_IF_ERROR(utils::GetBiasQuantScalesAndOffsets(bias_info.quant_param, current_scales, current_offsets, quant_axis));

const size_t num_channels = current_scales.size();
bool needs_requantization = false;
for (size_t i = 0; i < num_channels && !needs_requantization; ++i) {
const float weight_scale = (i < weights_scales.size()) ? weights_scales[i] : weights_scales[0];
if (current_offsets[i] != 0 ||
!utils::CheckBiasScaleMatch(current_scales[i], weight_scale, activation_scale, 1e-5f)) {
needs_requantization = true;
}
}
}

// Safety check to prevent crashes
RETURN_IF_NOT(!weights_scales.empty(), "No weight scales found for quantized weights");

// Check bias quantization type
if (bias_info.quant_param.IsPerTensor()) {
float bias_scale = 0.0f;
int32_t bias_offset = 0;
const auto& bias_quant_params = bias_info.quant_param.Get();
if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
bias_scale = bias_quant_params.scaleOffsetEncoding.scale;
bias_offset = bias_quant_params.scaleOffsetEncoding.offset;
} else if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET) {
bias_scale = bias_quant_params.bwScaleOffsetEncoding.scale;
bias_offset = bias_quant_params.bwScaleOffsetEncoding.offset;
} else {
return MAKE_EP_FAIL("Unsupported bias quantization encoding for per-tensor quantization.");
}
if (needs_requantization) {
ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE, ("Requantizing bias " + bias_input.name).c_str());

// Check if bias_offset = 0 AND bias_scale = (weights_scale[0] * activation_scale)
if (bias_offset == 0 && utils::CheckBiasScaleMatch(bias_scale, weights_scales[0], activation_scale, 1e-5f)) {
// No change needed - scales match and offset is 0
} else {
ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE, ("Requantizing per-tensor bias " + bias_input.name).c_str());
// Need to requantize the bias tensor
std::vector<uint8_t> original_bias_data;
RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(bias_info.initializer_tensor, original_bias_data));

std::vector<float> current_scales = {bias_scale};
std::vector<int32_t> current_offsets = {bias_offset};
std::vector<uint8_t> requantized_bias_data;
std::vector<float> new_scales;
std::vector<int32_t> new_offsets;

const std::optional<int64_t> axis_opt = (num_channels > 1) ? std::optional<int64_t>(quant_axis) : std::nullopt;
RETURN_IF_ERROR(utils::RequantizeBiasTensor(
original_bias_data, bias_info.shape, current_scales, current_offsets,
weights_scales, activation_scale, bias_info.qnn_data_type,
requantized_bias_data, new_scales, new_offsets));
requantized_bias_data, new_scales, new_offsets, axis_opt));

QnnQuantParamsWrapper new_quant_params;
if (new_scales.size() == 1) {
new_quant_params = QnnQuantParamsWrapper(new_scales[0], new_offsets[0]);
} else {
new_quant_params = QnnQuantParamsWrapper(new_scales, new_offsets, quant_axis, false);
}

// Create new tensor wrapper with requantized data
std::string bias_name = bias_input.name;
QnnQuantParamsWrapper new_quant_params(new_scales[0], new_offsets[0]);
QnnTensorWrapper bias_tensorwrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_info.qnn_data_type,
std::move(new_quant_params), std::vector<uint32_t>(bias_info.shape),
std::move(requantized_bias_data));
RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)), "Failed to add requantized bias tensor.");
RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)),
"Failed to add requantized bias tensor.");
input_names.push_back(bias_name);
return Ort::Status(); // We've handled the bias, return early
bias_handled = true;
}
} else {
// Handle per-channel bias
const auto& bias_quant_params = bias_info.quant_param.Get();

if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET ||
bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
// Extract scales and offsets based on encoding type
std::vector<float> current_scales;
std::vector<int32_t> current_offsets;
int32_t quant_axis = 0;
size_t num_channels = 0;

if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
// Safety checks for AXIS_SCALE_OFFSET encoding
RETURN_IF_NOT(bias_quant_params.axisScaleOffsetEncoding.scaleOffset != nullptr,
"Invalid bias quantization parameters: scaleOffset is null");
RETURN_IF_NOT(bias_quant_params.axisScaleOffsetEncoding.numScaleOffsets > 0,
"Invalid bias quantization parameters: numScaleOffsets is zero");

num_channels = bias_quant_params.axisScaleOffsetEncoding.numScaleOffsets;
quant_axis = bias_quant_params.axisScaleOffsetEncoding.axis;
for (size_t i = 0; i < num_channels; ++i) {
current_scales.push_back(bias_quant_params.axisScaleOffsetEncoding.scaleOffset[i].scale);
current_offsets.push_back(bias_quant_params.axisScaleOffsetEncoding.scaleOffset[i].offset);
}
} else { // QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET
// Safety checks for BW_AXIS_SCALE_OFFSET encoding
RETURN_IF_NOT(bias_quant_params.bwAxisScaleOffsetEncoding.scales != nullptr,
"Invalid bias quantization parameters: scales is null");
RETURN_IF_NOT(bias_quant_params.bwAxisScaleOffsetEncoding.offsets != nullptr,
"Invalid bias quantization parameters: offsets is null");
RETURN_IF_NOT(bias_quant_params.bwAxisScaleOffsetEncoding.numElements > 0,
"Invalid bias quantization parameters: numElements is zero");

num_channels = bias_quant_params.bwAxisScaleOffsetEncoding.numElements;
quant_axis = bias_quant_params.bwAxisScaleOffsetEncoding.axis;
for (size_t i = 0; i < num_channels; ++i) {
current_scales.push_back(bias_quant_params.bwAxisScaleOffsetEncoding.scales[i]);
current_offsets.push_back(bias_quant_params.bwAxisScaleOffsetEncoding.offsets[i]);
}
}

// Check if all offsets are 0 and scales match expected values
bool all_offsets_zero = true;
bool all_scales_match = true;

for (size_t i = 0; i < num_channels; ++i) {
if (current_offsets[i] != 0) {
all_offsets_zero = false;
}

// Calculate expected scale for this channel
// Use the corresponding weight scale if available, otherwise use the first one
float weight_scale = (i < weights_scales.size()) ? weights_scales[i] : weights_scales[0];

if (!utils::CheckBiasScaleMatch(current_scales[i], weight_scale, activation_scale, 1e-5f)) {
all_scales_match = false;
}
}

if (all_offsets_zero && all_scales_match) {
// No change needed - scales match and offsets are 0
} else {
// Need to requantize per-channel bias
ORT_CXX_LOG(logger,
ORT_LOGGING_LEVEL_VERBOSE,
("Requantizing per-channel bias " + bias_input.name).c_str());

// Get current bias data and requantize
std::vector<uint8_t> original_bias_data;
RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(bias_info.initializer_tensor, original_bias_data));

std::vector<uint8_t> requantized_bias_data;
std::vector<float> new_scales;
std::vector<int32_t> new_offsets;

RETURN_IF_ERROR(utils::RequantizeBiasTensor(
original_bias_data, bias_info.shape, current_scales, current_offsets,
weights_scales, activation_scale, bias_info.qnn_data_type,
requantized_bias_data, new_scales, new_offsets,
quant_axis));

// Create new tensor wrapper with requantized data
std::string bias_name = bias_input.name;
QnnQuantParamsWrapper new_quant_params(new_scales, new_offsets, quant_axis, false);
QnnTensorWrapper bias_tensorwrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_info.qnn_data_type,
std::move(new_quant_params), std::vector<uint32_t>(bias_info.shape),
std::move(requantized_bias_data));
RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)), "Failed to add requantized bias tensor.");
input_names.push_back(bias_name);
return Ort::Status(); // We've handled the bias, return early
}
// Bias is float: quantize using bias_scale = activation_scale * weight_scale.
ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE,
("Quantizing float bias " + bias_input.name + " using activation_scale * weight_scale[c]").c_str());

std::vector<uint8_t> original_bias_data;
RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(bias_info.initializer_tensor, original_bias_data));

const size_t num_channels = bias_info.shape[0];
RETURN_IF_NOT(original_bias_data.size() == num_channels * sizeof(float),
"Unexpected bias data size for float bias quantization");
const float* bias_float_data = reinterpret_cast<const float*>(original_bias_data.data());

std::vector<uint8_t> quantized_bias_data;
std::vector<float> new_scales;
std::vector<int32_t> new_offsets;
RETURN_IF_ERROR(utils::QuantizeFloatBiasTensor(
gsl::span<const float>(bias_float_data, num_channels),
weights_scales, activation_scale,
quantized_bias_data, new_scales, new_offsets));

QnnQuantParamsWrapper new_quant_params;
if (weights_scales.size() == 1) {
new_quant_params = QnnQuantParamsWrapper(new_scales[0], 0);
} else {
new_quant_params = QnnQuantParamsWrapper(new_scales, new_offsets, /*axis=*/0, /*is_int4=*/false);
}

std::string bias_name = bias_input.name;
QnnTensorWrapper bias_tensorwrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(new_quant_params), std::vector<uint32_t>(bias_info.shape),
std::move(quantized_bias_data));
RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)),
"Failed to add quantized float bias tensor.");
input_names.push_back(bias_name);
bias_handled = true;
}
}
}

// Process bias normally (non-quantized or static non-quantized or scales already match)
RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, bias_input, logger, input_names));
if (!bias_handled) {
// Process bias normally: non-initializer, or activation/weight not quantized, or scales already match.
RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, bias_input, logger, input_names));
}
}

#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
Expand Down Expand Up @@ -675,7 +605,7 @@ Ort::Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrappe
});

// The reshape (unsqueeze) may require us to shift the quant parameter's axis.
if (input_info.quant_param.IsPerChannel()) {
if (input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ()) {
RETURN_IF_ERROR(input_info.quant_param.HandleUnsqueeze<uint32_t>(input_info.shape, shape_2d));
}

Expand Down Expand Up @@ -709,16 +639,16 @@ Ort::Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrappe
return MAKE_EP_FAIL(("QNN EP: Unexpected convolution op type: " + node_unit.OpType()).c_str());
}

// Transpose quantization parameter's axis if this is using per-channel quantization.
if (input_info.quant_param.IsPerChannel()) {
// Transpose quantization parameter's axis if this is using per-channel or LPBQ quantization.
if (input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ()) {
const std::vector<size_t>& perm = conv_type == OnnxConvType::kConv ? nchw2hwcn_perm : cnhw2hwcn_perm;
std::vector<size_t> perm_inv(perm.size());
RETURN_IF_ERROR(utils::InvertPerm<size_t>(perm, perm_inv));
RETURN_IF_ERROR(input_info.quant_param.HandleTranspose<size_t>(perm_inv));
}
} else {
// Dynamic weight: Add nodes to reshape to 2D, and then transpose.
RETURN_IF(input_info.quant_param.IsPerChannel(),
RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ(),
"Non-constant Conv inputs only support per-tensor quantization");

if (!qnn_model_wrapper.IsQnnTensorWrapperExist(input1_name)) {
Expand Down
Loading