From 6689161d49b331460c503ca334df8ef3a6e34a93 Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Mon, 27 Apr 2026 14:29:49 +0530 Subject: [PATCH 1/8] convert BQ to LPBQ encodings --- .../builder/opbuilder/matmul_op_builder.cc | 3 +- .../qnn/builder/qnn_quant_params_wrapper.cc | 76 +++++++++++- .../qnn/builder/qnn_quant_params_wrapper.h | 29 +++-- .../core/providers/qnn/builder/qnn_utils.cc | 109 +++++++++++++++++- .../core/providers/qnn/builder/qnn_utils.h | 28 +++++ onnxruntime/core/providers/qnn/ort_api.cc | 11 +- onnxruntime/core/providers/qnn/ort_api.h | 1 + 7 files changed, 239 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index 42b54a6e17..c0bb28759e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#include @@ -130,6 +130,7 @@ Ort::Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeU !input_info_0.is_initializer && IsQuant16bit(input_info_1.qnn_data_type) && !input_info_1.is_initializer); + use_fully_connected = use_fully_connected && !input_info_1.quant_param.IsLPBQ(); #endif return Ort::Status(); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index 0e0cc6b8b8..225b4f7a50 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -11,6 +11,7 @@ #include "QnnTypes.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/qnn_utils.h" #define ALIGN_PTR_UP(ptr, align, type) \ reinterpret_cast((reinterpret_cast(ptr) + (align) - 1) & ~((align) - 1)) @@ -366,6 +367,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons params_.encodingDefinition = params.encodingDefinition; params_.quantizationEncoding = params.quantizationEncoding; + per_channel_scales_size_ = static_cast(num_scaleoffsets); + // Deep copy the blockwiseExpansion const size_t bwe_num_bytes = sizeof(Qnn_BlockwiseExpansion_t); constexpr std::uintptr_t bwe_align = alignof(Qnn_BlockwiseExpansion_t); @@ -399,6 +402,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons params_.encodingDefinition = params.encodingDefinition; params_.quantizationEncoding = params.quantizationEncoding; + num_blocks_ = static_cast(num_scaleoffsets); block_encoding_tensor_rank_ = static_cast(tensor_rank); block_encoding_axis_data_ = std::make_unique(block_encoding_tensor_rank_); std::memcpy(block_encoding_axis_data_.get(), @@ -480,10 +484,13 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper } const bool is_per_tensor = scales.size() == 1; + const bool is_block_quant = 
ort_quant_params->block_size.has_value() && ort_quant_params->block_size.value() > 0; + const bool is_per_channel = scales.size() > 1 && !is_block_quant; // QNN uses different structs to represent quantization parameters depending on // - per-tensor vs per-channel // - int4 vs not int4 + // - block quantization (LPBQ / BLOCKWISE_EXPANSION) if (is_per_tensor && !is_int4_type) { params_.encodingDefinition = QNN_DEFINITION_DEFINED; params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; @@ -507,7 +514,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper } else { params_.bwScaleOffsetEncoding.offset = 0; } - } else if (!is_per_tensor && is_int4_type) { + } else if (is_per_channel && is_int4_type) { std::vector io_shape; RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); const int32_t io_rank = static_cast(io_shape.size()); @@ -550,7 +557,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper params_.bwAxisScaleOffsetEncoding.scales = scales_span.data(); params_.bwAxisScaleOffsetEncoding.offsets = zps_span.data(); - } else if (!is_per_tensor && !is_int4_type) { + } else if (is_per_channel && !is_int4_type) { std::vector io_shape; RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); const int32_t io_rank = static_cast(io_shape.size()); @@ -586,6 +593,71 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper params_.axisScaleOffsetEncoding.axis = static_cast(axis); params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast(num_elems); params_.axisScaleOffsetEncoding.scaleOffset = data_span.data(); + } else if (is_block_quant) { + // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion. + + // Get scale tensor shape to determine block/channel dimensions. 
+ const std::vector scale_shape = + utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); + RETURN_IF_NOT(scale_shape.size() == 2, + "Only 2D block quantization scale tensors are supported for LPBQ conversion"); + + // Determine block axis (= ONNX axis attribute, default 0). + constexpr int64_t kDefaultBlockAxis = 0; + int64_t onnx_axis = ort_quant_params->axis.value_or(kDefaultBlockAxis); + if (onnx_axis < 0) onnx_axis += static_cast(scale_shape.size()); + RETURN_IF_NOT(onnx_axis == 0 || onnx_axis == 1, + "Only axis 0 or 1 is supported for 2D block quantization LPBQ conversion"); + + // Scale shape: [num_blocks_per_channel, num_channels] when block_axis=0 + // [num_channels, num_blocks_per_channel] when block_axis=1 + const uint32_t num_blocks_per_channel = static_cast(scale_shape[onnx_axis]); + const uint32_t num_channels = static_cast(scale_shape[1 - onnx_axis]); + + // The conversion algorithm expects scales in block-major order [num_blocks, num_channels]. + // If block_axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. 
+ std::vector bq_scales_bm; + std::vector bq_offsets_bm; + if (onnx_axis == 0) { + bq_scales_bm = std::move(scales); + bq_offsets_bm = std::move(zero_points); + } else { + // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels] + bq_scales_bm.resize(scales.size()); + for (uint32_t c = 0; c < num_channels; ++c) { + for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { + bq_scales_bm[static_cast(b) * num_channels + c] = + scales[static_cast(c) * num_blocks_per_channel + b]; + } + } + if (!zero_points.empty()) { + bq_offsets_bm.resize(zero_points.size()); + for (uint32_t c = 0; c < num_channels; ++c) { + for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { + bq_offsets_bm[static_cast(b) * num_channels + c] = + zero_points[static_cast(c) * num_blocks_per_channel + b]; + } + } + } + } + + // Apply BQ -> LPBQ algorithm + // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights. + std::vector per_channel_scales; + std::vector per_block_int_scales; + std::vector lpbq_offsets; + const uint32_t kBitwidth = is_int4_type ? 4u : 8u; + RETURN_IF_ERROR(utils::TryConvertBlockQuantScalesToLpbq( + bq_scales_bm, bq_offsets_bm, + num_blocks_per_channel, num_channels, kBitwidth, + per_channel_scales, per_block_int_scales, lpbq_offsets)); + + // QNN LPBQ axis = the non-block axis in the weight tensor. + // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0. 
+ const int64_t qnn_axis = 1 - onnx_axis; + + *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets, + qnn_axis, ort_quant_params->block_size.value(), is_int4_type); } else { return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()"); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index ad4312257e..726e276b1c 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -106,35 +106,38 @@ class QnnQuantParamsWrapper { // Get a copy of scales. Works for both per-tensor and per-channel. Ort::Status GetScales(/*out*/ std::vector& scales) const; - // Handle transposing of a per-channel quantized tensor. The quantization parameter's axis - // must be transposed using the inverse permutation of the Transpose. + // Handle transposing of a per-channel or LPBQ quantized tensor. The quantization parameter's + // axis must be updated using the permutation of the Transpose. 
template Ort::Status HandleTranspose(gsl::span perm) { - if (!IsPerChannel()) { + if (!IsPerChannel() && !IsLPBQ()) { return Ort::Status(); } if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { RETURN_IF_NOT(static_cast(params_.axisScaleOffsetEncoding.axis) < perm.size(), "Axis value is out of range of the provided permutation"); - const int32_t new_axis = static_cast(perm[params_.axisScaleOffsetEncoding.axis]); - params_.axisScaleOffsetEncoding.axis = new_axis; + params_.axisScaleOffsetEncoding.axis = static_cast(perm[params_.axisScaleOffsetEncoding.axis]); } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { RETURN_IF_NOT(static_cast(params_.bwAxisScaleOffsetEncoding.axis) < perm.size(), "Axis value is out of range of the provided permutation"); - const int32_t new_axis = static_cast(perm[params_.bwAxisScaleOffsetEncoding.axis]); - params_.bwAxisScaleOffsetEncoding.axis = new_axis; + params_.bwAxisScaleOffsetEncoding.axis = static_cast(perm[params_.bwAxisScaleOffsetEncoding.axis]); + } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION && + params_.blockwiseExpansion != nullptr) { + RETURN_IF_NOT(static_cast(params_.blockwiseExpansion->axis) < perm.size(), + "LPBQ axis value is out of range of the provided permutation"); + params_.blockwiseExpansion->axis = static_cast(perm[params_.blockwiseExpansion->axis]); } return Ort::Status(); } - // Handle "unsqueeze" of a per-channel quantized tensor. The quantization parameter's axis - // may need to be shifted if the unsqueeze inserted 1s before the quantization axis. + // Handle "unsqueeze" of a per-channel or LPBQ quantized tensor. The quantization parameter's + // axis may need to be shifted if the unsqueeze inserted 1s before the quantization axis. 
template Ort::Status HandleUnsqueeze(gsl::span orig_shape, gsl::span new_shape) { - if (!IsPerChannel()) { + if (!IsPerChannel() && !IsLPBQ()) { return Ort::Status(); } @@ -146,6 +149,9 @@ class QnnQuantParamsWrapper { axis = params_.axisScaleOffsetEncoding.axis; } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { axis = params_.bwAxisScaleOffsetEncoding.axis; + } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION && + params_.blockwiseExpansion != nullptr) { + axis = params_.blockwiseExpansion->axis; } else { return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str()); } @@ -175,6 +181,9 @@ class QnnQuantParamsWrapper { params_.axisScaleOffsetEncoding.axis = static_cast(j); } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { params_.bwAxisScaleOffsetEncoding.axis = static_cast(j); + } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION && + params_.blockwiseExpansion != nullptr) { + params_.blockwiseExpansion->axis = static_cast(j); } else { return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index c9afad2ee4..e1834840a1 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -328,11 +328,14 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizationEncoding_t& en case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: out << "QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET"; break; + case QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION: + out << "QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION"; + break; case QNN_QUANTIZATION_ENCODING_UNDEFINED: out << "QNN_QUANTIZATION_ENCODING_UNDEFINED"; break; default: - 
out << "Uknown quantization encoding"; + out << "Unknown quantization encoding"; } return out; } @@ -378,6 +381,35 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " "); } out << (truncate ? "...)" : ")"); + } else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION && + quantize_params.blockwiseExpansion != nullptr) { + const Qnn_BlockwiseExpansion_t& lpbq = *quantize_params.blockwiseExpansion; + out << " axis=" << lpbq.axis + << " numBlocksPerAxis=" << lpbq.numBlocksPerAxis + << " blockScaleBitwidth=" << lpbq.blockScaleBitwidth; + // For lpbq, num_elems are not present in the quantize_params, + // we are using numBlocksPerAxis instead to print the first few scale offset values + size_t num_elems = lpbq.numBlocksPerAxis; + bool truncate = num_elems > 20; + num_elems = truncate ? 20 : num_elems; + if (lpbq.scaleOffsets != nullptr) { + out << " scales=("; + for (size_t i = 0; i < num_elems; i++) { + out << lpbq.scaleOffsets[i].scale << (i + 1 < num_elems ? " " : ""); + } + out << (truncate ? "...)" : ")") << " offsets=("; + for (size_t i = 0; i < num_elems; i++) { + out << lpbq.scaleOffsets[i].offset << (i + 1 < num_elems ? " " : ""); + } + out << (truncate ? "...)" : ")"); + } + if (lpbq.blocksScale8 != nullptr) { + out << " perBlockIntScales=("; + for (size_t i = 0; i < num_elems; i++) { + out << static_cast(lpbq.blocksScale8[i]) << (i + 1 < num_elems ? " " : ""); + } + out << (truncate ? 
"...)" : ")"); + } } else { out << " encoding not supported."; } @@ -1173,6 +1205,81 @@ Ort::Status DequantizePerChannel(gsl::span quant_bytes, gsl::span return Ort::Status(); } +Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span bq_scales, + gsl::span bq_offsets, + uint32_t num_blocks_per_channel, + uint32_t num_channels, + uint32_t bitwidth, + /*out*/ std::vector& per_channel_scales, + /*out*/ std::vector& per_block_int_scales, + /*out*/ std::vector& offsets) { + RETURN_IF_NOT(bq_scales.size() == static_cast(num_blocks_per_channel) * num_channels, + "BQ scales size does not match num_blocks_per_channel * num_channels"); + RETURN_IF_NOT(bq_offsets.empty() || bq_offsets.size() == bq_scales.size(), + "BQ offsets size must be empty or equal to bq_scales size"); + RETURN_IF_NOT(bitwidth > 0 && bitwidth <= 16, "bitwidth must be in range [1, 16]"); + + const uint32_t max_int_scale = 1u << bitwidth; // 2^bitwidth + + // Require symmetric quantization (all offsets must be zero). + if (!bq_offsets.empty()) { + for (size_t i = 0; i < bq_offsets.size(); ++i) { + RETURN_IF_NOT(bq_offsets[i] == 0, + "LPBQ conversion requires symmetric quantization (all block zero-points must be 0)"); + } + } + + // Validate that all scales are non-negative and finite. + for (size_t i = 0; i < bq_scales.size(); ++i) { + RETURN_IF_NOT(std::isfinite(bq_scales[i]) && bq_scales[i] >= 0.0f, + "BQ scales must be non-negative and finite"); + } + + // Algorithm: + // max_int_scale = 2^bitwidth + // per_channel_scale[c] = max(bq_scales[:, c]) / max_int_scale + // per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale) + // + // Note: This conversion is inherently approximate — the block scales are arbitrary floats and + // are rounded to the nearest integer multiple of per_channel_scale. The rounding error is + // bounded by 0.5 * per_channel_scale per block, which is the expected LPBQ quantization noise. 
+ + per_channel_scales.resize(num_channels, 0.0f); + per_block_int_scales.resize(static_cast(num_channels) * num_blocks_per_channel, 0); + offsets.assign(num_channels, 0); + + // Step 1: Compute per-channel float scales. + // bq_scales is in block-major order: bq_scales[b * num_channels + c] + for (uint32_t c = 0; c < num_channels; ++c) { + float max_scale = 0.0f; + for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { + float s = bq_scales[static_cast(b) * num_channels + c]; + if (s > max_scale) max_scale = s; + } + per_channel_scales[c] = max_scale / static_cast(max_int_scale); + } + + // Step 2: Compute per-block integer scales in channel-major order. + // Output layout: per_block_int_scales[c * num_blocks_per_channel + b] + for (uint32_t c = 0; c < num_channels; ++c) { + const float pc_scale = per_channel_scales[c]; + for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { + const float raw_scale = bq_scales[static_cast(b) * num_channels + c]; + uint8_t int_scale; + if (pc_scale <= 0.0f) { + int_scale = 1; + } else { + const float tentative = std::round(raw_scale / pc_scale); + const uint32_t clamped = std::max(1u, std::min(static_cast(tentative), max_int_scale)); + int_scale = static_cast(clamped); + } + per_block_int_scales[static_cast(c) * num_blocks_per_channel + b] = int_scale; + } + } + + return Ort::Status(); +} + /** * @brief QuantizeData with LPBQ encodings (per_channel_float_scales, per_block_int_scales) * @pre-condition data should have axis at 0 diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 1207929673..3b15b12a4f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -267,6 +267,34 @@ Ort::Status QuantizeData(gsl::span data, gsl::span /*out*/ gsl::span quant_bytes, Qnn_DataType_t data_type, std::optional axis = std::nullopt); +// Converts ONNX block quantization (BQ) scales to QNN LPBQ 
(BLOCKWISE_EXPANSION) format. +// Supports both int8 (bitwidth=8) and int4 (bitwidth=4) weight block quantization. +// +// The ONNX BQ scale tensor has shape [num_blocks_per_channel, num_channels] in block-major order +// (i.e., the block axis is axis 0 and the channel axis is axis 1). If the ONNX block axis is 1 +// instead of 0, the caller must transpose the scale data before calling this function. +// +// Algorithm : +// max_int_scale = 2^bitwidth (256 for int8, 16 for int4) +// per_channel_scale[c] = max(bq_scales[:, c]) / max_int_scale +// per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale) +// +// The output per_block_int_scales is in channel-major order [num_channels * num_blocks_per_channel], +// which is the layout required by QNN LPBQ (BLOCKWISE_EXPANSION). +// +// Returns failure if: +// - The encoding is asymmetric (non-zero offsets), which LPBQ does not support. +// - Any block scale is negative or non-finite. +// - Input sizes are inconsistent. +Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span bq_scales, + gsl::span bq_offsets, + uint32_t num_blocks_per_channel, + uint32_t num_channels, + uint32_t bitwidth, + /*out*/ std::vector& per_channel_scales, + /*out*/ std::vector& per_block_int_scales, + /*out*/ std::vector& offsets); + // Quantizes the given float data using the provided Low Power Block Quantization parameters // (float channel_scales, int block_scales and offsets) // The provided offsets must use the QNN convention where offset = -zero_point. diff --git a/onnxruntime/core/providers/qnn/ort_api.cc b/onnxruntime/core/providers/qnn/ort_api.cc index c3a70eabe2..1b31b4fcd5 100644 --- a/onnxruntime/core/providers/qnn/ort_api.cc +++ b/onnxruntime/core/providers/qnn/ort_api.cc @@ -160,11 +160,12 @@ std::vector GetQDQIODefs(const OrtNode* target_node, continue; } - // Get the Q/DQ axis attribute if available. + // Get the Q/DQ axis and block_size attributes if available. 
std::optional axis = OrtNodeAttrHelper(*node).GetInt64("axis"); + std::optional block_size = OrtNodeAttrHelper(*node).GetInt64("block_size"); // Quantization scale and zp are always the input[1, 2]. - OrtNodeUnitIODef::QuantParam quant_param{node_inputs[1], num_node_inputs == 3 ? node_inputs[2] : nullptr, axis}; + OrtNodeUnitIODef::QuantParam quant_param{node_inputs[1], num_node_inputs == 3 ? node_inputs[2] : nullptr, axis, block_size}; OrtNodeUnitIODef io_def; if (is_input) { @@ -264,7 +265,8 @@ OrtStatus* OrtNodeUnit::InitForSingleNode(const OrtApi& ort_api) { if (std::string(op_type) == "DequantizeLinear") { std::optional axis = OrtNodeAttrHelper(*target_node_).GetInt64("axis"); - OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis}; + std::optional block_size = OrtNodeAttrHelper(*target_node_).GetInt64("block_size"); + OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis, block_size}; OrtNodeUnitIODef input_def, output_def; auto input_status = ParseOrtValueInfo(inputs_data[0], quant_param, ort_api, input_def); @@ -283,7 +285,8 @@ OrtStatus* OrtNodeUnit::InitForSingleNode(const OrtApi& ort_api) { outputs_.push_back(output_def); } else if (std::string(op_type) == "QuantizeLinear") { std::optional axis = OrtNodeAttrHelper(*target_node_).GetInt64("axis"); - OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis}; + std::optional block_size = OrtNodeAttrHelper(*target_node_).GetInt64("block_size"); + OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? 
inputs_data[2] : nullptr, axis, block_size}; OrtNodeUnitIODef input_def, output_def; auto input_status = ParseOrtValueInfo(inputs_data[0], std::nullopt, ort_api, input_def); diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index c4905f44b4..855c509fe2 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -247,6 +247,7 @@ struct OrtNodeUnitIODef { const OrtValueInfo* scale; const OrtValueInfo* zero_point{nullptr}; std::optional axis{std::nullopt}; + std::optional block_size{std::nullopt}; }; std::string name; From 6fd25fc3282483192344761a618b245a043e1c75 Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Mon, 18 May 2026 10:14:29 +0530 Subject: [PATCH 2/8] address review comments --- .../builder/opbuilder/matmul_op_builder.cc | 1 - .../qnn/builder/qnn_quant_params_wrapper.cc | 81 ++++++++++--------- .../core/providers/qnn/builder/qnn_utils.cc | 24 +++--- .../core/providers/qnn/builder/qnn_utils.h | 4 +- 4 files changed, 57 insertions(+), 53 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index c0bb28759e..d38303506b 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -130,7 +130,6 @@ Ort::Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeU !input_info_0.is_initializer && IsQuant16bit(input_info_1.qnn_data_type) && !input_info_1.is_initializer); - use_fully_connected = use_fully_connected && !input_info_1.quant_param.IsLPBQ(); #endif return Ort::Status(); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index 225b4f7a50..7c81c1ad47 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ 
b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -150,7 +150,7 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span per_channel_ } lpbq.numBlocksPerAxis = static_cast(per_block_int_scales.size()) / num_elems; - lpbq.blockScaleBitwidth = is_int4 ? 4 : 0; + lpbq.blockScaleBitwidth = is_int4 ? 4 : 8; lpbq.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8; // Deep copy the block int scales @@ -483,14 +483,15 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper (onnx_tp_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4); } - const bool is_per_tensor = scales.size() == 1; const bool is_block_quant = ort_quant_params->block_size.has_value() && ort_quant_params->block_size.value() > 0; const bool is_per_channel = scales.size() > 1 && !is_block_quant; + const bool is_per_tensor = scales.size() == 1 && !is_block_quant; - // QNN uses different structs to represent quantization parameters depending on - // - per-tensor vs per-channel - // - int4 vs not int4 - // - block quantization (LPBQ / BLOCKWISE_EXPANSION) + // QNN uses different structs to represent quantization parameters depending on: + // - per-tensor (scales.size()==1, no block_size): SCALE_OFFSET or BW_SCALE_OFFSET + // - per-channel (scales.size()>1, no block_size): AXIS_SCALE_OFFSET or BW_AXIS_SCALE_OFFSET + // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ) + // - fallback: error if (is_per_tensor && !is_int4_type) { params_.encodingDefinition = QNN_DEFINITION_DEFINED; params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; @@ -596,31 +597,44 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper } else if (is_block_quant) { // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion. 
+ std::vector io_shape; + RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); + const int32_t io_rank = static_cast(io_shape.size()); + // Get scale tensor shape to determine block/channel dimensions. + // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights). + // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion. const std::vector scale_shape = utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); - RETURN_IF_NOT(scale_shape.size() == 2, - "Only 2D block quantization scale tensors are supported for LPBQ conversion"); - - // Determine block axis (= ONNX axis attribute, default 0). - constexpr int64_t kDefaultBlockAxis = 0; - int64_t onnx_axis = ort_quant_params->axis.value_or(kDefaultBlockAxis); - if (onnx_axis < 0) onnx_axis += static_cast(scale_shape.size()); - RETURN_IF_NOT(onnx_axis == 0 || onnx_axis == 1, - "Only axis 0 or 1 is supported for 2D block quantization LPBQ conversion"); - - // Scale shape: [num_blocks_per_channel, num_channels] when block_axis=0 - // [num_channels, num_blocks_per_channel] when block_axis=1 - const uint32_t num_blocks_per_channel = static_cast(scale_shape[onnx_axis]); - const uint32_t num_channels = static_cast(scale_shape[1 - onnx_axis]); + RETURN_IF_NOT(scale_shape.size() >= 2, + "Block quantization scale tensors must have at least rank 2 for LPBQ conversion"); + RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0, + "Block quantization scale tensor dimensions must be positive"); + RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast(scales.size()), + "Block quantization scale tensor shape product must equal number of scales"); + + // Determine block axis (= ONNX axis attribute). 
+ constexpr int64_t DEFAULT_QDQ_AXIS = 1; + int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS); + if (axis < 0) axis += io_rank; + RETURN_IF_NOT(axis == 0 || axis == 1, + "Only axis 0 or 1 is supported for block quantization LPBQ conversion"); + + // Scale shape: [num_blocks_per_channel, num_channels] when axis=0 + // [num_channels, num_blocks_per_channel] when axis=1 + const uint32_t num_blocks_per_channel = static_cast(scale_shape[axis]); + const uint32_t num_channels = static_cast(scale_shape[1 - axis]); + + // LPBQ requires symmetric quantization (all zero-points must be zero). + for (const int32_t zp : zero_points) { + RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization"); + } // The conversion algorithm expects scales in block-major order [num_blocks, num_channels]. - // If block_axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. + // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. std::vector bq_scales_bm; - std::vector bq_offsets_bm; - if (onnx_axis == 0) { + if (axis == 0) { bq_scales_bm = std::move(scales); - bq_offsets_bm = std::move(zero_points); } else { // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels] bq_scales_bm.resize(scales.size()); @@ -630,15 +644,6 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper scales[static_cast(c) * num_blocks_per_channel + b]; } } - if (!zero_points.empty()) { - bq_offsets_bm.resize(zero_points.size()); - for (uint32_t c = 0; c < num_channels; ++c) { - for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { - bq_offsets_bm[static_cast(b) * num_channels + c] = - zero_points[static_cast(c) * num_blocks_per_channel + b]; - } - } - } } // Apply BQ -> LPBQ algorithm @@ -646,15 +651,15 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper std::vector per_channel_scales; std::vector per_block_int_scales; std::vector lpbq_offsets; - const uint32_t 
kBitwidth = is_int4_type ? 4u : 8u; - RETURN_IF_ERROR(utils::TryConvertBlockQuantScalesToLpbq( - bq_scales_bm, bq_offsets_bm, - num_blocks_per_channel, num_channels, kBitwidth, + const uint32_t bitwidth = is_int4_type ? 4u : 8u; + RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq( + bq_scales_bm, zero_points, + num_blocks_per_channel, num_channels, bitwidth, per_channel_scales, per_block_int_scales, lpbq_offsets)); // QNN LPBQ axis = the non-block axis in the weight tensor. // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0. - const int64_t qnn_axis = 1 - onnx_axis; + const int64_t qnn_axis = 1 - axis; *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets, qnn_axis, ort_quant_params->block_size.value(), is_int4_type); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index e1834840a1..5cf0637c18 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -387,8 +387,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize out << " axis=" << lpbq.axis << " numBlocksPerAxis=" << lpbq.numBlocksPerAxis << " blockScaleBitwidth=" << lpbq.blockScaleBitwidth; - // For lpbq, num_elems are not present in the quantize_params, - // we are using numBlocksPerAxis instead to print the first few scale offset values + // For LPBQ, num_elems are not present in the quantize_params, + // we are using numBlocksPerAxis instead to print the first numBlocksPerAxis scale offset values size_t num_elems = lpbq.numBlocksPerAxis; bool truncate = num_elems > 20; num_elems = truncate ? 
20 : num_elems; @@ -1205,21 +1205,21 @@ Ort::Status DequantizePerChannel(gsl::span quant_bytes, gsl::span return Ort::Status(); } -Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span bq_scales, - gsl::span bq_offsets, - uint32_t num_blocks_per_channel, - uint32_t num_channels, - uint32_t bitwidth, - /*out*/ std::vector& per_channel_scales, - /*out*/ std::vector& per_block_int_scales, - /*out*/ std::vector& offsets) { +Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span bq_scales, + gsl::span bq_offsets, + uint32_t num_blocks_per_channel, + uint32_t num_channels, + uint32_t bitwidth, + /*out*/ std::vector& per_channel_scales, + /*out*/ std::vector& per_block_int_scales, + /*out*/ std::vector& offsets) { RETURN_IF_NOT(bq_scales.size() == static_cast(num_blocks_per_channel) * num_channels, "BQ scales size does not match num_blocks_per_channel * num_channels"); RETURN_IF_NOT(bq_offsets.empty() || bq_offsets.size() == bq_scales.size(), "BQ offsets size must be empty or equal to bq_scales size"); RETURN_IF_NOT(bitwidth > 0 && bitwidth <= 16, "bitwidth must be in range [1, 16]"); - const uint32_t max_int_scale = 1u << bitwidth; // 2^bitwidth + const uint32_t max_int_scale = (1u << bitwidth) - 1u; // Require symmetric quantization (all offsets must be zero). 
if (!bq_offsets.empty()) { @@ -1236,7 +1236,7 @@ Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span bq_scales, } // Algorithm: - // max_int_scale = 2^bitwidth + // max_int_scale = 2^bitwidth - 1 // per_channel_scale[c] = max(bq_scales[:, c]) / max_int_scale // per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale) // diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 3b15b12a4f..f16b768b92 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -275,7 +275,7 @@ Ort::Status QuantizeData(gsl::span data, gsl::span // instead of 0, the caller must transpose the scale data before calling this function. // // Algorithm : -// max_int_scale = 2^bitwidth (256 for int8, 16 for int4) +// max_int_scale = 2^bitwidth - 1 (255 for int8, 15 for int4) // per_channel_scale[c] = max(bq_scales[:, c]) / max_int_scale // per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale) // @@ -286,7 +286,7 @@ Ort::Status QuantizeData(gsl::span data, gsl::span // - The encoding is asymmetric (non-zero offsets), which LPBQ does not support. // - Any block scale is negative or non-finite. // - Input sizes are inconsistent. 
-Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span bq_scales, +Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span bq_scales, gsl::span bq_offsets, uint32_t num_blocks_per_channel, uint32_t num_channels, From 09ac5dcd155ca0ab5c62e2fe43c59a0151dc9ad3 Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Mon, 18 May 2026 10:19:55 +0530 Subject: [PATCH 3/8] address review comments --- onnxruntime/core/providers/qnn/builder/qnn_utils.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index f16b768b92..72f1065ad5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -287,13 +287,13 @@ Ort::Status QuantizeData(gsl::span data, gsl::span // - Any block scale is negative or non-finite. // - Input sizes are inconsistent. Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span bq_scales, - gsl::span bq_offsets, - uint32_t num_blocks_per_channel, - uint32_t num_channels, - uint32_t bitwidth, - /*out*/ std::vector& per_channel_scales, - /*out*/ std::vector& per_block_int_scales, - /*out*/ std::vector& offsets); + gsl::span bq_offsets, + uint32_t num_blocks_per_channel, + uint32_t num_channels, + uint32_t bitwidth, + /*out*/ std::vector& per_channel_scales, + /*out*/ std::vector& per_block_int_scales, + /*out*/ std::vector& offsets); // Quantizes the given float data using the provided Low Power Block Quantization parameters // (float channel_scales, int block_scales and offsets) From d2e886a655d42155a8cc472e1e565ea0fb52f742 Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Wed, 3 Jun 2026 12:06:23 +0530 Subject: [PATCH 4/8] address review comments --- .../core/providers/qnn/builder/qnn_quant_params_wrapper.cc | 6 +++--- onnxruntime/core/providers/qnn/builder/qnn_utils.cc | 6 +++--- onnxruntime/core/providers/qnn/builder/qnn_utils.h | 4 ++-- 3 files changed, 8 insertions(+), 8 
deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index 7c81c1ad47..d6e8102021 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -490,7 +490,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper // QNN uses different structs to represent quantization parameters depending on: // - per-tensor (scales.size()==1, no block_size): SCALE_OFFSET or BW_SCALE_OFFSET // - per-channel (scales.size()>1, no block_size): AXIS_SCALE_OFFSET or BW_AXIS_SCALE_OFFSET - // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ) + // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ) or ENCODING_BLOCK (BQ) // - fallback: error if (is_per_tensor && !is_int4_type) { params_.encodingDefinition = QNN_DEFINITION_DEFINED; @@ -594,8 +594,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper params_.axisScaleOffsetEncoding.axis = static_cast(axis); params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast(num_elems); params_.axisScaleOffsetEncoding.scaleOffset = data_span.data(); - } else if (is_block_quant) { - // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion. + } else if (is_block_quant && is_int4_type) { + // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit. 
std::vector io_shape; RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 5cf0637c18..9730ed5ab1 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -1217,9 +1217,9 @@ Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span bq_scales, "BQ scales size does not match num_blocks_per_channel * num_channels"); RETURN_IF_NOT(bq_offsets.empty() || bq_offsets.size() == bq_scales.size(), "BQ offsets size must be empty or equal to bq_scales size"); - RETURN_IF_NOT(bitwidth > 0 && bitwidth <= 16, "bitwidth must be in range [1, 16]"); + RETURN_IF_NOT(bitwidth == 4, "BQ to LPBQ conversion is only supported for 4-bit"); - const uint32_t max_int_scale = (1u << bitwidth) - 1u; + const uint32_t max_int_scale = (1u << bitwidth); // 2^bitwidth // Require symmetric quantization (all offsets must be zero). if (!bq_offsets.empty()) { @@ -1236,7 +1236,7 @@ Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span bq_scales, } // Algorithm: - // max_int_scale = 2^bitwidth - 1 + // max_int_scale = 2^bitwidth // per_channel_scale[c] = max(bq_scales[:, c]) / max_int_scale // per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale) // diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 72f1065ad5..e1044d30e7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -268,14 +268,14 @@ Ort::Status QuantizeData(gsl::span data, gsl::span std::optional axis = std::nullopt); // Converts ONNX block quantization (BQ) scales to QNN LPBQ (BLOCKWISE_EXPANSION) format. -// Supports both int8 (bitwidth=8) and int4 (bitwidth=4) weight block quantization. 
+// Supports int4 (bitwidth=4) weight block quantization. // // The ONNX BQ scale tensor has shape [num_blocks_per_channel, num_channels] in block-major order // (i.e., the block axis is axis 0 and the channel axis is axis 1). If the ONNX block axis is 1 // instead of 0, the caller must transpose the scale data before calling this function. // // Algorithm : -// max_int_scale = 2^bitwidth - 1 (255 for int8, 15 for int4) +// max_int_scale = 2^bitwidth (16 for int4) // per_channel_scale[c] = max(bq_scales[:, c]) / max_int_scale // per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale) // From 965a2817c359de00d2dfb4443dd78c3a22215535 Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Wed, 3 Jun 2026 14:10:30 +0530 Subject: [PATCH 5/8] fix lint --- onnxruntime/core/providers/qnn/builder/qnn_utils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 9730ed5ab1..188327d3f5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -1219,7 +1219,7 @@ Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span bq_scales, "BQ offsets size must be empty or equal to bq_scales size"); RETURN_IF_NOT(bitwidth == 4, "BQ to LPBQ conversion is only supported for 4-bit"); - const uint32_t max_int_scale = (1u << bitwidth); // 2^bitwidth + const uint32_t max_int_scale = (1u << bitwidth); // 2^bitwidth // Require symmetric quantization (all offsets must be zero). 
if (!bq_offsets.empty()) { From 24b4741ee3c2cca75b72a695f3cc5ca28f6eff19 Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Mon, 15 Jun 2026 16:46:10 +0530 Subject: [PATCH 6/8] add provider option for bq to lpbq conversion --- .../QNN-ExecutionProvider.md | 7 +- .../builder/opbuilder/matmul_op_builder.cc | 4 + .../providers/qnn/builder/qnn_model_wrapper.h | 1 + .../qnn/builder/qnn_quant_params_wrapper.cc | 132 +++++++++--------- .../providers/qnn/qnn_execution_provider.cc | 6 + onnxruntime/test/providers/qnn/conv_test.cc | 1 + onnxruntime/test/providers/qnn/matmul_test.cc | 1 + 7 files changed, 86 insertions(+), 66 deletions(-) diff --git a/docs/execution_providers/QNN-ExecutionProvider.md b/docs/execution_providers/QNN-ExecutionProvider.md index b28fee9a31..2f5102e61c 100644 --- a/docs/execution_providers/QNN-ExecutionProvider.md +++ b/docs/execution_providers/QNN-ExecutionProvider.md @@ -197,6 +197,11 @@ Refer to the [QAIRT SDK documentation](https://docs.qualcomm.com/doc/80-63442-10 |'0'|Disabled. QNN EP will handle quantization and dequantization of graph I/O.| |'1'|Default. Enabled. Offload quantization and dequantization of graph I/O to CPU EP.| +|`"convert_bq_to_lpbq"`|Description| +|---|---| +|'0'|Disabled. Block quantized model will run with block quantized weight encodings and float activations.| +|'1'|Default. Enabled. Block quantized weight encodings will be converted to Low Power Block Quantized encodings.| + |`"enable_htp_shared_memory_allocator"`|Description| |---|---| |'0'|Default. Disabled.| @@ -903,7 +908,7 @@ session = ort.InferenceSession("model.onnx", sess_options=sess_options) ### Important Considerations #### Feature Disabled if Number of Subgraphs is Less Than 5 -While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. 
For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance. +While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance. #### Feature Disabled if `num_graph_prepare_threads` is 1 This defeats the purpose of the feature, and enabling the feature will only add additional overhead from thread pool creation. diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index d38303506b..5fae3bdca5 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -89,6 +89,10 @@ bool IsBQWeight(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeUnitIODef if (num_blocks <= 0 || static_cast(weight_shape[k_axis]) % num_blocks != 0) { return false; } + // Go for BQ FP16 only if LPBQ conversion is set to false + if (qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) { + return false; + } return true; } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 0055dbaf70..94c708f2b6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -38,6 +38,7 @@ struct ModelSettings { bool offload_graph_io_quantization = false; bool htp_shared_memory = false; bool htp_bf16_enable = false; + bool convert_bq_to_lpbq = true; }; class QnnModelWrapper { diff 
--git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index d6e8102021..e42573460c 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -594,75 +594,77 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper params_.axisScaleOffsetEncoding.axis = static_cast(axis); params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast(num_elems); params_.axisScaleOffsetEncoding.scaleOffset = data_span.data(); - } else if (is_block_quant && is_int4_type) { - // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit. - - std::vector io_shape; - RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); - const int32_t io_rank = static_cast(io_shape.size()); - - // Get scale tensor shape to determine block/channel dimensions. - // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights). - // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion. - const std::vector scale_shape = - utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); - RETURN_IF_NOT(scale_shape.size() >= 2, - "Block quantization scale tensors must have at least rank 2 for LPBQ conversion"); - RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0, - "Block quantization scale tensor dimensions must be positive"); - RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast(scales.size()), - "Block quantization scale tensor shape product must equal number of scales"); - - // Determine block axis (= ONNX axis attribute). 
- constexpr int64_t DEFAULT_QDQ_AXIS = 1; - int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS); - if (axis < 0) axis += io_rank; - RETURN_IF_NOT(axis == 0 || axis == 1, - "Only axis 0 or 1 is supported for block quantization LPBQ conversion"); - - // Scale shape: [num_blocks_per_channel, num_channels] when axis=0 - // [num_channels, num_blocks_per_channel] when axis=1 - const uint32_t num_blocks_per_channel = static_cast(scale_shape[axis]); - const uint32_t num_channels = static_cast(scale_shape[1 - axis]); - - // LPBQ requires symmetric quantization (all zero-points must be zero). - for (const int32_t zp : zero_points) { - RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization"); - } + } else if (is_block_quant) { + if (is_int4_type && qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) { + // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit. + + std::vector io_shape; + RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); + const int32_t io_rank = static_cast(io_shape.size()); + + // Get scale tensor shape to determine block/channel dimensions. + // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights). + // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion. + const std::vector scale_shape = + utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); + RETURN_IF_NOT(scale_shape.size() >= 2, + "Block quantization scale tensors must have at least rank 2 for LPBQ conversion"); + RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0, + "Block quantization scale tensor dimensions must be positive"); + RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast(scales.size()), + "Block quantization scale tensor shape product must equal number of scales"); + + // Determine block axis (= ONNX axis attribute). 
+ constexpr int64_t DEFAULT_QDQ_AXIS = 1; + int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS); + if (axis < 0) axis += io_rank; + RETURN_IF_NOT(axis == 0 || axis == 1, + "Only axis 0 or 1 is supported for block quantization LPBQ conversion"); + + // Scale shape: [num_blocks_per_channel, num_channels] when axis=0 + // [num_channels, num_blocks_per_channel] when axis=1 + const uint32_t num_blocks_per_channel = static_cast(scale_shape[axis]); + const uint32_t num_channels = static_cast(scale_shape[1 - axis]); + + // LPBQ requires symmetric quantization (all zero-points must be zero). + for (const int32_t zp : zero_points) { + RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization"); + } - // The conversion algorithm expects scales in block-major order [num_blocks, num_channels]. - // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. - std::vector bq_scales_bm; - if (axis == 0) { - bq_scales_bm = std::move(scales); - } else { - // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels] - bq_scales_bm.resize(scales.size()); - for (uint32_t c = 0; c < num_channels; ++c) { - for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { - bq_scales_bm[static_cast(b) * num_channels + c] = - scales[static_cast(c) * num_blocks_per_channel + b]; + // The conversion algorithm expects scales in block-major order [num_blocks, num_channels]. + // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. 
+ std::vector bq_scales_bm; + if (axis == 0) { + bq_scales_bm = std::move(scales); + } else { + // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels] + bq_scales_bm.resize(scales.size()); + for (uint32_t c = 0; c < num_channels; ++c) { + for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { + bq_scales_bm[static_cast(b) * num_channels + c] = + scales[static_cast(c) * num_blocks_per_channel + b]; + } } } - } - // Apply BQ -> LPBQ algorithm - // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights. - std::vector per_channel_scales; - std::vector per_block_int_scales; - std::vector lpbq_offsets; - const uint32_t bitwidth = is_int4_type ? 4u : 8u; - RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq( - bq_scales_bm, zero_points, - num_blocks_per_channel, num_channels, bitwidth, - per_channel_scales, per_block_int_scales, lpbq_offsets)); - - // QNN LPBQ axis = the non-block axis in the weight tensor. - // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0. - const int64_t qnn_axis = 1 - axis; - - *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets, - qnn_axis, ort_quant_params->block_size.value(), is_int4_type); + // Apply BQ -> LPBQ algorithm + // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights. + std::vector per_channel_scales; + std::vector per_block_int_scales; + std::vector lpbq_offsets; + const uint32_t bitwidth = is_int4_type ? 4u : 8u; + RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq( + bq_scales_bm, zero_points, + num_blocks_per_channel, num_channels, bitwidth, + per_channel_scales, per_block_int_scales, lpbq_offsets)); + + // QNN LPBQ axis = the non-block axis in the weight tensor. + // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0. 
+ const int64_t qnn_axis = 1 - axis; + + *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets, + qnn_axis, ort_quant_params->block_size.value(), is_int4_type); + } } else { return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()"); } diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 0b877f0295..f1e88d87ee 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -909,6 +909,12 @@ QnnEp::QnnEp(QnnEpFactory& factory, } #endif + model_settings_.convert_bq_to_lpbq = ParseBoolOption(ort_api, + session_options_, + FormatEPConfigKey("convert_bq_to_lpbq"), + true, + logger_); + if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) { ORT_CXX_LOG(logger_, ORT_LOGGING_LEVEL_INFO, diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index e27ac4a173..76d3bce210 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -3205,6 +3205,7 @@ ProviderOptions GetBQConvProviderOptions() { ProviderOptions opts; opts["backend_type"] = "htp"; opts["offload_graph_io_quantization"] = "0"; + opts["convert_bq_to_lpbq"] = "0"; #if defined(__linux__) && !defined(__aarch64__) // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support. // On real ARM64 hardware, the SoC model is auto-detected by QNN EP. 
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cc b/onnxruntime/test/providers/qnn/matmul_test.cc index 4084fa0529..2d7b489b43 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cc +++ b/onnxruntime/test/providers/qnn/matmul_test.cc @@ -362,6 +362,7 @@ ProviderOptions GetBQMatMulProviderOptions() { ProviderOptions opts; opts["backend_type"] = "htp"; opts["offload_graph_io_quantization"] = "0"; + opts["convert_bq_to_lpbq"] = "0"; #if defined(__linux__) && !defined(__aarch64__) // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support. // On real ARM64 hardware, the SoC model is auto-detected by QNN EP. From 0131740f31800af002800abb7886dfba60d5febe Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Wed, 17 Jun 2026 17:00:16 +0530 Subject: [PATCH 7/8] address review comments --- .../QNN-ExecutionProvider.md | 6 +- .../builder/opbuilder/matmul_op_builder.cc | 16 +- .../providers/qnn/builder/qnn_model_wrapper.h | 4 +- .../qnn/builder/qnn_quant_params_wrapper.cc | 142 ++++++++++-------- .../qnn/builder/qnn_quant_params_wrapper.h | 2 +- .../providers/qnn/qnn_execution_provider.cc | 10 +- onnxruntime/test/providers/qnn/conv_test.cc | 2 +- onnxruntime/test/providers/qnn/matmul_test.cc | 2 +- 8 files changed, 98 insertions(+), 86 deletions(-) diff --git a/docs/execution_providers/QNN-ExecutionProvider.md b/docs/execution_providers/QNN-ExecutionProvider.md index 2f5102e61c..dd64673552 100644 --- a/docs/execution_providers/QNN-ExecutionProvider.md +++ b/docs/execution_providers/QNN-ExecutionProvider.md @@ -197,10 +197,10 @@ Refer to the [QAIRT SDK documentation](https://docs.qualcomm.com/doc/80-63442-10 |'0'|Disabled. QNN EP will handle quantization and dequantization of graph I/O.| |'1'|Default. Enabled. Offload quantization and dequantization of graph I/O to CPU EP.| -|`"convert_bq_to_lpbq"`|Description| +|`"enable_block_quant_weight_optimization"`|Description| |---|---| -|'0'|Disabled. 
Block quantized model will run with block quantized weight encodings and float activations.| -|'1'|Default. Enabled. Block quantized weight encodings will be converted to Low Power Block Quantized encodings.| +|`"0"`|Default. Disabled. Block-quantized models use the standard compatibility path.| +|`"1"`|Enabled. Uses an optimized path for block-quantized weights when supported. If the optimized path is not available, QNN EP falls back to the standard compatibility path.| |`"enable_htp_shared_memory_allocator"`|Description| |---|---| diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index 5fae3bdca5..d6181e8290 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include @@ -89,10 +89,6 @@ bool IsBQWeight(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeUnitIODef if (num_blocks <= 0 || static_cast(weight_shape[k_axis]) % num_blocks != 0) { return false; } - // Go for BQ FP16 only if LPBQ conversion is set to false - if (qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) { - return false; - } return true; } @@ -250,17 +246,17 @@ Ort::Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, c bool do_op_validation) const { const auto& inputs = node_unit.Inputs(); - // Block-quantized weight: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight. 
- if (IsBQWeight(qnn_model_wrapper, inputs[1])) { - return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation); - } - TensorInfo input_info_0{}; TensorInfo input_info_1{}; bool use_fully_connected = false; RETURN_IF_ERROR( CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected)); + // Block-quantized weight: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight. + if (IsBQWeight(qnn_model_wrapper, inputs[1]) && !input_info_1.quant_param.IsLPBQ()) { + return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation); + } + if (use_fully_connected) { return ProcessInputsForQnnFullyConnected(qnn_model_wrapper, node_unit, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 94c708f2b6..f05a95ac27 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -38,7 +38,7 @@ struct ModelSettings { bool offload_graph_io_quantization = false; bool htp_shared_memory = false; bool htp_bf16_enable = false; - bool convert_bq_to_lpbq = true; + bool enable_block_quant_weight_optimization = false; }; class QnnModelWrapper { @@ -364,6 +364,8 @@ class QnnModelWrapper { const OrtGraph& GetOrtGraph() const { return ort_graph_; } + const Ort::Logger& GetLogger() const { return logger_; } + const std::unordered_map& GetModelTensorsMap() const { return model_tensors_map_; } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index e42573460c..f19ca37974 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -595,76 +595,90 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper 
params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast(num_elems); params_.axisScaleOffsetEncoding.scaleOffset = data_span.data(); } else if (is_block_quant) { - if (is_int4_type && qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) { - // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit. - - std::vector io_shape; - RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); - const int32_t io_rank = static_cast(io_shape.size()); - - // Get scale tensor shape to determine block/channel dimensions. - // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights). - // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion. - const std::vector scale_shape = - utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); - RETURN_IF_NOT(scale_shape.size() >= 2, - "Block quantization scale tensors must have at least rank 2 for LPBQ conversion"); - RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0, - "Block quantization scale tensor dimensions must be positive"); - RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast(scales.size()), - "Block quantization scale tensor shape product must equal number of scales"); - - // Determine block axis (= ONNX axis attribute). - constexpr int64_t DEFAULT_QDQ_AXIS = 1; - int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS); - if (axis < 0) axis += io_rank; - RETURN_IF_NOT(axis == 0 || axis == 1, - "Only axis 0 or 1 is supported for block quantization LPBQ conversion"); - - // Scale shape: [num_blocks_per_channel, num_channels] when axis=0 - // [num_channels, num_blocks_per_channel] when axis=1 - const uint32_t num_blocks_per_channel = static_cast(scale_shape[axis]); - const uint32_t num_channels = static_cast(scale_shape[1 - axis]); - - // LPBQ requires symmetric quantization (all zero-points must be zero). 
- for (const int32_t zp : zero_points) { - RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization"); + if (!qnn_model_wrapper.GetModelSettings().enable_block_quant_weight_optimization) { + ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE, + ("Block quant weight optimization disabled, falling back to float BQ path")); + return Ort::Status(); + } + // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit. + if (io_def.type != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4) { + ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE, + ("BQ to LPBQ conversion only supported for int4 weights, falling back to float BQ path")); + return Ort::Status(); + } + // LPBQ requires symmetric quantization (all zero-points must be zero). + for (const int32_t zp : zero_points) { + if (zp != 0) { + ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE, + ("BQ to LPBQ conversion requires symmetric quantization, falling back to float BQ path")); + return Ort::Status(); } + } - // The conversion algorithm expects scales in block-major order [num_blocks, num_channels]. - // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. - std::vector bq_scales_bm; - if (axis == 0) { - bq_scales_bm = std::move(scales); - } else { - // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels] - bq_scales_bm.resize(scales.size()); - for (uint32_t c = 0; c < num_channels; ++c) { - for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { - bq_scales_bm[static_cast(b) * num_channels + c] = - scales[static_cast(c) * num_blocks_per_channel + b]; - } + std::vector io_shape; + RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape"); + const int32_t io_rank = static_cast(io_shape.size()); + + // Get scale tensor shape to determine block/channel dimensions. 
+ // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights). + // Only the first two dimensions (indexed by axis and 1 - axis) are used for LPBQ conversion. + const std::vector scale_shape = + utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); + RETURN_IF_NOT(scale_shape.size() >= 2, + "Block quantization scale tensors must have at least rank 2 for LPBQ conversion"); + RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0, + "Block quantization scale tensor dimensions must be positive"); + RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast(scales.size()), + "Block quantization scale tensor shape product must equal number of scales"); + + // Determine block axis (= ONNX axis attribute). + constexpr int64_t DEFAULT_QDQ_AXIS = 1; + int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS); + if (axis < 0) axis += io_rank; + RETURN_IF_NOT(axis == 0 || axis == 1, + "Only axis 0 or 1 is supported for block quantization LPBQ conversion"); + + // Scale shape: [num_blocks_per_channel, num_channels] when axis=0 + // [num_channels, num_blocks_per_channel] when axis=1 + const uint32_t num_blocks_per_channel = static_cast(scale_shape[axis]); + const uint32_t num_channels = static_cast(scale_shape[1 - axis]); + + // The conversion algorithm expects scales in block-major order [num_blocks, num_channels]. + // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it. 
+ std::vector bq_scales_bm; + if (axis == 0) { + bq_scales_bm = std::move(scales); + } else { + // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels] + bq_scales_bm.resize(scales.size()); + for (uint32_t c = 0; c < num_channels; ++c) { + for (uint32_t b = 0; b < num_blocks_per_channel; ++b) { + bq_scales_bm[static_cast(b) * num_channels + c] = + scales[static_cast(c) * num_blocks_per_channel + b]; } } + } - // Apply BQ -> LPBQ algorithm - // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights. - std::vector per_channel_scales; - std::vector per_block_int_scales; - std::vector lpbq_offsets; - const uint32_t bitwidth = is_int4_type ? 4u : 8u; - RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq( - bq_scales_bm, zero_points, - num_blocks_per_channel, num_channels, bitwidth, - per_channel_scales, per_block_int_scales, lpbq_offsets)); - - // QNN LPBQ axis = the non-block axis in the weight tensor. - // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0. - const int64_t qnn_axis = 1 - axis; - - *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets, - qnn_axis, ort_quant_params->block_size.value(), is_int4_type); + // Apply BQ -> LPBQ algorithm + std::vector per_channel_scales; + std::vector per_block_int_scales; + std::vector lpbq_offsets; + const uint32_t bitwidth = 4u; + Ort::Status status = utils::ConvertBlockQuantScalesToLpbq(bq_scales_bm, zero_points, num_blocks_per_channel, + num_channels, bitwidth, per_channel_scales, + per_block_int_scales, lpbq_offsets); + if (!status.IsOK()) { + ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE, + ("BQ to LPBQ conversion failed, falling back to float BQ path: " + std::string(status.GetErrorMessage())).c_str()); + return Ort::Status(); } + + // QNN LPBQ axis = the non-block axis in the weight tensor. + // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0. 
+ const int64_t qnn_axis = 1 - axis; + + *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets, + qnn_axis, ort_quant_params->block_size.value(), is_int4_type); } else { return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()"); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index 726e276b1c..a4d5675dc7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -203,7 +203,7 @@ class QnnQuantParamsWrapper { // Stores LowPowerBlockQuant encodings meta like number of per_channel_scales, per-block scales, // and blockwise_expansion_data - uint32_t per_channel_scales_size_; + uint32_t per_channel_scales_size_ = 0; std::unique_ptr block_scales_data_; std::unique_ptr blockwise_expansion_data_; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index f1e88d87ee..449b53783b 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -909,11 +909,11 @@ QnnEp::QnnEp(QnnEpFactory& factory, } #endif - model_settings_.convert_bq_to_lpbq = ParseBoolOption(ort_api, - session_options_, - FormatEPConfigKey("convert_bq_to_lpbq"), - true, - logger_); + model_settings_.enable_block_quant_weight_optimization = ParseBoolOption(ort_api, + session_options_, + FormatEPConfigKey("enable_block_quant_weight_optimization"), + false, + logger_); if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) { ORT_CXX_LOG(logger_, diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index 76d3bce210..f5ad17ce1c 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -3205,7 +3205,7 @@ 
ProviderOptions GetBQConvProviderOptions() { ProviderOptions opts; opts["backend_type"] = "htp"; opts["offload_graph_io_quantization"] = "0"; - opts["convert_bq_to_lpbq"] = "0"; + opts["enable_block_quant_weight_optimization"] = "0"; #if defined(__linux__) && !defined(__aarch64__) // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support. // On real ARM64 hardware, the SoC model is auto-detected by QNN EP. diff --git a/onnxruntime/test/providers/qnn/matmul_test.cc b/onnxruntime/test/providers/qnn/matmul_test.cc index 2d7b489b43..2a08cdf7a9 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cc +++ b/onnxruntime/test/providers/qnn/matmul_test.cc @@ -362,7 +362,7 @@ ProviderOptions GetBQMatMulProviderOptions() { ProviderOptions opts; opts["backend_type"] = "htp"; opts["offload_graph_io_quantization"] = "0"; - opts["convert_bq_to_lpbq"] = "0"; + opts["enable_block_quant_weight_optimization"] = "0"; #if defined(__linux__) && !defined(__aarch64__) // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support. // On real ARM64 hardware, the SoC model is auto-detected by QNN EP. From ef2680e822c6ddf2471ef4799bffd0a50eee446e Mon Sep 17 00:00:00 2001 From: qti-ashimaj Date: Fri, 19 Jun 2026 13:04:08 +0530 Subject: [PATCH 8/8] fix scale size --- .../core/providers/qnn/builder/qnn_quant_params_wrapper.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index f19ca37974..4eba4f1337 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -624,8 +624,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion. 
const std::vector scale_shape = utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi()); - RETURN_IF_NOT(scale_shape.size() >= 2, - "Block quantization scale tensors must have at least rank 2 for LPBQ conversion"); + RETURN_IF_NOT(scale_shape.size() >= 2 && scale_shape.size() <= 4, + "Block quantization scale tensors must have rank between 2 and 4 for LPBQ conversion"); RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0, "Block quantization scale tensor dimensions must be positive"); RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast(scales.size()),