Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/execution_providers/QNN-ExecutionProvider.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,11 @@ Refer to the [QAIRT SDK documentation](https://docs.qualcomm.com/doc/80-63442-10
|'0'|Disabled. QNN EP will handle quantization and dequantization of graph I/O.|
|'1'|Default. Enabled. Offload quantization and dequantization of graph I/O to CPU EP.|

|`"enable_block_quant_weight_optimization"`|Description|
|---|---|
|`"0"`|Default. Disabled. Block-quantized models use the standard compatibility path.|
|`"1"`|Enabled. Uses an optimized path for block-quantized weights when supported. If the optimized path is not available, QNN EP falls back to the standard compatibility path.|

|`"enable_htp_shared_memory_allocator"`|Description|
|---|---|
|'0'|Default. Disabled.|
Expand Down Expand Up @@ -903,7 +908,7 @@ session = ort.InferenceSession("model.onnx", sess_options=sess_options)

### Important Considerations
#### Feature Disabled if Number of Subgraphs is Less Than 5
While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance.
While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance.

#### Feature Disabled if `num_graph_prepare_threads` is 1
This defeats the purpose of the feature, and enabling the feature will only add additional overhead from thread pool creation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,17 +246,17 @@ Ort::Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, c
bool do_op_validation) const {
const auto& inputs = node_unit.Inputs();

// Block-quantized weight: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight.
if (IsBQWeight(qnn_model_wrapper, inputs[1])) {
return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation);
}

TensorInfo input_info_0{};
TensorInfo input_info_1{};
bool use_fully_connected = false;
RETURN_IF_ERROR(
CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected));

// Block-quantized weight: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight.
if (IsBQWeight(qnn_model_wrapper, inputs[1]) && !input_info_1.quant_param.IsLPBQ()) {
return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation);
}

if (use_fully_connected) {
return ProcessInputsForQnnFullyConnected(qnn_model_wrapper,
node_unit,
Expand Down
3 changes: 3 additions & 0 deletions onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct ModelSettings {
bool offload_graph_io_quantization = false;
bool htp_shared_memory = false;
bool htp_bf16_enable = false;
bool enable_block_quant_weight_optimization = false;
};

class QnnModelWrapper {
Expand Down Expand Up @@ -363,6 +364,8 @@ class QnnModelWrapper {

const OrtGraph& GetOrtGraph() const { return ort_graph_; }

const Ort::Logger& GetLogger() const { return logger_; }

const std::unordered_map<std::string, QnnTensorWrapper>& GetModelTensorsMap() const {
return model_tensors_map_;
}
Expand Down
107 changes: 100 additions & 7 deletions onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "QnnTypes.h"

#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/qnn_utils.h"

#define ALIGN_PTR_UP(ptr, align, type) \
reinterpret_cast<type>((reinterpret_cast<std::uintptr_t>(ptr) + (align) - 1) & ~((align) - 1))
Expand Down Expand Up @@ -149,7 +150,7 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span<const float> per_channel_
}

lpbq.numBlocksPerAxis = static_cast<uint32_t>(per_block_int_scales.size()) / num_elems;
lpbq.blockScaleBitwidth = is_int4 ? 4 : 0;
lpbq.blockScaleBitwidth = is_int4 ? 4 : 8;
lpbq.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;

// Deep copy the block int scales
Expand Down Expand Up @@ -366,6 +367,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons
params_.encodingDefinition = params.encodingDefinition;
params_.quantizationEncoding = params.quantizationEncoding;

per_channel_scales_size_ = static_cast<uint32_t>(num_scaleoffsets);

// Deep copy the blockwiseExpansion
const size_t bwe_num_bytes = sizeof(Qnn_BlockwiseExpansion_t);
constexpr std::uintptr_t bwe_align = alignof(Qnn_BlockwiseExpansion_t);
Expand Down Expand Up @@ -399,6 +402,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons
params_.encodingDefinition = params.encodingDefinition;
params_.quantizationEncoding = params.quantizationEncoding;

num_blocks_ = static_cast<uint32_t>(num_scaleoffsets);
block_encoding_tensor_rank_ = static_cast<uint32_t>(tensor_rank);
block_encoding_axis_data_ = std::make_unique<uint32_t[]>(block_encoding_tensor_rank_);
std::memcpy(block_encoding_axis_data_.get(),
Expand Down Expand Up @@ -479,11 +483,15 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
(onnx_tp_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4);
}

const bool is_per_tensor = scales.size() == 1;
const bool is_block_quant = ort_quant_params->block_size.has_value() && ort_quant_params->block_size.value() > 0;
const bool is_per_channel = scales.size() > 1 && !is_block_quant;
const bool is_per_tensor = scales.size() == 1 && !is_block_quant;

// QNN uses different structs to represent quantization parameters depending on
// - per-tensor vs per-channel
// - int4 vs not int4
// QNN uses different structs to represent quantization parameters depending on:
// - per-tensor (scales.size()==1, no block_size): SCALE_OFFSET or BW_SCALE_OFFSET
// - per-channel (scales.size()>1, no block_size): AXIS_SCALE_OFFSET or BW_AXIS_SCALE_OFFSET
// - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ) or ENCODING_BLOCK (BQ)
// - fallback: error
if (is_per_tensor && !is_int4_type) {
params_.encodingDefinition = QNN_DEFINITION_DEFINED;
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
Expand All @@ -507,7 +515,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
} else {
params_.bwScaleOffsetEncoding.offset = 0;
}
} else if (!is_per_tensor && is_int4_type) {
} else if (is_per_channel && is_int4_type) {
std::vector<uint32_t> io_shape;
RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
const int32_t io_rank = static_cast<int32_t>(io_shape.size());
Expand Down Expand Up @@ -550,7 +558,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper

params_.bwAxisScaleOffsetEncoding.scales = scales_span.data();
params_.bwAxisScaleOffsetEncoding.offsets = zps_span.data();
} else if (!is_per_tensor && !is_int4_type) {
} else if (is_per_channel && !is_int4_type) {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this is handled elsewhere.
But before we make the BQ->LPBQ transition, besides the two constraints (is_per_channel and is_int4_type), we also need to make sure that the input and output activations of the op are integer as well. Otherwise, it is better to keep it in BQ.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a check for (is_block_quant && is_int4_type); the conversion will only happen for these data types, and not otherwise.

std::vector<uint32_t> io_shape;
RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
const int32_t io_rank = static_cast<int32_t>(io_shape.size());
Expand Down Expand Up @@ -586,6 +594,91 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast<uint32_t>(num_elems);
params_.axisScaleOffsetEncoding.scaleOffset = data_span.data();
} else if (is_block_quant) {
if (!qnn_model_wrapper.GetModelSettings().enable_block_quant_weight_optimization) {
ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
("Block quant weight optimization disabled, falling back to float BQ path"));
return Ort::Status();
}
// ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit.
if (io_def.type != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4) {
ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
("BQ to LPBQ conversion only supported for int4 weights, falling back to float BQ path"));
return Ort::Status();
}
// LPBQ requires symmetric quantization (all zero-points must be zero).
for (const int32_t zp : zero_points) {
if (zp != 0) {
ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
("BQ to LPBQ conversion requires symmetric quantization, falling back to float BQ path"));
return Ort::Status();
}
}

std::vector<uint32_t> io_shape;
RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
const int32_t io_rank = static_cast<int32_t>(io_shape.size());

// Get scale tensor shape to determine block/channel dimensions.
// Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights).
// Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
const std::vector<int64_t> scale_shape =
utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
RETURN_IF_NOT(scale_shape.size() >= 2 && scale_shape.size() <= 4,
"Block quantization scale tensors must have rank between 2 and 4 for LPBQ conversion");
RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
"Block quantization scale tensor dimensions must be positive");
RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
"Block quantization scale tensor shape product must equal number of scales");

// Determine block axis (= ONNX axis attribute).
constexpr int64_t DEFAULT_QDQ_AXIS = 1;
int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
if (axis < 0) axis += io_rank;
RETURN_IF_NOT(axis == 0 || axis == 1,
"Only axis 0 or 1 is supported for block quantization LPBQ conversion");

// Scale shape: [num_blocks_per_channel, num_channels] when axis=0
// [num_channels, num_blocks_per_channel] when axis=1
const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);

// The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
// If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
std::vector<float> bq_scales_bm;
if (axis == 0) {
bq_scales_bm = std::move(scales);
} else {
// Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
bq_scales_bm.resize(scales.size());
for (uint32_t c = 0; c < num_channels; ++c) {
for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
}
}
}

// Apply BQ -> LPBQ algorithm
std::vector<float> per_channel_scales;
std::vector<uint8_t> per_block_int_scales;
std::vector<int32_t> lpbq_offsets;
const uint32_t bitwidth = 4u;
Ort::Status status = utils::ConvertBlockQuantScalesToLpbq(bq_scales_bm, zero_points, num_blocks_per_channel,
num_channels, bitwidth, per_channel_scales,
per_block_int_scales, lpbq_offsets);
if (!status.IsOK()) {
ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
("BQ to LPBQ conversion failed, falling back to float BQ path: " + std::string(status.GetErrorMessage())).c_str());
return Ort::Status();
}

// QNN LPBQ axis = the non-block axis in the weight tensor.
// For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
const int64_t qnn_axis = 1 - axis;

*this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
} else {
return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()");
}
Expand Down
31 changes: 20 additions & 11 deletions onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,35 +106,38 @@ class QnnQuantParamsWrapper {
// Get a copy of scales. Works for both per-tensor and per-channel.
Ort::Status GetScales(/*out*/ std::vector<float>& scales) const;

// Handle transposing of a per-channel quantized tensor. The quantization parameter's axis
// must be transposed using the inverse permutation of the Transpose.
// Handle transposing of a per-channel or LPBQ quantized tensor. The quantization parameter's
// axis must be updated using the permutation of the Transpose.
template <typename IntType>
Ort::Status HandleTranspose(gsl::span<const IntType> perm) {
if (!IsPerChannel()) {
if (!IsPerChannel() && !IsLPBQ()) {
return Ort::Status();
}

if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
RETURN_IF_NOT(static_cast<size_t>(params_.axisScaleOffsetEncoding.axis) < perm.size(),
"Axis value is out of range of the provided permutation");
const int32_t new_axis = static_cast<int32_t>(perm[params_.axisScaleOffsetEncoding.axis]);
params_.axisScaleOffsetEncoding.axis = new_axis;
params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(perm[params_.axisScaleOffsetEncoding.axis]);
} else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
RETURN_IF_NOT(static_cast<size_t>(params_.bwAxisScaleOffsetEncoding.axis) < perm.size(),
"Axis value is out of range of the provided permutation");
const int32_t new_axis = static_cast<int32_t>(perm[params_.bwAxisScaleOffsetEncoding.axis]);
params_.bwAxisScaleOffsetEncoding.axis = new_axis;
params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(perm[params_.bwAxisScaleOffsetEncoding.axis]);
} else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
params_.blockwiseExpansion != nullptr) {
RETURN_IF_NOT(static_cast<size_t>(params_.blockwiseExpansion->axis) < perm.size(),
"LPBQ axis value is out of range of the provided permutation");
params_.blockwiseExpansion->axis = static_cast<int32_t>(perm[params_.blockwiseExpansion->axis]);
Comment thread
qti-kromero marked this conversation as resolved.
}

return Ort::Status();
}

// Handle "unsqueeze" of a per-channel quantized tensor. The quantization parameter's axis
// may need to be shifted if the unsqueeze inserted 1s before the quantization axis.
// Handle "unsqueeze" of a per-channel or LPBQ quantized tensor. The quantization parameter's
// axis may need to be shifted if the unsqueeze inserted 1s before the quantization axis.
template <typename IntType>
Ort::Status HandleUnsqueeze(gsl::span<const IntType> orig_shape,
gsl::span<const IntType> new_shape) {
if (!IsPerChannel()) {
if (!IsPerChannel() && !IsLPBQ()) {
return Ort::Status();
}

Expand All @@ -146,6 +149,9 @@ class QnnQuantParamsWrapper {
axis = params_.axisScaleOffsetEncoding.axis;
} else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
axis = params_.bwAxisScaleOffsetEncoding.axis;
} else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
params_.blockwiseExpansion != nullptr) {
axis = params_.blockwiseExpansion->axis;
} else {
return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str());
}
Expand Down Expand Up @@ -175,6 +181,9 @@ class QnnQuantParamsWrapper {
params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(j);
} else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(j);
} else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
params_.blockwiseExpansion != nullptr) {
params_.blockwiseExpansion->axis = static_cast<int32_t>(j);
} else {
return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str());
}
Expand All @@ -194,7 +203,7 @@ class QnnQuantParamsWrapper {

// Stores LowPowerBlockQuant encodings meta like number of per_channel_scales, per-block scales,
// and blockwise_expansion_data
uint32_t per_channel_scales_size_;
uint32_t per_channel_scales_size_ = 0;
std::unique_ptr<uint8_t[]> block_scales_data_;
std::unique_ptr<char[]> blockwise_expansion_data_;

Expand Down
Loading