onnxruntime · qti-ashimaj · Apr 27, 2026 · May 18, 2026 · May 18, 2026 · Jun 3, 2026
@@ -197,6 +197,11 @@ Refer to the [QAIRT SDK documentation](https://docs.qualcomm.com/doc/80-63442-10
 |'0'|Disabled. QNN EP will handle quantization and dequantization of graph I/O.|
 |'1'|Default. Enabled. Offload quantization and dequantization of graph I/O to CPU EP.|
 
+|`"enable_block_quant_weight_optimization"`|Description|
+|---|---|
+|`"0"`|Default. Disabled. Block-quantized models use the standard compatibility path.|
+|`"1"`|Enabled. Uses an optimized path for block-quantized weights when supported (currently int4 weights with symmetric quantization, i.e. all zero-points equal to 0). If the optimized path is not available, QNN EP falls back to the standard compatibility path.|
+
 |`"enable_htp_shared_memory_allocator"`|Description|
 |---|---|
 |'0'|Default. Disabled.|
@@ -903,7 +908,7 @@ session = ort.InferenceSession("model.onnx", sess_options=sess_options)
 
 ### Important Considerations
 #### Feature Disabled if Number of Subgraphs is Less Than 5
-While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance. 
+While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance.
 
 #### Feature Disabled if `num_graph_prepare_threads` is 1
 This defeats the purpose of the feature, and enabling the feature will only add additional overhead from thread pool creation.

@@ -246,17 +246,17 @@ Ort::Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, c
                                            bool do_op_validation) const {
   const auto& inputs = node_unit.Inputs();
 
-  // Block-quantized weight: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight.
-  if (IsBQWeight(qnn_model_wrapper, inputs[1])) {
-    return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation);
-  }
-
   TensorInfo input_info_0{};
   TensorInfo input_info_1{};
   bool use_fully_connected = false;
   RETURN_IF_ERROR(
       CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected));
 
+  // Block-quantized weight without LPBQ quant params: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight.
+  if (IsBQWeight(qnn_model_wrapper, inputs[1]) && !input_info_1.quant_param.IsLPBQ()) {
+    return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation);
+  }
+
   if (use_fully_connected) {
     return ProcessInputsForQnnFullyConnected(qnn_model_wrapper,
                                              node_unit,

@@ -38,6 +38,7 @@ struct ModelSettings {
   bool offload_graph_io_quantization = false;
   bool htp_shared_memory = false;
   bool htp_bf16_enable = false;
+  bool enable_block_quant_weight_optimization = false;
 };
 
 class QnnModelWrapper {
@@ -363,6 +364,8 @@ class QnnModelWrapper {
 
   const OrtGraph& GetOrtGraph() const { return ort_graph_; }
 
+  const Ort::Logger& GetLogger() const { return logger_; }
+
   const std::unordered_map<std::string, QnnTensorWrapper>& GetModelTensorsMap() const {
     return model_tensors_map_;
   }

@@ -11,6 +11,7 @@
 #include "QnnTypes.h"
 
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 
 #define ALIGN_PTR_UP(ptr, align, type) \
   reinterpret_cast<type>((reinterpret_cast<std::uintptr_t>(ptr) + (align) - 1) & ~((align) - 1))
@@ -149,7 +150,7 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span<const float> per_channel_
   }
 
   lpbq.numBlocksPerAxis = static_cast<uint32_t>(per_block_int_scales.size()) / num_elems;
-  lpbq.blockScaleBitwidth = is_int4 ? 4 : 0;
+  lpbq.blockScaleBitwidth = is_int4 ? 4 : 8;
   lpbq.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
 
   // Deep copy the block int scales
@@ -366,6 +367,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons
       params_.encodingDefinition = params.encodingDefinition;
       params_.quantizationEncoding = params.quantizationEncoding;
 
+      per_channel_scales_size_ = static_cast<uint32_t>(num_scaleoffsets);
+
       // Deep copy the blockwiseExpansion
       const size_t bwe_num_bytes = sizeof(Qnn_BlockwiseExpansion_t);
       constexpr std::uintptr_t bwe_align = alignof(Qnn_BlockwiseExpansion_t);
@@ -399,6 +402,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons
       params_.encodingDefinition = params.encodingDefinition;
       params_.quantizationEncoding = params.quantizationEncoding;
 
+      num_blocks_ = static_cast<uint32_t>(num_scaleoffsets);
       block_encoding_tensor_rank_ = static_cast<uint32_t>(tensor_rank);
       block_encoding_axis_data_ = std::make_unique<uint32_t[]>(block_encoding_tensor_rank_);
       std::memcpy(block_encoding_axis_data_.get(),
@@ -479,11 +483,15 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
                    (onnx_tp_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4);
   }
 
-  const bool is_per_tensor = scales.size() == 1;
+  const bool is_block_quant = ort_quant_params->block_size.has_value() && ort_quant_params->block_size.value() > 0;
+  const bool is_per_channel = scales.size() > 1 && !is_block_quant;
+  const bool is_per_tensor = scales.size() == 1 && !is_block_quant;
 
-  // QNN uses different structs to represent quantization parameters depending on
-  // - per-tensor vs per-channel
-  // - int4 vs not int4
+  // QNN uses different structs to represent quantization parameters depending on:
+  // - per-tensor (scales.size()==1, no block_size): SCALE_OFFSET or BW_SCALE_OFFSET
+  // - per-channel (scales.size()>1, no block_size): AXIS_SCALE_OFFSET or BW_AXIS_SCALE_OFFSET
+  // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ), or no encoding is built here (float BQ path)
+  // - anything else: error
   if (is_per_tensor && !is_int4_type) {
     params_.encodingDefinition = QNN_DEFINITION_DEFINED;
     params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
@@ -507,7 +515,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     } else {
       params_.bwScaleOffsetEncoding.offset = 0;
     }
-  } else if (!is_per_tensor && is_int4_type) {
+  } else if (is_per_channel && is_int4_type) {
     std::vector<uint32_t> io_shape;
     RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
     const int32_t io_rank = static_cast<int32_t>(io_shape.size());
@@ -550,7 +558,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
 
     params_.bwAxisScaleOffsetEncoding.scales = scales_span.data();
     params_.bwAxisScaleOffsetEncoding.offsets = zps_span.data();
-  } else if (!is_per_tensor && !is_int4_type) {
+  } else if (is_per_channel && !is_int4_type) {
     std::vector<uint32_t> io_shape;
     RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
     const int32_t io_rank = static_cast<int32_t>(io_shape.size());
@@ -586,6 +594,91 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
     params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast<uint32_t>(num_elems);
     params_.axisScaleOffsetEncoding.scaleOffset = data_span.data();
+  } else if (is_block_quant) {
+    if (!qnn_model_wrapper.GetModelSettings().enable_block_quant_weight_optimization) {
+      ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                  ("Block quant weight optimization disabled, falling back to float BQ path"));
+      return Ort::Status();
+    }
+    // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion is only supported for INT4 (not UINT4).
+    if (io_def.type != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4) {
+      ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                  ("BQ to LPBQ conversion only supported for int4 weights, falling back to float BQ path"));
+      return Ort::Status();
+    }
+    // LPBQ requires symmetric quantization (all zero-points must be zero).
+    for (const int32_t zp : zero_points) {
+      if (zp != 0) {
+        ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                    ("BQ to LPBQ conversion requires symmetric quantization, falling back to float BQ path"));
+        return Ort::Status();
+      }
+    }
+
+    std::vector<uint32_t> io_shape;
+    RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
+    const int32_t io_rank = static_cast<int32_t>(io_shape.size());
+
+    // Get scale tensor shape to determine block/channel dimensions.
+    // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or rank 3-4 (e.g., Conv with rank-4 weights).
+    // Only dims 0 and 1 (indexed by axis and 1 - axis) are used; dim0 * dim1 must equal the number of scales.
+    const std::vector<int64_t> scale_shape =
+        utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
+    RETURN_IF_NOT(scale_shape.size() >= 2 && scale_shape.size() <= 4,
+                  "Block quantization scale tensors must have rank between 2 and 4 for LPBQ conversion");
+    RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
+                  "Block quantization scale tensor dimensions must be positive");
+    RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
+                  "Block quantization scale tensor shape product must equal number of scales");
+
+    // Determine block axis (= ONNX axis attribute).
+    constexpr int64_t DEFAULT_QDQ_AXIS = 1;
+    int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
+    if (axis < 0) axis += io_rank;
+    RETURN_IF_NOT(axis == 0 || axis == 1,
+                  "Only axis 0 or 1 is supported for block quantization LPBQ conversion");
+
+    // Scale shape: [num_blocks_per_channel, num_channels] when axis=0
+    //              [num_channels, num_blocks_per_channel] when axis=1
+    const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
+    const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);
+
+    // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
+    // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
+    std::vector<float> bq_scales_bm;
+    if (axis == 0) {
+      bq_scales_bm = std::move(scales);
+    } else {
+      // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
+      bq_scales_bm.resize(scales.size());
+      for (uint32_t c = 0; c < num_channels; ++c) {
+        for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+          bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
+              scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
+        }
+      }
+    }
+
+    // Apply BQ -> LPBQ algorithm
+    std::vector<float> per_channel_scales;
+    std::vector<uint8_t> per_block_int_scales;
+    std::vector<int32_t> lpbq_offsets;
+    const uint32_t bitwidth = 4u;
+    Ort::Status status = utils::ConvertBlockQuantScalesToLpbq(bq_scales_bm, zero_points, num_blocks_per_channel,
+                                                              num_channels, bitwidth, per_channel_scales,
+                                                              per_block_int_scales, lpbq_offsets);
+    if (!status.IsOK()) {
+      ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                  ("BQ to LPBQ conversion failed, falling back to float BQ path: " + std::string(status.GetErrorMessage())).c_str());
+      return Ort::Status();
+    }
+
+    // QNN LPBQ axis = the non-block axis in the weight tensor.
+    // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
+    const int64_t qnn_axis = 1 - axis;
+
+    *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
+                                  qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
   } else {
     return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()");
   }

@@ -106,35 +106,38 @@ class QnnQuantParamsWrapper {
   // Get a copy of scales. Works for both per-tensor and per-channel.
   Ort::Status GetScales(/*out*/ std::vector<float>& scales) const;
 
-  // Handle transposing of a per-channel quantized tensor. The quantization parameter's axis
-  // must be transposed using the inverse permutation of the Transpose.
+  // Handle transposing of a per-channel or LPBQ quantized tensor. The quantization parameter's
+  // axis is replaced by perm[axis], so `perm` must map each original axis to its new position.
   template <typename IntType>
   Ort::Status HandleTranspose(gsl::span<const IntType> perm) {
-    if (!IsPerChannel()) {
+    if (!IsPerChannel() && !IsLPBQ()) {
       return Ort::Status();
     }
 
     if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
       RETURN_IF_NOT(static_cast<size_t>(params_.axisScaleOffsetEncoding.axis) < perm.size(),
                     "Axis value is out of range of the provided permutation");
-      const int32_t new_axis = static_cast<int32_t>(perm[params_.axisScaleOffsetEncoding.axis]);
-      params_.axisScaleOffsetEncoding.axis = new_axis;
+      params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(perm[params_.axisScaleOffsetEncoding.axis]);
     } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
       RETURN_IF_NOT(static_cast<size_t>(params_.bwAxisScaleOffsetEncoding.axis) < perm.size(),
                     "Axis value is out of range of the provided permutation");
-      const int32_t new_axis = static_cast<int32_t>(perm[params_.bwAxisScaleOffsetEncoding.axis]);
-      params_.bwAxisScaleOffsetEncoding.axis = new_axis;
+      params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(perm[params_.bwAxisScaleOffsetEncoding.axis]);
+    } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               params_.blockwiseExpansion != nullptr) {
+      RETURN_IF_NOT(static_cast<size_t>(params_.blockwiseExpansion->axis) < perm.size(),
+                    "LPBQ axis value is out of range of the provided permutation");
+      params_.blockwiseExpansion->axis = static_cast<int32_t>(perm[params_.blockwiseExpansion->axis]);
     }
 
     return Ort::Status();
   }
 
-  // Handle "unsqueeze" of a per-channel quantized tensor. The quantization parameter's axis
-  // may need to be shifted if the unsqueeze inserted 1s before the quantization axis.
+  // Handle "unsqueeze" of a per-channel or LPBQ quantized tensor. The quantization parameter's
+  // axis may need to be shifted if the unsqueeze inserted 1s before the quantization axis.
   template <typename IntType>
   Ort::Status HandleUnsqueeze(gsl::span<const IntType> orig_shape,
                               gsl::span<const IntType> new_shape) {
-    if (!IsPerChannel()) {
+    if (!IsPerChannel() && !IsLPBQ()) {
       return Ort::Status();
     }
 
@@ -146,6 +149,9 @@ class QnnQuantParamsWrapper {
       axis = params_.axisScaleOffsetEncoding.axis;
     } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
       axis = params_.bwAxisScaleOffsetEncoding.axis;
+    } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               params_.blockwiseExpansion != nullptr) {
+      axis = params_.blockwiseExpansion->axis;
     } else {
       return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str());
     }
@@ -175,6 +181,9 @@ class QnnQuantParamsWrapper {
       params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(j);
     } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
       params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(j);
+    } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               params_.blockwiseExpansion != nullptr) {
+      params_.blockwiseExpansion->axis = static_cast<int32_t>(j);
     } else {
       return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str());
     }
@@ -194,7 +203,7 @@ class QnnQuantParamsWrapper {
 
   // Stores LowPowerBlockQuant encodings meta like number of per_channel_scales, per-block scales,
   // and blockwise_expansion_data
-  uint32_t per_channel_scales_size_;
+  uint32_t per_channel_scales_size_ = 0;
   std::unique_ptr<uint8_t[]> block_scales_data_;
   std::unique_ptr<char[]> blockwise_expansion_data_;