From 6689161d49b331460c503ca334df8ef3a6e34a93 Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Mon, 27 Apr 2026 14:29:49 +0530
Subject: [PATCH 1/8] convert BQ to LPBQ encodings

---
 .../builder/opbuilder/matmul_op_builder.cc    |   3 +-
 .../qnn/builder/qnn_quant_params_wrapper.cc   |  76 +++++++++++-
 .../qnn/builder/qnn_quant_params_wrapper.h    |  29 +++--
 .../core/providers/qnn/builder/qnn_utils.cc   | 109 +++++++++++++++++-
 .../core/providers/qnn/builder/qnn_utils.h    |  28 +++++
 onnxruntime/core/providers/qnn/ort_api.cc     |  11 +-
 onnxruntime/core/providers/qnn/ort_api.h      |   1 +
 7 files changed, 239 insertions(+), 18 deletions(-)
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
index 42b54a6e17..c0bb28759e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
@@ -1,4 +1,4 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #include <functional>
@@ -130,6 +130,7 @@ Ort::Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeU
                                                  !input_info_0.is_initializer &&
                                                  IsQuant16bit(input_info_1.qnn_data_type) &&
                                                  !input_info_1.is_initializer);
+  use_fully_connected = use_fully_connected && !input_info_1.quant_param.IsLPBQ();
 #endif
   return Ort::Status();
 }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
index 0e0cc6b8b8..225b4f7a50 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -11,6 +11,7 @@
 #include "QnnTypes.h"
 
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 
 #define ALIGN_PTR_UP(ptr, align, type) \
   reinterpret_cast<type>((reinterpret_cast<std::uintptr_t>(ptr) + (align) - 1) & ~((align) - 1))
@@ -366,6 +367,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons
       params_.encodingDefinition = params.encodingDefinition;
       params_.quantizationEncoding = params.quantizationEncoding;
 
+      per_channel_scales_size_ = static_cast<uint32_t>(num_scaleoffsets);
+
       // Deep copy the blockwiseExpansion
       const size_t bwe_num_bytes = sizeof(Qnn_BlockwiseExpansion_t);
       constexpr std::uintptr_t bwe_align = alignof(Qnn_BlockwiseExpansion_t);
@@ -399,6 +402,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, cons
       params_.encodingDefinition = params.encodingDefinition;
       params_.quantizationEncoding = params.quantizationEncoding;
 
+      num_blocks_ = static_cast<uint32_t>(num_scaleoffsets);
       block_encoding_tensor_rank_ = static_cast<uint32_t>(tensor_rank);
       block_encoding_axis_data_ = std::make_unique<uint32_t[]>(block_encoding_tensor_rank_);
       std::memcpy(block_encoding_axis_data_.get(),
@@ -480,10 +484,13 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
   }
 
   const bool is_per_tensor = scales.size() == 1;
+  const bool is_block_quant = ort_quant_params->block_size.has_value() && ort_quant_params->block_size.value() > 0;
+  const bool is_per_channel = scales.size() > 1 && !is_block_quant;
 
   // QNN uses different structs to represent quantization parameters depending on
   // - per-tensor vs per-channel
   // - int4 vs not int4
+  // - block quantization (LPBQ / BLOCKWISE_EXPANSION)
   if (is_per_tensor && !is_int4_type) {
     params_.encodingDefinition = QNN_DEFINITION_DEFINED;
     params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
@@ -507,7 +514,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     } else {
       params_.bwScaleOffsetEncoding.offset = 0;
     }
-  } else if (!is_per_tensor && is_int4_type) {
+  } else if (is_per_channel && is_int4_type) {
     std::vector<uint32_t> io_shape;
     RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
     const int32_t io_rank = static_cast<int32_t>(io_shape.size());
@@ -550,7 +557,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
 
     params_.bwAxisScaleOffsetEncoding.scales = scales_span.data();
     params_.bwAxisScaleOffsetEncoding.offsets = zps_span.data();
-  } else if (!is_per_tensor && !is_int4_type) {
+  } else if (is_per_channel && !is_int4_type) {
     std::vector<uint32_t> io_shape;
     RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
     const int32_t io_rank = static_cast<int32_t>(io_shape.size());
@@ -586,6 +593,71 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
     params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast<uint32_t>(num_elems);
     params_.axisScaleOffsetEncoding.scaleOffset = data_span.data();
+  } else if (is_block_quant) {
+    // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion.
+
+    // Get scale tensor shape to determine block/channel dimensions.
+    const std::vector<int64_t> scale_shape =
+        utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
+    RETURN_IF_NOT(scale_shape.size() == 2,
+                  "Only 2D block quantization scale tensors are supported for LPBQ conversion");
+
+    // Determine block axis (= ONNX axis attribute, default 0).
+    constexpr int64_t kDefaultBlockAxis = 0;
+    int64_t onnx_axis = ort_quant_params->axis.value_or(kDefaultBlockAxis);
+    if (onnx_axis < 0) onnx_axis += static_cast<int64_t>(scale_shape.size());
+    RETURN_IF_NOT(onnx_axis == 0 || onnx_axis == 1,
+                  "Only axis 0 or 1 is supported for 2D block quantization LPBQ conversion");
+
+    // Scale shape: [num_blocks_per_channel, num_channels] when block_axis=0
+    //              [num_channels, num_blocks_per_channel] when block_axis=1
+    const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[onnx_axis]);
+    const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - onnx_axis]);
+
+    // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
+    // If block_axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
+    std::vector<float> bq_scales_bm;
+    std::vector<int32_t> bq_offsets_bm;
+    if (onnx_axis == 0) {
+      bq_scales_bm = std::move(scales);
+      bq_offsets_bm = std::move(zero_points);
+    } else {
+      // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
+      bq_scales_bm.resize(scales.size());
+      for (uint32_t c = 0; c < num_channels; ++c) {
+        for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+          bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
+              scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
+        }
+      }
+      if (!zero_points.empty()) {
+        bq_offsets_bm.resize(zero_points.size());
+        for (uint32_t c = 0; c < num_channels; ++c) {
+          for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+            bq_offsets_bm[static_cast<size_t>(b) * num_channels + c] =
+                zero_points[static_cast<size_t>(c) * num_blocks_per_channel + b];
+          }
+        }
+      }
+    }
+
+    // Apply BQ -> LPBQ algorithm
+    // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights.
+    std::vector<float> per_channel_scales;
+    std::vector<uint8_t> per_block_int_scales;
+    std::vector<int32_t> lpbq_offsets;
+    const uint32_t kBitwidth = is_int4_type ? 4u : 8u;
+    RETURN_IF_ERROR(utils::TryConvertBlockQuantScalesToLpbq(
+        bq_scales_bm, bq_offsets_bm,
+        num_blocks_per_channel, num_channels, kBitwidth,
+        per_channel_scales, per_block_int_scales, lpbq_offsets));
+
+    // QNN LPBQ axis = the non-block axis in the weight tensor.
+    // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
+    const int64_t qnn_axis = 1 - onnx_axis;
+
+    *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
+                                  qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
   } else {
     return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()");
   }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
index ad4312257e..726e276b1c 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
@@ -106,35 +106,38 @@ class QnnQuantParamsWrapper {
   // Get a copy of scales. Works for both per-tensor and per-channel.
   Ort::Status GetScales(/*out*/ std::vector<float>& scales) const;
 
-  // Handle transposing of a per-channel quantized tensor. The quantization parameter's axis
-  // must be transposed using the inverse permutation of the Transpose.
+  // Handle transposing of a per-channel or LPBQ quantized tensor. The quantization parameter's
+  // axis must be updated using the permutation of the Transpose.
   template <typename IntType>
   Ort::Status HandleTranspose(gsl::span<const IntType> perm) {
-    if (!IsPerChannel()) {
+    if (!IsPerChannel() && !IsLPBQ()) {
       return Ort::Status();
     }
 
     if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
       RETURN_IF_NOT(static_cast<size_t>(params_.axisScaleOffsetEncoding.axis) < perm.size(),
                     "Axis value is out of range of the provided permutation");
-      const int32_t new_axis = static_cast<int32_t>(perm[params_.axisScaleOffsetEncoding.axis]);
-      params_.axisScaleOffsetEncoding.axis = new_axis;
+      params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(perm[params_.axisScaleOffsetEncoding.axis]);
     } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
       RETURN_IF_NOT(static_cast<size_t>(params_.bwAxisScaleOffsetEncoding.axis) < perm.size(),
                     "Axis value is out of range of the provided permutation");
-      const int32_t new_axis = static_cast<int32_t>(perm[params_.bwAxisScaleOffsetEncoding.axis]);
-      params_.bwAxisScaleOffsetEncoding.axis = new_axis;
+      params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(perm[params_.bwAxisScaleOffsetEncoding.axis]);
+    } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               params_.blockwiseExpansion != nullptr) {
+      RETURN_IF_NOT(static_cast<size_t>(params_.blockwiseExpansion->axis) < perm.size(),
+                    "LPBQ axis value is out of range of the provided permutation");
+      params_.blockwiseExpansion->axis = static_cast<int32_t>(perm[params_.blockwiseExpansion->axis]);
     }
 
     return Ort::Status();
   }
 
-  // Handle "unsqueeze" of a per-channel quantized tensor. The quantization parameter's axis
-  // may need to be shifted if the unsqueeze inserted 1s before the quantization axis.
+  // Handle "unsqueeze" of a per-channel or LPBQ quantized tensor. The quantization parameter's
+  // axis may need to be shifted if the unsqueeze inserted 1s before the quantization axis.
   template <typename IntType>
   Ort::Status HandleUnsqueeze(gsl::span<const IntType> orig_shape,
                               gsl::span<const IntType> new_shape) {
-    if (!IsPerChannel()) {
+    if (!IsPerChannel() && !IsLPBQ()) {
       return Ort::Status();
     }
 
@@ -146,6 +149,9 @@ class QnnQuantParamsWrapper {
       axis = params_.axisScaleOffsetEncoding.axis;
     } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
       axis = params_.bwAxisScaleOffsetEncoding.axis;
+    } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               params_.blockwiseExpansion != nullptr) {
+      axis = params_.blockwiseExpansion->axis;
     } else {
       return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str());
     }
@@ -175,6 +181,9 @@ class QnnQuantParamsWrapper {
       params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(j);
     } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
       params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(j);
+    } else if (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               params_.blockwiseExpansion != nullptr) {
+      params_.blockwiseExpansion->axis = static_cast<int32_t>(j);
     } else {
       return MAKE_EP_FAIL(("Unhandled quantization encoding: " + std::to_string(params_.quantizationEncoding)).c_str());
     }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index c9afad2ee4..e1834840a1 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -328,11 +328,14 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizationEncoding_t& en
     case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET:
       out << "QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET";
       break;
+    case QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION:
+      out << "QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION";
+      break;
     case QNN_QUANTIZATION_ENCODING_UNDEFINED:
       out << "QNN_QUANTIZATION_ENCODING_UNDEFINED";
       break;
     default:
-      out << "Uknown quantization encoding";
+      out << "Unknown quantization encoding";
   }
   return out;
 }
@@ -378,6 +381,35 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
         out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
       }
       out << (truncate ? "...)" : ")");
+    } else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION &&
+               quantize_params.blockwiseExpansion != nullptr) {
+      const Qnn_BlockwiseExpansion_t& lpbq = *quantize_params.blockwiseExpansion;
+      out << " axis=" << lpbq.axis
+          << " numBlocksPerAxis=" << lpbq.numBlocksPerAxis
+          << " blockScaleBitwidth=" << lpbq.blockScaleBitwidth;
+      // For lpbq, num_elems are not present in the quantize_params,
+      // we are using numBlocksPerAxis instead to print the first few scale offset values
+      size_t num_elems = lpbq.numBlocksPerAxis;
+      bool truncate = num_elems > 20;
+      num_elems = truncate ? 20 : num_elems;
+      if (lpbq.scaleOffsets != nullptr) {
+        out << " scales=(";
+        for (size_t i = 0; i < num_elems; i++) {
+          out << lpbq.scaleOffsets[i].scale << (i + 1 < num_elems ? " " : "");
+        }
+        out << (truncate ? "...)" : ")") << " offsets=(";
+        for (size_t i = 0; i < num_elems; i++) {
+          out << lpbq.scaleOffsets[i].offset << (i + 1 < num_elems ? " " : "");
+        }
+        out << (truncate ? "...)" : ")");
+      }
+      if (lpbq.blocksScale8 != nullptr) {
+        out << " perBlockIntScales=(";
+        for (size_t i = 0; i < num_elems; i++) {
+          out << static_cast<int32_t>(lpbq.blocksScale8[i]) << (i + 1 < num_elems ? " " : "");
+        }
+        out << (truncate ? "...)" : ")");
+      }
     } else {
       out << " encoding not supported.";
     }
@@ -1173,6 +1205,81 @@ Ort::Status DequantizePerChannel(gsl::span<const uint8_t> quant_bytes, gsl::span
   return Ort::Status();
 }
 
+// Converts block-major ONNX block quantization (BQ) scales into LPBQ parameters.
+//   bq_scales:  element (block b, channel c) is read from bq_scales[b * num_channels + c].
+//   bq_offsets: empty, or one zero-point per scale; every entry must be 0 (symmetric only).
+//   bitwidth:   width of the integer block scales, limited to [1, 8] since they are stored as uint8_t.
+//   per_channel_scales / offsets [out]: num_channels entries each; offsets are all 0.
+//   per_block_int_scales [out]: element (c, b) is written to [c * num_blocks_per_channel + b].
+// Fails on inconsistent sizes, a non-zero offset, a negative/non-finite scale, or a bad bitwidth.
+Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
+                                             gsl::span<const int32_t> bq_offsets,
+                                             uint32_t num_blocks_per_channel,
+                                             uint32_t num_channels,
+                                             uint32_t bitwidth,
+                                             /*out*/ std::vector<float>& per_channel_scales,
+                                             /*out*/ std::vector<uint8_t>& per_block_int_scales,
+                                             /*out*/ std::vector<int32_t>& offsets) {
+  RETURN_IF_NOT(bq_scales.size() == static_cast<size_t>(num_blocks_per_channel) * num_channels,
+                "BQ scales size does not match num_blocks_per_channel * num_channels");
+  RETURN_IF_NOT(bq_offsets.empty() || bq_offsets.size() == bq_scales.size(),
+                "BQ offsets size must be empty or equal to bq_scales size");
+  RETURN_IF_NOT(bitwidth > 0 && bitwidth <= 8, "bitwidth must be in range [1, 8]");
+
+  const uint32_t max_int_scale = 1u << bitwidth;  // 2^bitwidth
+  // The integer scales are narrowed to uint8_t. For bitwidth=8 the nominal bound 2^8 = 256 would
+  // wrap to 0 in that cast, so the stored value saturates at 255 instead.
+  const uint32_t max_stored_int_scale = std::min(max_int_scale, 255u);
+
+  // Require symmetric quantization (all offsets must be zero).
+  for (const int32_t zp : bq_offsets) {
+    RETURN_IF_NOT(zp == 0,
+                  "LPBQ conversion requires symmetric quantization (all block zero-points must be 0)");
+  }
+
+  // Validate that all scales are non-negative and finite.
+  for (const float s : bq_scales) {
+    RETURN_IF_NOT(std::isfinite(s) && s >= 0.0f, "BQ scales must be non-negative and finite");
+  }
+
+  // Algorithm:
+  //   max_int_scale             = 2^bitwidth
+  //   per_channel_scale[c]      = max(bq_scales[:, c]) / max_int_scale
+  //   per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]),
+  //                                     1, min(max_int_scale, 255))
+  //
+  // Note: This conversion is inherently approximate - the block scales are arbitrary floats and
+  // are rounded to the nearest integer multiple of per_channel_scale (error <= 0.5 * per_channel_scale
+  // per block), except that a bitwidth=8 ratio of 256 is stored as 255 to fit in uint8_t.
+
+  per_channel_scales.assign(num_channels, 0.0f);
+  per_block_int_scales.assign(static_cast<size_t>(num_channels) * num_blocks_per_channel, 0);
+  offsets.assign(num_channels, 0);
+
+  for (uint32_t c = 0; c < num_channels; ++c) {
+    // Step 1: per-channel float scale, derived from the largest block scale of this channel.
+    float max_scale = 0.0f;
+    for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+      max_scale = std::max(max_scale, bq_scales[static_cast<size_t>(b) * num_channels + c]);
+    }
+    const float pc_scale = max_scale / static_cast<float>(max_int_scale);
+    per_channel_scales[c] = pc_scale;
+
+    // Step 2: per-block integer scales for this channel, written in channel-major order.
+    for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+      const float raw_scale = bq_scales[static_cast<size_t>(b) * num_channels + c];
+      uint32_t int_scale = 1;  // Used as-is when the channel scale is 0 (nothing to divide by).
+      if (pc_scale > 0.0f) {
+        const float tentative = std::round(raw_scale / pc_scale);
+        int_scale = std::max(1u, std::min(static_cast<uint32_t>(tentative), max_stored_int_scale));
+      }
+      per_block_int_scales[static_cast<size_t>(c) * num_blocks_per_channel + b] = static_cast<uint8_t>(int_scale);
+    }
+  }
+
+  return Ort::Status();
+}
+
 /**
  * @brief QuantizeData with LPBQ encodings (per_channel_float_scales, per_block_int_scales)
  * @pre-condition data should have axis at 0
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index 1207929673..3b15b12a4f 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -267,6 +267,34 @@ Ort::Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t>
                          /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type,
                          std::optional<int64_t> axis = std::nullopt);
 
+// Converts ONNX block quantization (BQ) scales to QNN LPBQ (BLOCKWISE_EXPANSION) format.
+// Supports both int8 (bitwidth=8) and int4 (bitwidth=4) weight block quantization.
+//
+// The ONNX BQ scale tensor has shape [num_blocks_per_channel, num_channels] in block-major order
+// (i.e., the block axis is axis 0 and the channel axis is axis 1). If the ONNX block axis is 1
+// instead of 0, the caller must transpose the scale data before calling this function.
+//
+// Algorithm:
+//   max_int_scale             = 2^bitwidth  (256 for int8, 16 for int4)
+//   per_channel_scale[c]      = max(bq_scales[:, c]) / max_int_scale
+//   per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale)
+//
+// The output per_block_int_scales is in channel-major order [num_channels * num_blocks_per_channel],
+// which is the layout required by QNN LPBQ (BLOCKWISE_EXPANSION). NOTE(review): the elements are
+// uint8_t, which cannot hold 256 (max_int_scale for bitwidth=8); confirm the implementation's handling.
+//
+// Returns failure if the encoding is asymmetric (non-zero offsets), any block scale is negative or
+// non-finite, the input sizes are inconsistent, or bitwidth is out of the supported range.
+// On success, offsets holds one zero per channel.
+Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
+                                             gsl::span<const int32_t> bq_offsets,
+                                             uint32_t num_blocks_per_channel,
+                                             uint32_t num_channels,
+                                             uint32_t bitwidth,
+                                             /*out*/ std::vector<float>& per_channel_scales,
+                                             /*out*/ std::vector<uint8_t>& per_block_int_scales,
+                                             /*out*/ std::vector<int32_t>& offsets);
+
 // Quantizes the given float data using the provided Low Power Block Quantization parameters
 // (float channel_scales, int block_scales and offsets)
 // The provided offsets must use the QNN convention where offset = -zero_point.
diff --git a/onnxruntime/core/providers/qnn/ort_api.cc b/onnxruntime/core/providers/qnn/ort_api.cc
index c3a70eabe2..1b31b4fcd5 100644
--- a/onnxruntime/core/providers/qnn/ort_api.cc
+++ b/onnxruntime/core/providers/qnn/ort_api.cc
@@ -160,11 +160,12 @@ std::vector<OrtNodeUnitIODef> GetQDQIODefs(const OrtNode* target_node,
       continue;
     }
 
-    // Get the Q/DQ axis attribute if available.
+    // Get the Q/DQ axis and block_size attributes if available.
     std::optional<int64_t> axis = OrtNodeAttrHelper(*node).GetInt64("axis");
+    std::optional<int64_t> block_size = OrtNodeAttrHelper(*node).GetInt64("block_size");
 
     // Quantization scale and zp are always the input[1, 2].
-    OrtNodeUnitIODef::QuantParam quant_param{node_inputs[1], num_node_inputs == 3 ? node_inputs[2] : nullptr, axis};
+    OrtNodeUnitIODef::QuantParam quant_param{node_inputs[1], num_node_inputs == 3 ? node_inputs[2] : nullptr, axis, block_size};
 
     OrtNodeUnitIODef io_def;
     if (is_input) {
@@ -264,7 +265,8 @@ OrtStatus* OrtNodeUnit::InitForSingleNode(const OrtApi& ort_api) {
 
   if (std::string(op_type) == "DequantizeLinear") {
     std::optional<int64_t> axis = OrtNodeAttrHelper(*target_node_).GetInt64("axis");
-    OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis};
+    std::optional<int64_t> block_size = OrtNodeAttrHelper(*target_node_).GetInt64("block_size");
+    OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis, block_size};
 
     OrtNodeUnitIODef input_def, output_def;
     auto input_status = ParseOrtValueInfo(inputs_data[0], quant_param, ort_api, input_def);
@@ -283,7 +285,8 @@ OrtStatus* OrtNodeUnit::InitForSingleNode(const OrtApi& ort_api) {
     outputs_.push_back(output_def);
   } else if (std::string(op_type) == "QuantizeLinear") {
     std::optional<int64_t> axis = OrtNodeAttrHelper(*target_node_).GetInt64("axis");
-    OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis};
+    std::optional<int64_t> block_size = OrtNodeAttrHelper(*target_node_).GetInt64("block_size");
+    OrtNodeUnitIODef::QuantParam quant_param{inputs_data[1], num_inputs == 3 ? inputs_data[2] : nullptr, axis, block_size};
 
     OrtNodeUnitIODef input_def, output_def;
     auto input_status = ParseOrtValueInfo(inputs_data[0], std::nullopt, ort_api, input_def);
diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h
index c4905f44b4..855c509fe2 100644
--- a/onnxruntime/core/providers/qnn/ort_api.h
+++ b/onnxruntime/core/providers/qnn/ort_api.h
@@ -247,6 +247,7 @@ struct OrtNodeUnitIODef {
     const OrtValueInfo* scale;
     const OrtValueInfo* zero_point{nullptr};
     std::optional<int64_t> axis{std::nullopt};
+    std::optional<int64_t> block_size{std::nullopt};
   };
 
   std::string name;

From 6fd25fc3282483192344761a618b245a043e1c75 Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Mon, 18 May 2026 10:14:29 +0530
Subject: [PATCH 2/8] address review comments

---
 .../builder/opbuilder/matmul_op_builder.cc    |  1 -
 .../qnn/builder/qnn_quant_params_wrapper.cc   | 81 ++++++++++---------
 .../core/providers/qnn/builder/qnn_utils.cc   | 24 +++---
 .../core/providers/qnn/builder/qnn_utils.h    |  4 +-
 4 files changed, 57 insertions(+), 53 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
index c0bb28759e..d38303506b 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
@@ -130,7 +130,6 @@ Ort::Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeU
                                                  !input_info_0.is_initializer &&
                                                  IsQuant16bit(input_info_1.qnn_data_type) &&
                                                  !input_info_1.is_initializer);
-  use_fully_connected = use_fully_connected && !input_info_1.quant_param.IsLPBQ();
 #endif
   return Ort::Status();
 }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
index 225b4f7a50..7c81c1ad47 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -150,7 +150,7 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span<const float> per_channel_
   }
 
   lpbq.numBlocksPerAxis = static_cast<uint32_t>(per_block_int_scales.size()) / num_elems;
-  lpbq.blockScaleBitwidth = is_int4 ? 4 : 0;
+  lpbq.blockScaleBitwidth = is_int4 ? 4 : 8;
   lpbq.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
 
   // Deep copy the block int scales
@@ -483,14 +483,15 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
                    (onnx_tp_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4);
   }
 
-  const bool is_per_tensor = scales.size() == 1;
   const bool is_block_quant = ort_quant_params->block_size.has_value() && ort_quant_params->block_size.value() > 0;
   const bool is_per_channel = scales.size() > 1 && !is_block_quant;
+  const bool is_per_tensor = scales.size() == 1 && !is_block_quant;
 
-  // QNN uses different structs to represent quantization parameters depending on
-  // - per-tensor vs per-channel
-  // - int4 vs not int4
-  // - block quantization (LPBQ / BLOCKWISE_EXPANSION)
+  // QNN uses different structs to represent quantization parameters depending on:
+  // - per-tensor (scales.size()==1, no block_size): SCALE_OFFSET or BW_SCALE_OFFSET
+  // - per-channel (scales.size()>1, no block_size): AXIS_SCALE_OFFSET or BW_AXIS_SCALE_OFFSET
+  // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ)
+  // - fallback: error
   if (is_per_tensor && !is_int4_type) {
     params_.encodingDefinition = QNN_DEFINITION_DEFINED;
     params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
@@ -596,31 +597,44 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
   } else if (is_block_quant) {
     // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion.
 
+    std::vector<uint32_t> io_shape;
+    RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
+    const int32_t io_rank = static_cast<int32_t>(io_shape.size());
+
     // Get scale tensor shape to determine block/channel dimensions.
+    // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights).
+    // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
     const std::vector<int64_t> scale_shape =
         utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
-    RETURN_IF_NOT(scale_shape.size() == 2,
-                  "Only 2D block quantization scale tensors are supported for LPBQ conversion");
-
-    // Determine block axis (= ONNX axis attribute, default 0).
-    constexpr int64_t kDefaultBlockAxis = 0;
-    int64_t onnx_axis = ort_quant_params->axis.value_or(kDefaultBlockAxis);
-    if (onnx_axis < 0) onnx_axis += static_cast<int64_t>(scale_shape.size());
-    RETURN_IF_NOT(onnx_axis == 0 || onnx_axis == 1,
-                  "Only axis 0 or 1 is supported for 2D block quantization LPBQ conversion");
-
-    // Scale shape: [num_blocks_per_channel, num_channels] when block_axis=0
-    //              [num_channels, num_blocks_per_channel] when block_axis=1
-    const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[onnx_axis]);
-    const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - onnx_axis]);
+    RETURN_IF_NOT(scale_shape.size() >= 2,
+                  "Block quantization scale tensors must have at least rank 2 for LPBQ conversion");
+    RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
+                  "Block quantization scale tensor dimensions must be positive");
+    RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
+                  "Block quantization scale tensor shape product must equal number of scales");
+
+    // Determine block axis (= ONNX axis attribute).
+    constexpr int64_t DEFAULT_QDQ_AXIS = 1;
+    int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
+    if (axis < 0) axis += io_rank;
+    RETURN_IF_NOT(axis == 0 || axis == 1,
+                  "Only axis 0 or 1 is supported for block quantization LPBQ conversion");
+
+    // Scale shape: [num_blocks_per_channel, num_channels] when axis=0
+    //              [num_channels, num_blocks_per_channel] when axis=1
+    const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
+    const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);
+
+    // LPBQ requires symmetric quantization (all zero-points must be zero).
+    for (const int32_t zp : zero_points) {
+      RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization");
+    }
 
     // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
-    // If block_axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
+    // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
     std::vector<float> bq_scales_bm;
-    std::vector<int32_t> bq_offsets_bm;
-    if (onnx_axis == 0) {
+    if (axis == 0) {
       bq_scales_bm = std::move(scales);
-      bq_offsets_bm = std::move(zero_points);
     } else {
       // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
       bq_scales_bm.resize(scales.size());
@@ -630,15 +644,6 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
               scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
         }
       }
-      if (!zero_points.empty()) {
-        bq_offsets_bm.resize(zero_points.size());
-        for (uint32_t c = 0; c < num_channels; ++c) {
-          for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
-            bq_offsets_bm[static_cast<size_t>(b) * num_channels + c] =
-                zero_points[static_cast<size_t>(c) * num_blocks_per_channel + b];
-          }
-        }
-      }
     }
 
     // Apply BQ -> LPBQ algorithm
@@ -646,15 +651,15 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     std::vector<float> per_channel_scales;
     std::vector<uint8_t> per_block_int_scales;
     std::vector<int32_t> lpbq_offsets;
-    const uint32_t kBitwidth = is_int4_type ? 4u : 8u;
-    RETURN_IF_ERROR(utils::TryConvertBlockQuantScalesToLpbq(
-        bq_scales_bm, bq_offsets_bm,
-        num_blocks_per_channel, num_channels, kBitwidth,
+    const uint32_t bitwidth = is_int4_type ? 4u : 8u;
+    RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq(
+        bq_scales_bm, zero_points,
+        num_blocks_per_channel, num_channels, bitwidth,
         per_channel_scales, per_block_int_scales, lpbq_offsets));
 
     // QNN LPBQ axis = the non-block axis in the weight tensor.
     // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
-    const int64_t qnn_axis = 1 - onnx_axis;
+    const int64_t qnn_axis = 1 - axis;
 
     *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
                                   qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index e1834840a1..5cf0637c18 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -387,8 +387,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
       out << " axis=" << lpbq.axis
           << " numBlocksPerAxis=" << lpbq.numBlocksPerAxis
           << " blockScaleBitwidth=" << lpbq.blockScaleBitwidth;
-      // For lpbq, num_elems are not present in the quantize_params,
-      // we are using numBlocksPerAxis instead to print the first few scale offset values
+      // For LPBQ, num_elems is not present in the quantize_params, so
+      // numBlocksPerAxis is used instead to print at most the first 20 scale offset values
       size_t num_elems = lpbq.numBlocksPerAxis;
       bool truncate = num_elems > 20;
       num_elems = truncate ? 20 : num_elems;
@@ -1205,21 +1205,21 @@ Ort::Status DequantizePerChannel(gsl::span<const uint8_t> quant_bytes, gsl::span
   return Ort::Status();
 }
 
-Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
-                                             gsl::span<const int32_t> bq_offsets,
-                                             uint32_t num_blocks_per_channel,
-                                             uint32_t num_channels,
-                                             uint32_t bitwidth,
-                                             /*out*/ std::vector<float>& per_channel_scales,
-                                             /*out*/ std::vector<uint8_t>& per_block_int_scales,
-                                             /*out*/ std::vector<int32_t>& offsets) {
+Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
+                                          gsl::span<const int32_t> bq_offsets,
+                                          uint32_t num_blocks_per_channel,
+                                          uint32_t num_channels,
+                                          uint32_t bitwidth,
+                                          /*out*/ std::vector<float>& per_channel_scales,
+                                          /*out*/ std::vector<uint8_t>& per_block_int_scales,
+                                          /*out*/ std::vector<int32_t>& offsets) {
   RETURN_IF_NOT(bq_scales.size() == static_cast<size_t>(num_blocks_per_channel) * num_channels,
                 "BQ scales size does not match num_blocks_per_channel * num_channels");
   RETURN_IF_NOT(bq_offsets.empty() || bq_offsets.size() == bq_scales.size(),
                 "BQ offsets size must be empty or equal to bq_scales size");
   RETURN_IF_NOT(bitwidth > 0 && bitwidth <= 16, "bitwidth must be in range [1, 16]");
 
-  const uint32_t max_int_scale = 1u << bitwidth;  // 2^bitwidth
+  const uint32_t max_int_scale = (1u << bitwidth) - 1u;
 
   // Require symmetric quantization (all offsets must be zero).
   if (!bq_offsets.empty()) {
@@ -1236,7 +1236,7 @@ Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
   }
 
   // Algorithm:
-  //   max_int_scale   = 2^bitwidth
+  //   max_int_scale             = 2^bitwidth - 1
   //   per_channel_scale[c]      = max(bq_scales[:, c]) / max_int_scale
   //   per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale)
   //
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index 3b15b12a4f..f16b768b92 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -275,7 +275,7 @@ Ort::Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t>
 // instead of 0, the caller must transpose the scale data before calling this function.
 //
 // Algorithm :
-//   max_int_scale   = 2^bitwidth  (256 for int8, 16 for int4)
+//   max_int_scale   = 2^bitwidth - 1  (255 for int8, 15 for int4)
 //   per_channel_scale[c]      = max(bq_scales[:, c]) / max_int_scale
 //   per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale)
 //
@@ -286,7 +286,7 @@ Ort::Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t>
 //   - The encoding is asymmetric (non-zero offsets), which LPBQ does not support.
 //   - Any block scale is negative or non-finite.
 //   - Input sizes are inconsistent.
-Ort::Status TryConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
+Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
                                              gsl::span<const int32_t> bq_offsets,
                                              uint32_t num_blocks_per_channel,
                                              uint32_t num_channels,

From 09ac5dcd155ca0ab5c62e2fe43c59a0151dc9ad3 Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Mon, 18 May 2026 10:19:55 +0530
Subject: [PATCH 3/8] address review comments

---
 onnxruntime/core/providers/qnn/builder/qnn_utils.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index f16b768b92..72f1065ad5 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -287,13 +287,13 @@ Ort::Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t>
 //   - Any block scale is negative or non-finite.
 //   - Input sizes are inconsistent.
 Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
-                                             gsl::span<const int32_t> bq_offsets,
-                                             uint32_t num_blocks_per_channel,
-                                             uint32_t num_channels,
-                                             uint32_t bitwidth,
-                                             /*out*/ std::vector<float>& per_channel_scales,
-                                             /*out*/ std::vector<uint8_t>& per_block_int_scales,
-                                             /*out*/ std::vector<int32_t>& offsets);
+                                          gsl::span<const int32_t> bq_offsets,
+                                          uint32_t num_blocks_per_channel,
+                                          uint32_t num_channels,
+                                          uint32_t bitwidth,
+                                          /*out*/ std::vector<float>& per_channel_scales,
+                                          /*out*/ std::vector<uint8_t>& per_block_int_scales,
+                                          /*out*/ std::vector<int32_t>& offsets);
 
 // Quantizes the given float data using the provided Low Power Block Quantization parameters
 // (float channel_scales, int block_scales and offsets)

From d2e886a655d42155a8cc472e1e565ea0fb52f742 Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Wed, 3 Jun 2026 12:06:23 +0530
Subject: [PATCH 4/8] address review comments

---
 .../core/providers/qnn/builder/qnn_quant_params_wrapper.cc  | 6 +++---
 onnxruntime/core/providers/qnn/builder/qnn_utils.cc         | 6 +++---
 onnxruntime/core/providers/qnn/builder/qnn_utils.h          | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
index 7c81c1ad47..d6e8102021 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -490,7 +490,7 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
   // QNN uses different structs to represent quantization parameters depending on:
   // - per-tensor (scales.size()==1, no block_size): SCALE_OFFSET or BW_SCALE_OFFSET
   // - per-channel (scales.size()>1, no block_size): AXIS_SCALE_OFFSET or BW_AXIS_SCALE_OFFSET
-  // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ)
+  // - block quantization (block_size>0): BLOCKWISE_EXPANSION (LPBQ) or ENCODING_BLOCK (BQ)
   // - fallback: error
   if (is_per_tensor && !is_int4_type) {
     params_.encodingDefinition = QNN_DEFINITION_DEFINED;
@@ -594,8 +594,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
     params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast<uint32_t>(num_elems);
     params_.axisScaleOffsetEncoding.scaleOffset = data_span.data();
-  } else if (is_block_quant) {
-    // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion.
+  } else if (is_block_quant && is_int4_type) {
+    // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit.
 
     std::vector<uint32_t> io_shape;
     RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index 5cf0637c18..9730ed5ab1 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -1217,9 +1217,9 @@ Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
                 "BQ scales size does not match num_blocks_per_channel * num_channels");
   RETURN_IF_NOT(bq_offsets.empty() || bq_offsets.size() == bq_scales.size(),
                 "BQ offsets size must be empty or equal to bq_scales size");
-  RETURN_IF_NOT(bitwidth > 0 && bitwidth <= 16, "bitwidth must be in range [1, 16]");
+  RETURN_IF_NOT(bitwidth == 4, "BQ to LPBQ conversion is only supported for 4-bit");
 
-  const uint32_t max_int_scale = (1u << bitwidth) - 1u;
+  const uint32_t max_int_scale = (1u << bitwidth); // 2^bitwidth
 
   // Require symmetric quantization (all offsets must be zero).
   if (!bq_offsets.empty()) {
@@ -1236,7 +1236,7 @@ Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
   }
 
   // Algorithm:
-  //   max_int_scale             = 2^bitwidth - 1
+  //   max_int_scale             = 2^bitwidth
   //   per_channel_scale[c]      = max(bq_scales[:, c]) / max_int_scale
   //   per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale)
   //
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index 72f1065ad5..e1044d30e7 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -268,14 +268,14 @@ Ort::Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t>
                          std::optional<int64_t> axis = std::nullopt);
 
 // Converts ONNX block quantization (BQ) scales to QNN LPBQ (BLOCKWISE_EXPANSION) format.
-// Supports both int8 (bitwidth=8) and int4 (bitwidth=4) weight block quantization.
+// Supports int4 (bitwidth=4) weight block quantization.
 //
 // The ONNX BQ scale tensor has shape [num_blocks_per_channel, num_channels] in block-major order
 // (i.e., the block axis is axis 0 and the channel axis is axis 1). If the ONNX block axis is 1
 // instead of 0, the caller must transpose the scale data before calling this function.
 //
 // Algorithm :
-//   max_int_scale   = 2^bitwidth - 1  (255 for int8, 15 for int4)
+//   max_int_scale             = 2^bitwidth  (16 for int4)
 //   per_channel_scale[c]      = max(bq_scales[:, c]) / max_int_scale
 //   per_block_int_scale[c, b] = clamp(round(bq_scales[b, c] / per_channel_scale[c]), 1, max_int_scale)
 //

From 965a2817c359de00d2dfb4443dd78c3a22215535 Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Wed, 3 Jun 2026 14:10:30 +0530
Subject: [PATCH 5/8] fix lint

---
 onnxruntime/core/providers/qnn/builder/qnn_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index 9730ed5ab1..188327d3f5 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -1219,7 +1219,7 @@ Ort::Status ConvertBlockQuantScalesToLpbq(gsl::span<const float> bq_scales,
                 "BQ offsets size must be empty or equal to bq_scales size");
   RETURN_IF_NOT(bitwidth == 4, "BQ to LPBQ conversion is only supported for 4-bit");
 
-  const uint32_t max_int_scale = (1u << bitwidth); // 2^bitwidth
+  const uint32_t max_int_scale = (1u << bitwidth);  // 2^bitwidth
 
   // Require symmetric quantization (all offsets must be zero).
   if (!bq_offsets.empty()) {

From 24b4741ee3c2cca75b72a695f3cc5ca28f6eff19 Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Mon, 15 Jun 2026 16:46:10 +0530
Subject: [PATCH 6/8] add provider option for bq to lpbq conversion

---
 .../QNN-ExecutionProvider.md                  |   7 +-
 .../builder/opbuilder/matmul_op_builder.cc    |   4 +
 .../providers/qnn/builder/qnn_model_wrapper.h |   1 +
 .../qnn/builder/qnn_quant_params_wrapper.cc   | 132 +++++++++---------
 .../providers/qnn/qnn_execution_provider.cc   |   6 +
 onnxruntime/test/providers/qnn/conv_test.cc   |   1 +
 onnxruntime/test/providers/qnn/matmul_test.cc |   1 +
 7 files changed, 86 insertions(+), 66 deletions(-)

diff --git a/docs/execution_providers/QNN-ExecutionProvider.md b/docs/execution_providers/QNN-ExecutionProvider.md
index b28fee9a31..2f5102e61c 100644
--- a/docs/execution_providers/QNN-ExecutionProvider.md
+++ b/docs/execution_providers/QNN-ExecutionProvider.md
@@ -197,6 +197,11 @@ Refer to the [QAIRT SDK documentation](https://docs.qualcomm.com/doc/80-63442-10
 |'0'|Disabled. QNN EP will handle quantization and dequantization of graph I/O.|
 |'1'|Default. Enabled. Offload quantization and dequantization of graph I/O to CPU EP.|
 
+|`"convert_bq_to_lpbq"`|Description|
+|---|---|
+|'0'|Disabled. Block quantized model will run with block quantized weight encodings and float activations.|
+|'1'|Default. Enabled. Block quantized weight encodings will be converted to Low Power Block Quantized encodings.|
+
 |`"enable_htp_shared_memory_allocator"`|Description|
 |---|---|
 |'0'|Default. Disabled.|
@@ -903,7 +908,7 @@ session = ort.InferenceSession("model.onnx", sess_options=sess_options)
 
 ### Important Considerations
 #### Feature Disabled if Number of Subgraphs is Less Than 5
-While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance. 
+While graph composition is responsible for the majority of the preparation time, asynchronously finalizing the subgraphs cuts the total time down by a considerable amount, depending on the graph. For smaller models or models with only a few subgraphs, the overhead of setting up for parallel graph preparation will negate any possible performance gains and may actually result in worse performance.
 
 #### Feature Disabled if `num_graph_prepare_threads` is 1
 This defeats the purpose of the feature, and enabling the feature will only add additional overhead from thread pool creation.
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
index d38303506b..5fae3bdca5 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
@@ -89,6 +89,10 @@ bool IsBQWeight(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeUnitIODef
   if (num_blocks <= 0 || static_cast<int64_t>(weight_shape[k_axis]) % num_blocks != 0) {
     return false;
   }
+  // Go for BQ FP16 only if LPBQ conversion is set to false
+  if (qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) {
+    return false;
+  }
   return true;
 }
 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
index 0055dbaf70..94c708f2b6 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@@ -38,6 +38,7 @@ struct ModelSettings {
   bool offload_graph_io_quantization = false;
   bool htp_shared_memory = false;
   bool htp_bf16_enable = false;
+  bool convert_bq_to_lpbq = true;
 };
 
 class QnnModelWrapper {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
index d6e8102021..e42573460c 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -594,75 +594,77 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     params_.axisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
     params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast<uint32_t>(num_elems);
     params_.axisScaleOffsetEncoding.scaleOffset = data_span.data();
-  } else if (is_block_quant && is_int4_type) {
-    // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit.
-
-    std::vector<uint32_t> io_shape;
-    RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
-    const int32_t io_rank = static_cast<int32_t>(io_shape.size());
-
-    // Get scale tensor shape to determine block/channel dimensions.
-    // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights).
-    // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
-    const std::vector<int64_t> scale_shape =
-        utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
-    RETURN_IF_NOT(scale_shape.size() >= 2,
-                  "Block quantization scale tensors must have at least rank 2 for LPBQ conversion");
-    RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
-                  "Block quantization scale tensor dimensions must be positive");
-    RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
-                  "Block quantization scale tensor shape product must equal number of scales");
-
-    // Determine block axis (= ONNX axis attribute).
-    constexpr int64_t DEFAULT_QDQ_AXIS = 1;
-    int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
-    if (axis < 0) axis += io_rank;
-    RETURN_IF_NOT(axis == 0 || axis == 1,
-                  "Only axis 0 or 1 is supported for block quantization LPBQ conversion");
-
-    // Scale shape: [num_blocks_per_channel, num_channels] when axis=0
-    //              [num_channels, num_blocks_per_channel] when axis=1
-    const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
-    const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);
-
-    // LPBQ requires symmetric quantization (all zero-points must be zero).
-    for (const int32_t zp : zero_points) {
-      RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization");
-    }
+  } else if (is_block_quant) {
+    if (is_int4_type && qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) {
+      // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit.
+
+      std::vector<uint32_t> io_shape;
+      RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
+      const int32_t io_rank = static_cast<int32_t>(io_shape.size());
+
+      // Get scale tensor shape to determine block/channel dimensions.
+      // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights).
+      // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
+      const std::vector<int64_t> scale_shape =
+          utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
+      RETURN_IF_NOT(scale_shape.size() >= 2,
+                    "Block quantization scale tensors must have at least rank 2 for LPBQ conversion");
+      RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
+                    "Block quantization scale tensor dimensions must be positive");
+      RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
+                    "Block quantization scale tensor shape product must equal number of scales");
+
+      // Determine block axis (= ONNX axis attribute).
+      constexpr int64_t DEFAULT_QDQ_AXIS = 1;
+      int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
+      if (axis < 0) axis += io_rank;
+      RETURN_IF_NOT(axis == 0 || axis == 1,
+                    "Only axis 0 or 1 is supported for block quantization LPBQ conversion");
+
+      // Scale shape: [num_blocks_per_channel, num_channels] when axis=0
+      //              [num_channels, num_blocks_per_channel] when axis=1
+      const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
+      const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);
+
+      // LPBQ requires symmetric quantization (all zero-points must be zero).
+      for (const int32_t zp : zero_points) {
+        RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization");
+      }
 
-    // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
-    // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
-    std::vector<float> bq_scales_bm;
-    if (axis == 0) {
-      bq_scales_bm = std::move(scales);
-    } else {
-      // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
-      bq_scales_bm.resize(scales.size());
-      for (uint32_t c = 0; c < num_channels; ++c) {
-        for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
-          bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
-              scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
+      // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
+      // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
+      std::vector<float> bq_scales_bm;
+      if (axis == 0) {
+        bq_scales_bm = std::move(scales);
+      } else {
+        // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
+        bq_scales_bm.resize(scales.size());
+        for (uint32_t c = 0; c < num_channels; ++c) {
+          for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+            bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
+                scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
+          }
         }
       }
-    }
 
-    // Apply BQ -> LPBQ algorithm
-    // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights.
-    std::vector<float> per_channel_scales;
-    std::vector<uint8_t> per_block_int_scales;
-    std::vector<int32_t> lpbq_offsets;
-    const uint32_t bitwidth = is_int4_type ? 4u : 8u;
-    RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq(
-        bq_scales_bm, zero_points,
-        num_blocks_per_channel, num_channels, bitwidth,
-        per_channel_scales, per_block_int_scales, lpbq_offsets));
-
-    // QNN LPBQ axis = the non-block axis in the weight tensor.
-    // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
-    const int64_t qnn_axis = 1 - axis;
-
-    *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
-                                  qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
+      // Apply BQ -> LPBQ algorithm
+      // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights.
+      std::vector<float> per_channel_scales;
+      std::vector<uint8_t> per_block_int_scales;
+      std::vector<int32_t> lpbq_offsets;
+      const uint32_t bitwidth = is_int4_type ? 4u : 8u;
+      RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq(
+          bq_scales_bm, zero_points,
+          num_blocks_per_channel, num_channels, bitwidth,
+          per_channel_scales, per_block_int_scales, lpbq_offsets));
+
+      // QNN LPBQ axis = the non-block axis in the weight tensor.
+      // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
+      const int64_t qnn_axis = 1 - axis;
+
+      *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
+                                    qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
+    }
   } else {
     return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()");
   }
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 0b877f0295..f1e88d87ee 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -909,6 +909,12 @@ QnnEp::QnnEp(QnnEpFactory& factory,
   }
 #endif
 
+  model_settings_.convert_bq_to_lpbq = ParseBoolOption(ort_api,
+                                                       session_options_,
+                                                       FormatEPConfigKey("convert_bq_to_lpbq"),
+                                                       true,
+                                                       logger_);
+
   if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
     ORT_CXX_LOG(logger_,
                 ORT_LOGGING_LEVEL_INFO,
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index e27ac4a173..76d3bce210 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -3205,6 +3205,7 @@ ProviderOptions GetBQConvProviderOptions() {
   ProviderOptions opts;
   opts["backend_type"] = "htp";
   opts["offload_graph_io_quantization"] = "0";
+  opts["convert_bq_to_lpbq"] = "0";
 #if defined(__linux__) && !defined(__aarch64__)
   // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support.
   // On real ARM64 hardware, the SoC model is auto-detected by QNN EP.
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cc b/onnxruntime/test/providers/qnn/matmul_test.cc
index 4084fa0529..2d7b489b43 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cc
+++ b/onnxruntime/test/providers/qnn/matmul_test.cc
@@ -362,6 +362,7 @@ ProviderOptions GetBQMatMulProviderOptions() {
   ProviderOptions opts;
   opts["backend_type"] = "htp";
   opts["offload_graph_io_quantization"] = "0";
+  opts["convert_bq_to_lpbq"] = "0";
 #if defined(__linux__) && !defined(__aarch64__)
   // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support.
   // On real ARM64 hardware, the SoC model is auto-detected by QNN EP.

From 0131740f31800af002800abb7886dfba60d5febe Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Wed, 17 Jun 2026 17:00:16 +0530
Subject: [PATCH 7/8] address review comments

---
 .../QNN-ExecutionProvider.md                  |   6 +-
 .../builder/opbuilder/matmul_op_builder.cc    |  16 +-
 .../providers/qnn/builder/qnn_model_wrapper.h |   4 +-
 .../qnn/builder/qnn_quant_params_wrapper.cc   | 142 ++++++++++--------
 .../qnn/builder/qnn_quant_params_wrapper.h    |   2 +-
 .../providers/qnn/qnn_execution_provider.cc   |  10 +-
 onnxruntime/test/providers/qnn/conv_test.cc   |   2 +-
 onnxruntime/test/providers/qnn/matmul_test.cc |   2 +-
 8 files changed, 98 insertions(+), 86 deletions(-)

diff --git a/docs/execution_providers/QNN-ExecutionProvider.md b/docs/execution_providers/QNN-ExecutionProvider.md
index 2f5102e61c..dd64673552 100644
--- a/docs/execution_providers/QNN-ExecutionProvider.md
+++ b/docs/execution_providers/QNN-ExecutionProvider.md
@@ -197,10 +197,10 @@ Refer to the [QAIRT SDK documentation](https://docs.qualcomm.com/doc/80-63442-10
 |'0'|Disabled. QNN EP will handle quantization and dequantization of graph I/O.|
 |'1'|Default. Enabled. Offload quantization and dequantization of graph I/O to CPU EP.|
 
-|`"convert_bq_to_lpbq"`|Description|
+|`"enable_block_quant_weight_optimization"`|Description|
 |---|---|
-|'0'|Disabled. Block quantized model will run with block quantized weight encodings and float activations.|
-|'1'|Default. Enabled. Block quantized weight encodings will be converted to Low Power Block Quantized encodings.|
+|`"0"`|Default. Disabled. Block-quantized models use the standard compatibility path.|
+|`"1"`|Enabled. Uses an optimized path for block-quantized weights when supported. If the optimized path is not available, QNN EP falls back to the standard compatibility path.|
 
 |`"enable_htp_shared_memory_allocator"`|Description|
 |---|---|
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
index 5fae3bdca5..d6181e8290 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #include <functional>
@@ -89,10 +89,6 @@ bool IsBQWeight(const QnnModelWrapper& qnn_model_wrapper, const OrtNodeUnitIODef
   if (num_blocks <= 0 || static_cast<int64_t>(weight_shape[k_axis]) % num_blocks != 0) {
     return false;
   }
-  // Go for BQ FP16 only if LPBQ conversion is set to false
-  if (qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) {
-    return false;
-  }
   return true;
 }
 
@@ -250,17 +246,17 @@ Ort::Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, c
                                            bool do_op_validation) const {
   const auto& inputs = node_unit.Inputs();
 
-  // Block-quantized weight: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight.
-  if (IsBQWeight(qnn_model_wrapper, inputs[1])) {
-    return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation);
-  }
-
   TensorInfo input_info_0{};
   TensorInfo input_info_1{};
   bool use_fully_connected = false;
   RETURN_IF_ERROR(
       CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected));
 
+  // Block-quantized weight not converted to LPBQ: translate to a QNN MatMul with a BW_FLOAT_BLOCK weight.
+  if (IsBQWeight(qnn_model_wrapper, inputs[1]) && !input_info_1.quant_param.IsLPBQ()) {
+    return ProcessInputsForBQMatMul(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation);
+  }
+
   if (use_fully_connected) {
     return ProcessInputsForQnnFullyConnected(qnn_model_wrapper,
                                              node_unit,
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
index 94c708f2b6..f05a95ac27 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@@ -38,7 +38,7 @@ struct ModelSettings {
   bool offload_graph_io_quantization = false;
   bool htp_shared_memory = false;
   bool htp_bf16_enable = false;
-  bool convert_bq_to_lpbq = true;
+  bool enable_block_quant_weight_optimization = false;
 };
 
 class QnnModelWrapper {
@@ -364,6 +364,8 @@ class QnnModelWrapper {
 
   const OrtGraph& GetOrtGraph() const { return ort_graph_; }
 
+  const Ort::Logger& GetLogger() const { return logger_; }
+
   const std::unordered_map<std::string, QnnTensorWrapper>& GetModelTensorsMap() const {
     return model_tensors_map_;
   }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
index e42573460c..f19ca37974 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -595,76 +595,90 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     params_.axisScaleOffsetEncoding.numScaleOffsets = static_cast<uint32_t>(num_elems);
     params_.axisScaleOffsetEncoding.scaleOffset = data_span.data();
   } else if (is_block_quant) {
-    if (is_int4_type && qnn_model_wrapper.GetModelSettings().convert_bq_to_lpbq) {
-      // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion only supported for 4-bit.
-
-      std::vector<uint32_t> io_shape;
-      RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
-      const int32_t io_rank = static_cast<int32_t>(io_shape.size());
-
-      // Get scale tensor shape to determine block/channel dimensions.
-      // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights).
-      // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
-      const std::vector<int64_t> scale_shape =
-          utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
-      RETURN_IF_NOT(scale_shape.size() >= 2,
-                    "Block quantization scale tensors must have at least rank 2 for LPBQ conversion");
-      RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
-                    "Block quantization scale tensor dimensions must be positive");
-      RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
-                    "Block quantization scale tensor shape product must equal number of scales");
-
-      // Determine block axis (= ONNX axis attribute).
-      constexpr int64_t DEFAULT_QDQ_AXIS = 1;
-      int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
-      if (axis < 0) axis += io_rank;
-      RETURN_IF_NOT(axis == 0 || axis == 1,
-                    "Only axis 0 or 1 is supported for block quantization LPBQ conversion");
-
-      // Scale shape: [num_blocks_per_channel, num_channels] when axis=0
-      //              [num_channels, num_blocks_per_channel] when axis=1
-      const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
-      const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);
-
-      // LPBQ requires symmetric quantization (all zero-points must be zero).
-      for (const int32_t zp : zero_points) {
-        RETURN_IF_NOT(zp == 0, "LPBQ conversion requires symmetric quantization");
+    if (!qnn_model_wrapper.GetModelSettings().enable_block_quant_weight_optimization) {
+      ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                  ("Block quant weight optimization disabled, falling back to float BQ path"));
+      return Ort::Status();
+    }
+    // ONNX block quantization -> QNN LPBQ (BLOCKWISE_EXPANSION) conversion is only supported for INT4 weights.
+    if (io_def.type != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4) {
+      ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                  ("BQ to LPBQ conversion only supported for int4 weights, falling back to float BQ path"));
+      return Ort::Status();
+    }
+    // LPBQ requires symmetric quantization (all zero-points must be zero).
+    for (const int32_t zp : zero_points) {
+      if (zp != 0) {
+        ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                    ("BQ to LPBQ conversion requires symmetric quantization, falling back to float BQ path"));
+        return Ort::Status();
       }
+    }
 
-      // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
-      // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
-      std::vector<float> bq_scales_bm;
-      if (axis == 0) {
-        bq_scales_bm = std::move(scales);
-      } else {
-        // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
-        bq_scales_bm.resize(scales.size());
-        for (uint32_t c = 0; c < num_channels; ++c) {
-          for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
-            bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
-                scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
-          }
+    std::vector<uint32_t> io_shape;
+    RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(io_def.shape, io_shape), "Cannot get shape");
+    const int32_t io_rank = static_cast<int32_t>(io_shape.size());
+
+    // Get scale tensor shape to determine block/channel dimensions.
+    // Scale tensor may be rank 2 (e.g., MatMul/Gemm) or higher rank (e.g., Conv with rank-4 weights).
+    // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
+    const std::vector<int64_t> scale_shape =
+        utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
+    RETURN_IF_NOT(scale_shape.size() >= 2,
+                  "Block quantization scale tensors must have at least rank 2 for LPBQ conversion");
+    RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
+                  "Block quantization scale tensor dimensions must be positive");
+    RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),
+                  "Block quantization scale tensor shape product must equal number of scales");
+
+    // Determine block axis (= ONNX axis attribute).
+    constexpr int64_t DEFAULT_QDQ_AXIS = 1;
+    int64_t axis = ort_quant_params->axis.value_or(DEFAULT_QDQ_AXIS);
+    if (axis < 0) axis += io_rank;
+    RETURN_IF_NOT(axis == 0 || axis == 1,
+                  "Only axis 0 or 1 is supported for block quantization LPBQ conversion");
+
+    // Scale shape: [num_blocks_per_channel, num_channels] when axis=0
+    //              [num_channels, num_blocks_per_channel] when axis=1
+    const uint32_t num_blocks_per_channel = static_cast<uint32_t>(scale_shape[axis]);
+    const uint32_t num_channels = static_cast<uint32_t>(scale_shape[1 - axis]);
+
+    // The conversion algorithm expects scales in block-major order [num_blocks, num_channels].
+    // If axis=1 the raw tensor is channel-major [num_channels, num_blocks]; transpose it.
+    std::vector<float> bq_scales_bm;
+    if (axis == 0) {
+      bq_scales_bm = std::move(scales);
+    } else {
+      // Transpose [num_channels, num_blocks] -> [num_blocks, num_channels]
+      bq_scales_bm.resize(scales.size());
+      for (uint32_t c = 0; c < num_channels; ++c) {
+        for (uint32_t b = 0; b < num_blocks_per_channel; ++b) {
+          bq_scales_bm[static_cast<size_t>(b) * num_channels + c] =
+              scales[static_cast<size_t>(c) * num_blocks_per_channel + b];
         }
       }
+    }
 
-      // Apply BQ -> LPBQ algorithm
-      // Use bitwidth=4 for int4 weights and bitwidth=8 for int8 weights.
-      std::vector<float> per_channel_scales;
-      std::vector<uint8_t> per_block_int_scales;
-      std::vector<int32_t> lpbq_offsets;
-      const uint32_t bitwidth = is_int4_type ? 4u : 8u;
-      RETURN_IF_ERROR(utils::ConvertBlockQuantScalesToLpbq(
-          bq_scales_bm, zero_points,
-          num_blocks_per_channel, num_channels, bitwidth,
-          per_channel_scales, per_block_int_scales, lpbq_offsets));
-
-      // QNN LPBQ axis = the non-block axis in the weight tensor.
-      // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
-      const int64_t qnn_axis = 1 - axis;
-
-      *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
-                                    qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
+    // Apply BQ -> LPBQ algorithm
+    std::vector<float> per_channel_scales;
+    std::vector<uint8_t> per_block_int_scales;
+    std::vector<int32_t> lpbq_offsets;
+    const uint32_t bitwidth = 4u;
+    Ort::Status status = utils::ConvertBlockQuantScalesToLpbq(bq_scales_bm, zero_points, num_blocks_per_channel,
+                                                              num_channels, bitwidth, per_channel_scales,
+                                                              per_block_int_scales, lpbq_offsets);
+    if (!status.IsOK()) {
+      ORT_CXX_LOG(qnn_model_wrapper.GetLogger(), ORT_LOGGING_LEVEL_VERBOSE,
+                  ("BQ to LPBQ conversion failed, falling back to float BQ path: " + std::string(status.GetErrorMessage())).c_str());
+      return Ort::Status();
     }
+
+    // QNN LPBQ axis = the non-block axis in the weight tensor.
+    // For ONNX axis=0 (block axis=0): QNN axis=1; for axis=1: QNN axis=0.
+    const int64_t qnn_axis = 1 - axis;
+
+    *this = QnnQuantParamsWrapper(per_channel_scales, per_block_int_scales, lpbq_offsets,
+                                  qnn_axis, ort_quant_params->block_size.value(), is_int4_type);
   } else {
     return MAKE_EP_FAIL("Unexpected tensor kind for QuantParamsWrapper::Init()");
   }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
index 726e276b1c..a4d5675dc7 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
@@ -203,7 +203,7 @@ class QnnQuantParamsWrapper {
 
   // Stores LowPowerBlockQuant encodings meta like number of per_channel_scales, per-block scales,
   // and blockwise_expansion_data
-  uint32_t per_channel_scales_size_;
+  uint32_t per_channel_scales_size_ = 0;
   std::unique_ptr<uint8_t[]> block_scales_data_;
   std::unique_ptr<char[]> blockwise_expansion_data_;
 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index f1e88d87ee..449b53783b 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -909,11 +909,11 @@ QnnEp::QnnEp(QnnEpFactory& factory,
   }
 #endif
 
-  model_settings_.convert_bq_to_lpbq = ParseBoolOption(ort_api,
-                                                       session_options_,
-                                                       FormatEPConfigKey("convert_bq_to_lpbq"),
-                                                       true,
-                                                       logger_);
+  model_settings_.enable_block_quant_weight_optimization = ParseBoolOption(ort_api,
+                                                                           session_options_,
+                                                                           FormatEPConfigKey("enable_block_quant_weight_optimization"),
+                                                                           false,
+                                                                           logger_);
 
   if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
     ORT_CXX_LOG(logger_,
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index 76d3bce210..f5ad17ce1c 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -3205,7 +3205,7 @@ ProviderOptions GetBQConvProviderOptions() {
   ProviderOptions opts;
   opts["backend_type"] = "htp";
   opts["offload_graph_io_quantization"] = "0";
-  opts["convert_bq_to_lpbq"] = "0";
+  opts["enable_block_quant_weight_optimization"] = "0";
 #if defined(__linux__) && !defined(__aarch64__)
   // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support.
   // On real ARM64 hardware, the SoC model is auto-detected by QNN EP.
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cc b/onnxruntime/test/providers/qnn/matmul_test.cc
index 2d7b489b43..2a08cdf7a9 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cc
+++ b/onnxruntime/test/providers/qnn/matmul_test.cc
@@ -362,7 +362,7 @@ ProviderOptions GetBQMatMulProviderOptions() {
   ProviderOptions opts;
   opts["backend_type"] = "htp";
   opts["offload_graph_io_quantization"] = "0";
-  opts["convert_bq_to_lpbq"] = "0";
+  opts["enable_block_quant_weight_optimization"] = "0";
 #if defined(__linux__) && !defined(__aarch64__)
   // On the x86_64 Linux HTP simulator, specify SM8850 to enable BW_FLOAT_BLOCK support.
   // On real ARM64 hardware, the SoC model is auto-detected by QNN EP.

From ef2680e822c6ddf2471ef4799bffd0a50eee446e Mon Sep 17 00:00:00 2001
From: qti-ashimaj <ashimaj@qti.qualcomm.com>
Date: Fri, 19 Jun 2026 13:04:08 +0530
Subject: [PATCH 8/8] fix scale size

---
 .../core/providers/qnn/builder/qnn_quant_params_wrapper.cc    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
index f19ca37974..4eba4f1337 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -624,8 +624,8 @@ Ort::Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper
     // Only the first two dimensions (indexed by onnx_axis and 1 - onnx_axis) are used for LPBQ conversion.
     const std::vector<int64_t> scale_shape =
         utils::GetInitializerShape(ort_quant_params->scale, qnn_model_wrapper.GetOrtApi());
-    RETURN_IF_NOT(scale_shape.size() >= 2,
-                  "Block quantization scale tensors must have at least rank 2 for LPBQ conversion");
+    RETURN_IF_NOT(scale_shape.size() >= 2 && scale_shape.size() <= 4,
+                  "Block quantization scale tensors must have rank between 2 and 4 for LPBQ conversion");
     RETURN_IF_NOT(scale_shape[0] > 0 && scale_shape[1] > 0,
                   "Block quantization scale tensor dimensions must be positive");
     RETURN_IF_NOT(scale_shape[0] * scale_shape[1] == static_cast<int64_t>(scales.size()),