onnxruntime · qti-ashimaj · Jun 4, 2026
@@ -120,7 +120,18 @@ Ort::Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
     int64_t quant_axis = 0;
     RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis));
 
-    if (is_per_axis_quant) {
+    const bool is_block_quant = input_1.quant_param.has_value() &&
+                                input_1.quant_param->block_size.has_value() &&
+                                input_1.quant_param->block_size.value() > 0;
+
+    if (is_block_quant) {
+      // Block quantization: ConvTranspose weights must use axis 0 (its per-channel check below uses axis 1).
+      if (conv_type == OnnxConvType::kConvTranspose) {
+        RETURN_IF_NOT(quant_axis == 0, "ConvTranspose's input[1] must use axis == 0 for block quantization");
+      } else {
+        RETURN_IF_NOT(quant_axis == 1, "Conv's input[1] must use axis == 1 for block quantization");
+      }
+    } else if (is_per_axis_quant) {
       if (conv_type == OnnxConvType::kConvTranspose) {
         RETURN_IF_NOT(quant_axis == 1,
                       "ConvTranspose's input[1] must be use axis == 1 for per-channel quantization");
@@ -220,8 +231,8 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
         return MAKE_EP_FAIL(("QNN EP: Unexpected convolution op type: " + node_unit.OpType()).c_str());
       }
 
-      // Transpose quantization parameter's axis if this is using per-channel quantization.
-      if (input_info.quant_param.IsPerChannel()) {
+      // Transpose quantization parameter's axis if this is using per-channel or LPBQ quantization.
+      if (input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ()) {
         std::vector<size_t> perm;
         if (is_3d) {
           perm = conv_type == OnnxConvType::kConv ? nchw2hwcn_perm_3d : cnhw2hwcn_perm_3d;
@@ -234,7 +245,7 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
       }
     } else {
       // Add transpose node above weight input.
-      RETURN_IF(input_info.quant_param.IsPerChannel(),
+      RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ(),
                 "Non-constant Conv inputs only support per-tensor quantization");
       bool is_graph_input = qnn_model_wrapper.IsGraphInput(input1_name);
       ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE, ("Add HWCN Transpose node after input: " + input1_name).c_str());
@@ -333,9 +344,12 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
     TensorInfo bias_info = {};
     RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(bias_input, bias_info));
 
-    // For static quantized bias, handle requantization if needed
-    if (bias_info.is_initializer && bias_info.quant_param.IsQuantized()) {
-      // Get activation and weight quantization parameters
+    bool bias_handled = false;
+
+    // For a static bias when activation and weight are both quantized, ensure
+    // bias_scale = activation_scale * weight_scale.
+    // This applies whether the bias is already quantized (requantize if needed) or float (quantize it).
+    if (bias_info.is_initializer) {
       TensorInfo input0_info = {};
       TensorInfo input1_info = {};
       RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
@@ -351,186 +365,102 @@ Ort::Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrap
           activation_scale = act_quant_params.bwScaleOffsetEncoding.scale;
         }
 
-        // Get weight scales (per-tensor or per-channel)
         std::vector<float> weights_scales;
-
-        if (input1_info.quant_param.IsPerTensor()) {
-          // Handle per-tensor quantization (encodings 0 and 2)
-          const auto& weight_quant_params = input1_info.quant_param.Get();
-
-          if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
-            weights_scales.push_back(weight_quant_params.scaleOffsetEncoding.scale);
-          } else if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET) {
-            weights_scales.push_back(weight_quant_params.bwScaleOffsetEncoding.scale);
-          }
-        } else {
-          // Handle per-channel quantization (encodings 1 and 3)
-          const auto& weight_quant_params = input1_info.quant_param.Get();
-
-          if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
-            if (weight_quant_params.axisScaleOffsetEncoding.scaleOffset != nullptr &&
-                weight_quant_params.axisScaleOffsetEncoding.numScaleOffsets > 0) {
-              for (size_t i = 0; i < weight_quant_params.axisScaleOffsetEncoding.numScaleOffsets; ++i) {
-                weights_scales.push_back(weight_quant_params.axisScaleOffsetEncoding.scaleOffset[i].scale);
-              }
-            }
-          } else if (weight_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
-            if (weight_quant_params.bwAxisScaleOffsetEncoding.scales != nullptr &&
-                weight_quant_params.bwAxisScaleOffsetEncoding.numElements > 0) {
-              for (size_t i = 0; i < weight_quant_params.bwAxisScaleOffsetEncoding.numElements; ++i) {
-                weights_scales.push_back(weight_quant_params.bwAxisScaleOffsetEncoding.scales[i]);
-              }
+        RETURN_IF_ERROR(utils::GetWeightQuantScales(input1_info.quant_param, bias_info.shape[0], weights_scales));
+        RETURN_IF_NOT(!weights_scales.empty(), "No weight scales found for bias quantization");
+
+        if (bias_info.quant_param.IsQuantized()) {
+          // Bias is already quantized: check if scales match, requantize if needed.
+          std::vector<float> current_scales;
+          std::vector<int32_t> current_offsets;
+          int32_t quant_axis = 0;
+          RETURN_IF_ERROR(utils::GetBiasQuantScalesAndOffsets(bias_info.quant_param, current_scales, current_offsets, quant_axis));
+
+          const size_t num_channels = current_scales.size();
+          bool needs_requantization = false;
+          for (size_t i = 0; i < num_channels && !needs_requantization; ++i) {
+            const float weight_scale = (i < weights_scales.size()) ? weights_scales[i] : weights_scales[0];
+            if (current_offsets[i] != 0 ||
+                !utils::CheckBiasScaleMatch(current_scales[i], weight_scale, activation_scale, 1e-5f)) {
+              needs_requantization = true;
             }
           }
-        }
 
-        // Safety check to prevent crashes
-        RETURN_IF_NOT(!weights_scales.empty(), "No weight scales found for quantized weights");
-
-        // Check bias quantization type
-        if (bias_info.quant_param.IsPerTensor()) {
-          float bias_scale = 0.0f;
-          int32_t bias_offset = 0;
-          const auto& bias_quant_params = bias_info.quant_param.Get();
-          if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
-            bias_scale = bias_quant_params.scaleOffsetEncoding.scale;
-            bias_offset = bias_quant_params.scaleOffsetEncoding.offset;
-          } else if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET) {
-            bias_scale = bias_quant_params.bwScaleOffsetEncoding.scale;
-            bias_offset = bias_quant_params.bwScaleOffsetEncoding.offset;
-          } else {
-            return MAKE_EP_FAIL("Unsupported bias quantization encoding for per-tensor quantization.");
-          }
+          if (needs_requantization) {
+            ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE, ("Requantizing bias " + bias_input.name).c_str());
 
-          // Check if bias_offset = 0 AND bias_scale = (weights_scale[0] * activation_scale)
-          if (bias_offset == 0 && utils::CheckBiasScaleMatch(bias_scale, weights_scales[0], activation_scale, 1e-5f)) {
-            // No change needed - scales match and offset is 0
-          } else {
-            ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE, ("Requantizing per-tensor bias " + bias_input.name).c_str());
-            // Need to requantize the bias tensor
             std::vector<uint8_t> original_bias_data;
             RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(bias_info.initializer_tensor, original_bias_data));
 
-            std::vector<float> current_scales = {bias_scale};
-            std::vector<int32_t> current_offsets = {bias_offset};
             std::vector<uint8_t> requantized_bias_data;
             std::vector<float> new_scales;
             std::vector<int32_t> new_offsets;
-
+            const std::optional<int64_t> axis_opt = (num_channels > 1) ? std::optional<int64_t>(quant_axis) : std::nullopt;
             RETURN_IF_ERROR(utils::RequantizeBiasTensor(
                 original_bias_data, bias_info.shape, current_scales, current_offsets,
                 weights_scales, activation_scale, bias_info.qnn_data_type,
-                requantized_bias_data, new_scales, new_offsets));
+                requantized_bias_data, new_scales, new_offsets, axis_opt));
+
+            QnnQuantParamsWrapper new_quant_params;
+            if (new_scales.size() == 1) {
+              new_quant_params = QnnQuantParamsWrapper(new_scales[0], new_offsets[0]);
+            } else {
+              new_quant_params = QnnQuantParamsWrapper(new_scales, new_offsets, quant_axis, false);
+            }
 
-            // Create new tensor wrapper with requantized data
             std::string bias_name = bias_input.name;
-            QnnQuantParamsWrapper new_quant_params(new_scales[0], new_offsets[0]);
             QnnTensorWrapper bias_tensorwrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_info.qnn_data_type,
                                                 std::move(new_quant_params), std::vector<uint32_t>(bias_info.shape),
                                                 std::move(requantized_bias_data));
-            RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)), "Failed to add requantized bias tensor.");
+            RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)),
+                          "Failed to add requantized bias tensor.");
             input_names.push_back(bias_name);
-            return Ort::Status();  // We've handled the bias, return early
+            bias_handled = true;
           }
         } else {
-          // Handle per-channel bias
-          const auto& bias_quant_params = bias_info.quant_param.Get();
-
-          if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET ||
-              bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
-            // Extract scales and offsets based on encoding type
-            std::vector<float> current_scales;
-            std::vector<int32_t> current_offsets;
-            int32_t quant_axis = 0;
-            size_t num_channels = 0;
-
-            if (bias_quant_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
-              // Safety checks for AXIS_SCALE_OFFSET encoding
-              RETURN_IF_NOT(bias_quant_params.axisScaleOffsetEncoding.scaleOffset != nullptr,
-                            "Invalid bias quantization parameters: scaleOffset is null");
-              RETURN_IF_NOT(bias_quant_params.axisScaleOffsetEncoding.numScaleOffsets > 0,
-                            "Invalid bias quantization parameters: numScaleOffsets is zero");
-
-              num_channels = bias_quant_params.axisScaleOffsetEncoding.numScaleOffsets;
-              quant_axis = bias_quant_params.axisScaleOffsetEncoding.axis;
-              for (size_t i = 0; i < num_channels; ++i) {
-                current_scales.push_back(bias_quant_params.axisScaleOffsetEncoding.scaleOffset[i].scale);
-                current_offsets.push_back(bias_quant_params.axisScaleOffsetEncoding.scaleOffset[i].offset);
-              }
-            } else {  // QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET
-              // Safety checks for BW_AXIS_SCALE_OFFSET encoding
-              RETURN_IF_NOT(bias_quant_params.bwAxisScaleOffsetEncoding.scales != nullptr,
-                            "Invalid bias quantization parameters: scales is null");
-              RETURN_IF_NOT(bias_quant_params.bwAxisScaleOffsetEncoding.offsets != nullptr,
-                            "Invalid bias quantization parameters: offsets is null");
-              RETURN_IF_NOT(bias_quant_params.bwAxisScaleOffsetEncoding.numElements > 0,
-                            "Invalid bias quantization parameters: numElements is zero");
-
-              num_channels = bias_quant_params.bwAxisScaleOffsetEncoding.numElements;
-              quant_axis = bias_quant_params.bwAxisScaleOffsetEncoding.axis;
-              for (size_t i = 0; i < num_channels; ++i) {
-                current_scales.push_back(bias_quant_params.bwAxisScaleOffsetEncoding.scales[i]);
-                current_offsets.push_back(bias_quant_params.bwAxisScaleOffsetEncoding.offsets[i]);
-              }
-            }
-
-            // Check if all offsets are 0 and scales match expected values
-            bool all_offsets_zero = true;
-            bool all_scales_match = true;
-
-            for (size_t i = 0; i < num_channels; ++i) {
-              if (current_offsets[i] != 0) {
-                all_offsets_zero = false;
-              }
-
-              // Calculate expected scale for this channel
-              // Use the corresponding weight scale if available, otherwise use the first one
-              float weight_scale = (i < weights_scales.size()) ? weights_scales[i] : weights_scales[0];
-
-              if (!utils::CheckBiasScaleMatch(current_scales[i], weight_scale, activation_scale, 1e-5f)) {
-                all_scales_match = false;
-              }
-            }
-
-            if (all_offsets_zero && all_scales_match) {
-              // No change needed - scales match and offsets are 0
-            } else {
-              // Need to requantize per-channel bias
-              ORT_CXX_LOG(logger,
-                          ORT_LOGGING_LEVEL_VERBOSE,
-                          ("Requantizing per-channel bias " + bias_input.name).c_str());
-
-              // Get current bias data and requantize
-              std::vector<uint8_t> original_bias_data;
-              RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(bias_info.initializer_tensor, original_bias_data));
-
-              std::vector<uint8_t> requantized_bias_data;
-              std::vector<float> new_scales;
-              std::vector<int32_t> new_offsets;
-
-              RETURN_IF_ERROR(utils::RequantizeBiasTensor(
-                  original_bias_data, bias_info.shape, current_scales, current_offsets,
-                  weights_scales, activation_scale, bias_info.qnn_data_type,
-                  requantized_bias_data, new_scales, new_offsets,
-                  quant_axis));
-
-              // Create new tensor wrapper with requantized data
-              std::string bias_name = bias_input.name;
-              QnnQuantParamsWrapper new_quant_params(new_scales, new_offsets, quant_axis, false);
-              QnnTensorWrapper bias_tensorwrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_info.qnn_data_type,
-                                                  std::move(new_quant_params), std::vector<uint32_t>(bias_info.shape),
-                                                  std::move(requantized_bias_data));
-              RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)), "Failed to add requantized bias tensor.");
-              input_names.push_back(bias_name);
-              return Ort::Status();  // We've handled the bias, return early
-            }
+          // Bias is float: quantize using bias_scale = activation_scale * weight_scale.
+          ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_VERBOSE,
+                      ("Quantizing float bias " + bias_input.name + " using activation_scale * weight_scale[c]").c_str());
+
+          std::vector<uint8_t> original_bias_data;
+          RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(bias_info.initializer_tensor, original_bias_data));
+
+          const size_t num_channels = bias_info.shape[0];
+          RETURN_IF_NOT(original_bias_data.size() == num_channels * sizeof(float),
+                        "Unexpected bias data size for float bias quantization");
+          const float* bias_float_data = reinterpret_cast<const float*>(original_bias_data.data());
+
+          std::vector<uint8_t> quantized_bias_data;
+          std::vector<float> new_scales;
+          std::vector<int32_t> new_offsets;
+          RETURN_IF_ERROR(utils::QuantizeFloatBiasTensor(
+              gsl::span<const float>(bias_float_data, num_channels),
+              weights_scales, activation_scale,
+              quantized_bias_data, new_scales, new_offsets));
+
+          QnnQuantParamsWrapper new_quant_params;
+          if (weights_scales.size() == 1) {
+            new_quant_params = QnnQuantParamsWrapper(new_scales[0], 0);
+          } else {
+            new_quant_params = QnnQuantParamsWrapper(new_scales, new_offsets, /*axis=*/0, /*is_int4=*/false);
           }
+
+          std::string bias_name = bias_input.name;
+          QnnTensorWrapper bias_tensorwrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
+                                              std::move(new_quant_params), std::vector<uint32_t>(bias_info.shape),
+                                              std::move(quantized_bias_data));
+          RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensorwrapper)),
+                        "Failed to add quantized float bias tensor.");
+          input_names.push_back(bias_name);
+          bias_handled = true;
         }
       }
     }
 
-    // Process bias normally (non-quantized or static non-quantized or scales already match)
-    RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, bias_input, logger, input_names));
+    if (!bias_handled) {
+      // Process bias normally: non-initializer, or activation/weight not quantized, or scales already match.
+      RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, bias_input, logger, input_names));
+    }
   }
 
 #if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
@@ -675,7 +605,7 @@ Ort::Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrappe
       });
 
       // The reshape (unsqueeze) may require us to shift the quant parameter's axis.
-      if (input_info.quant_param.IsPerChannel()) {
+      if (input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ()) {
         RETURN_IF_ERROR(input_info.quant_param.HandleUnsqueeze<uint32_t>(input_info.shape, shape_2d));
       }
 
@@ -709,16 +639,16 @@ Ort::Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrappe
         return MAKE_EP_FAIL(("QNN EP: Unexpected convolution op type: " + node_unit.OpType()).c_str());
       }
 
-      // Transpose quantization parameter's axis if this is using per-channel quantization.
-      if (input_info.quant_param.IsPerChannel()) {
+      // Transpose quantization parameter's axis if this is using per-channel or LPBQ quantization.
+      if (input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ()) {
         const std::vector<size_t>& perm = conv_type == OnnxConvType::kConv ? nchw2hwcn_perm : cnhw2hwcn_perm;
         std::vector<size_t> perm_inv(perm.size());
         RETURN_IF_ERROR(utils::InvertPerm<size_t>(perm, perm_inv));
         RETURN_IF_ERROR(input_info.quant_param.HandleTranspose<size_t>(perm_inv));
       }
     } else {
       // Dynamic weight: Add nodes to reshape to 2D, and then transpose.
-      RETURN_IF(input_info.quant_param.IsPerChannel(),
+      RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsLPBQ(),
                 "Non-constant Conv inputs only support per-tensor quantization");
 
       if (!qnn_model_wrapper.IsQnnTensorWrapperExist(input1_name)) {