From 9cd77eca5ec6a2f834a6628456c5e28cbe8ba2ba Mon Sep 17 00:00:00 2001 From: Matthew Sinclair Date: Tue, 5 May 2026 13:35:08 -0700 Subject: [PATCH] Enable FP16 activations in MatMulNBits ### Description * Remove FP32 input restriction in MatMulNBits op builder. Note that the scales initializer must be cast to FP32 in the op builder as QNN currently requires FP32 scales at the API level. * Add FP16 MatMulNBits unit tests. ### Motivation and Context * Enable w4a16 LLMs on the GPU for faster inferencing. --- .../opbuilder/matmulnbits_op_builder.cc | 42 +++++-- .../test/providers/qnn/matmulnbits_test.cc | 114 ++++++++++++------ .../graph_transform_test_builder.h | 14 +++ .../test/unittest_util/qdq_test_utils.h | 4 +- 4 files changed, 125 insertions(+), 49 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmulnbits_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmulnbits_op_builder.cc index de38f02675..75e2461b86 100755 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmulnbits_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmulnbits_op_builder.cc @@ -21,7 +21,7 @@ namespace qnn { Inputs - A : : (fp16/32) : [batch_size{1}, sequence_len, K] - B : Init : (uint8) : [N, K/block_size, (block_size * bits) / 8] - - scales : Init : (fp32) : [N * K / block_size] + - scales : Init : (fp16/32) : [N * K / block_size] - zero_points : (optional)Init : (uint8) : [N * K / (block_size * 2)] - bias : (optional)Init : [fp16/32] : [N] Outputs @@ -123,8 +123,6 @@ Ort::Status MatMulNBitsOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapp const auto& inputs = node_unit.Inputs(); // 1. input : Datatype should be float16 or float32 - // Float16 Dlc serialization failing, Skipping float16 support for this op builder - // TODO :: Add Float16 Support { const OrtNodeUnitIODef& input_tensor = inputs[0]; TensorInfo input_info{}; @@ -132,7 +130,7 @@ Ort::Status MatMulNBitsOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapp RETURN_IF_ERROR(utils::GetQnnDataType(input_tensor.quant_param.has_value(), input_tensor.type, input_datatype)); - RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32, "Unsupported Input datatype"); + RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32 && input_datatype != QNN_DATATYPE_FLOAT_16, "Unsupported Input datatype"); } // 2. weight : weight supported with packed int4 into int8. @@ -151,7 +149,7 @@ Ort::Status MatMulNBitsOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapp "Invalid B dimensions. Qnn Gpu Only Supports MatMulNBits with bits == 4 in packed format"); } - // 3. scales : scales only float32 datatype + // 3. scales : Datatype should be float16 or float32 { const OrtNodeUnitIODef& input_tensor = inputs[2]; TensorInfo input_info{}; @@ -159,7 +157,8 @@ Ort::Status MatMulNBitsOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapp RETURN_IF_ERROR(utils::GetQnnDataType(input_tensor.quant_param.has_value(), input_tensor.type, input_datatype)); - RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32, "Unsupported Input datatype"); + RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32 && input_datatype != QNN_DATATYPE_FLOAT_16, + "Unsupported scales datatype"); } // 4. If input 3 exists, it has to be zero point. @@ -254,11 +253,34 @@ Ort::Status MatMulNBitsOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapp std::vector per_block_uint8_scale; const OrtValueInfo* scale_tensor_proto = qnn_model_wrapper.GetConstantTensor(scales_tensor.name); RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(scale_tensor_proto, per_block_uint8_scale)); - RETURN_IF_NOT(per_block_uint8_scale.size() == (num_blocks * sizeof(float)), + + const OrtTypeInfo* type_info = nullptr; + const auto& ort_api = qnn_model_wrapper.GetOrtApi(); + ORT_CXX_RETURN_ON_API_FAIL(ort_api.GetValueInfoTypeInfo(scale_tensor_proto, &type_info)); + const OrtTensorTypeAndShapeInfo* tensor_type_and_shape_info = nullptr; + ORT_CXX_RETURN_ON_API_FAIL(ort_api.CastTypeInfoToTensorInfo(type_info, &tensor_type_and_shape_info)); + ONNXTensorElementDataType onnx_data_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + ORT_CXX_RETURN_ON_API_FAIL(ort_api.GetTensorElementType(tensor_type_and_shape_info, &onnx_data_type)); + + RETURN_IF(onnx_data_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && + onnx_data_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, + "Unsupported scales datatype"); + + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_data_type); + RETURN_IF_NOT(per_block_uint8_scale.size() == (num_blocks * elem_byte_size), "Scale Initializer Invalid Size"); - float* per_block_float_scale_ptr = reinterpret_cast(per_block_uint8_scale.data()); - const std::vector per_block_float_scale(per_block_float_scale_ptr, - per_block_float_scale_ptr + num_blocks); + + std::vector per_block_float_scale; + if (onnx_data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + float* per_block_float_scale_ptr = reinterpret_cast(per_block_uint8_scale.data()); + per_block_float_scale = std::vector(per_block_float_scale_ptr, per_block_float_scale_ptr + num_blocks); + } else { + Ort::Float16_t* per_block_fp16_scale_ptr = reinterpret_cast(per_block_uint8_scale.data()); + per_block_float_scale.reserve(num_blocks); + for (int64_t i = 0; i < num_blocks; i++) { + per_block_float_scale.emplace_back(static_cast(per_block_fp16_scale_ptr[i])); + } + } // 2.3 Quantization Offsets : QNN Support only symmetric quantization with default value of 0 std::vector per_block_int32_offset(num_blocks, 0); diff --git a/onnxruntime/test/providers/qnn/matmulnbits_test.cc b/onnxruntime/test/providers/qnn/matmulnbits_test.cc index c558581ce5..a446f99771 100644 --- a/onnxruntime/test/providers/qnn/matmulnbits_test.cc +++ b/onnxruntime/test/providers/qnn/matmulnbits_test.cc @@ -17,32 +17,33 @@ namespace test { constexpr int QBits = 4; -void QuantizeDequantize(std::vector& raw_vals, +template +void QuantizeDequantize(std::vector& raw_vals, std::vector& quant_vals, - std::vector& scales, + std::vector& scales, std::vector* zp, int32_t N, int32_t K, int32_t block_size) { - QuantizeBlockwise(quant_vals.data(), - scales.data(), - zp != nullptr ? zp->data() : nullptr, - raw_vals.data(), - block_size, - true, - K, - N, - N); + QuantizeBlockwise(quant_vals.data(), + scales.data(), + zp != nullptr ? zp->data() : nullptr, + raw_vals.data(), + block_size, + true, + K, + N, + N); // Note that raw_vals is NxK after dequant - DequantizeBlockwise(raw_vals.data(), // dequantized output - quant_vals.data(), // quantized input - scales.data(), // quantization scales - zp != nullptr ? zp->data() : nullptr, // quantization zero points - block_size, // quantization block size - true, // columnwise quantization - K, // number of rows - N); // number of columns + DequantizeBlockwise(raw_vals.data(), // dequantized output + quant_vals.data(), // quantized input + scales.data(), // quantization scales + zp != nullptr ? zp->data() : nullptr, // quantization zero points + block_size, // quantization block size + true, // columnwise quantization + K, // number of rows + N); // number of columns } struct TestParams4Bits { @@ -59,6 +60,7 @@ struct TestParams4Bits { bool has_bias{false}; }; +template static void RunMatMul4BitsTest(const TestParams4Bits params, ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All, const std::string& backend_name = "gpu", @@ -71,13 +73,15 @@ static void RunMatMul4BitsTest(const TestParams4Bits params, std::vector input_names; RandomValueGenerator random{1234}; - std::vector input0_vals(random.Gaussian(AsSpan({params.batch_count, params.M, params.K}), - 0.0f, - 0.25f)); - std::vector input1_f_vals(random.Gaussian(AsSpan({params.K, params.N}), 0.0f, 0.25f)); - - auto input0_def = TestInputDef({params.batch_count, params.M, params.K}, false, input0_vals); - MakeTestInput(builder, "input0", input0_def); + std::vector input0_vals(random.Gaussian(AsSpan({params.batch_count, params.M, params.K}), + static_cast(0.0f), + static_cast(0.25f))); + std::vector input1_f_vals(random.Gaussian(AsSpan({params.K, params.N}), + static_cast(0.0f), + static_cast(0.25f))); + + auto input0_def = TestInputDef({params.batch_count, params.M, params.K}, false, input0_vals); + MakeTestInput(builder, "input0", input0_def); input_names.push_back("input0"); int64_t k_blocks = (params.K + params.block_size - 1) / params.block_size; @@ -88,7 +92,7 @@ static void RunMatMul4BitsTest(const TestParams4Bits params, size_t q_zp_size_in_bytes = static_cast(params.N * zero_point_blob_size); // packed as UInt4x2 std::vector input1_vals(q_data_size_in_bytes); - std::vector scales(q_scale_size); + std::vector scales(q_scale_size); // TODO // Not sure why zp is not calculated from QuantizeDequantize. Since QNN GPU only support zp=8, hardcode it here // as workaround. @@ -106,8 +110,8 @@ static void RunMatMul4BitsTest(const TestParams4Bits params, MakeTestInput(builder, "input1", input1_def); input_names.push_back("input1"); - auto scales_def = TestInputDef({params.N, k_blocks}, true, scales); - MakeTestInput(builder, "scales", scales_def); + auto scales_def = TestInputDef({params.N, k_blocks}, true, scales); + MakeTestInput(builder, "scales", scales_def); input_names.push_back("scales"); if (params.has_zero_point) { @@ -144,40 +148,76 @@ static void RunMatMul4BitsTest(const TestParams4Bits params, // QNN GPU only support FP16 activations and Q4_0 weights, with zero_points = 8 // Accumulation with larger channel accumulates more error. Set higher abs_error with respect to K. -TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M1_N128_K512_withZp) { +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M1_N128_K512_withZp) { + TestParams4Bits params; + params.M = 1; + params.N = 128; + params.K = 512; + params.has_zero_point = true; + RunMatMul4BitsTest(params); +} + +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M1_N128_K512_withZp) { TestParams4Bits params; params.M = 1; params.N = 128; params.K = 512; params.has_zero_point = true; - RunMatMul4BitsTest(params); + RunMatMul4BitsTest(params); +} + +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M1_N128_K512) { + TestParams4Bits params; + params.M = 1; + params.N = 128; + params.K = 512; + params.has_zero_point = false; + RunMatMul4BitsTest(params); } -TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M1_N128_K512) { +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M1_N128_K512) { TestParams4Bits params; params.M = 1; params.N = 128; params.K = 512; params.has_zero_point = false; - RunMatMul4BitsTest(params); + RunMatMul4BitsTest(params); } -TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M10_N128_K512_withZp) { +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M10_N128_K512_withZp) { TestParams4Bits params; params.M = 10; params.N = 128; params.K = 512; params.has_zero_point = true; - RunMatMul4BitsTest(params); + RunMatMul4BitsTest(params); +} + +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M10_N128_K512_withZp) { + TestParams4Bits params; + params.M = 10; + params.N = 128; + params.K = 512; + params.has_zero_point = true; + RunMatMul4BitsTest(params); +} + +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M10_N128_K512) { + TestParams4Bits params; + params.M = 10; + params.N = 128; + params.K = 512; + params.has_zero_point = false; + RunMatMul4BitsTest(params); } -TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M10_N128_K512) { +TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M10_N128_K512) { TestParams4Bits params; params.M = 10; params.N = 128; params.K = 512; params.has_zero_point = false; - RunMatMul4BitsTest(params); + RunMatMul4BitsTest(params); } #endif diff --git a/onnxruntime/test/unittest_util/graph_transform_test_builder.h b/onnxruntime/test/unittest_util/graph_transform_test_builder.h index c5e9664a1b..694d9130aa 100644 --- a/onnxruntime/test/unittest_util/graph_transform_test_builder.h +++ b/onnxruntime/test/unittest_util/graph_transform_test_builder.h @@ -143,6 +143,20 @@ class RandomValueGenerator { return val; } + // Gaussian distribution for float16 + template + typename std::enable_if< + std::is_same_v || std::is_same_v, + std::vector>::type + Gaussian(gsl::span dims, TFloat16 mean, TFloat16 stddev) { + std::vector val(SizeFromDims(dims)); + std::normal_distribution distribution(static_cast(mean), static_cast(stddev)); + for (size_t i = 0; i < val.size(); ++i) { + val[i] = TFloat16(static_cast(distribution(generator_))); + } + return val; + } + // Gaussian distribution for Integer template typename std::enable_if< diff --git a/onnxruntime/test/unittest_util/qdq_test_utils.h b/onnxruntime/test/unittest_util/qdq_test_utils.h index 32a235a470..2bb7525b51 100644 --- a/onnxruntime/test/unittest_util/qdq_test_utils.h +++ b/onnxruntime/test/unittest_util/qdq_test_utils.h @@ -86,7 +86,7 @@ inline void QuantizeBlockwise( int columns, int leading_dimension) { static_assert(qbits == 4, "Only 4-bit quantization is supported"); - static_assert(std::is_same::value, "Only float type is supported"); + static_assert(std::is_same::value || std::is_same::value, "Only float type is supported"); if (!columnwise) { throw std::runtime_error("Only column-wise quantization is supported in test utilities"); @@ -223,7 +223,7 @@ inline void DequantizeBlockwise( int rows, int columns) { static_assert(qbits == 4, "Only 4-bit quantization is supported"); - static_assert(std::is_same::value, "Only float type is supported"); + static_assert(std::is_same::value || std::is_same::value, "Only float type is supported"); if (!columnwise) { throw std::runtime_error("Only column-wise dequantization is supported in test utilities");