Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace qnn {
Inputs
- A : : (fp16/32) : [batch_size{1}, sequence_len, K]
- B : Init : (uint8) : [N, K/block_size, (block_size * bits) / 8]
- scales : Init : (fp32) : [N * K / block_size]
- scales : Init : (fp16/32) : [N * K / block_size]
- zero_points : (optional)Init : (uint8) : [N * K / (block_size * 2)]
- bias : (optional)Init : [fp16/32] : [N]
Outputs
Expand Down Expand Up @@ -123,16 +123,14 @@ Ort::Status MatMulNBitsOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapp

const auto& inputs = node_unit.Inputs();
// 1. input : Datatype should be float16 or float32
// Float16 Dlc serialization failing, Skipping float16 support for this op builder
// TODO :: Add Float16 Support
{
const OrtNodeUnitIODef& input_tensor = inputs[0];
TensorInfo input_info{};
RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_tensor, input_info));
RETURN_IF_ERROR(utils::GetQnnDataType(input_tensor.quant_param.has_value(),
input_tensor.type,
input_datatype));
RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32, "Unsupported Input datatype");
RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32 && input_datatype != QNN_DATATYPE_FLOAT_16, "Unsupported Input datatype");
}

// 2. weight : weight supported with packed int4 into int8.
Expand All @@ -151,15 +149,16 @@ Ort::Status MatMulNBitsOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapp
"Invalid B dimensions. Qnn Gpu Only Supports MatMulNBits with bits == 4 in packed format");
}

// 3. scales : scales only float32 datatype
// 3. scales : Datatype should be float16 or float32
{
const OrtNodeUnitIODef& input_tensor = inputs[2];
TensorInfo input_info{};
RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_tensor, input_info));
RETURN_IF_ERROR(utils::GetQnnDataType(input_tensor.quant_param.has_value(),
input_tensor.type,
input_datatype));
RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32, "Unsupported Input datatype");
RETURN_IF(input_datatype != QNN_DATATYPE_FLOAT_32 && input_datatype != QNN_DATATYPE_FLOAT_16,
"Unsupported scales datatype");
}

// 4. If input 3 exists, it has to be zero point.
Expand Down Expand Up @@ -254,11 +253,34 @@ Ort::Status MatMulNBitsOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapp
std::vector<uint8_t> per_block_uint8_scale;
const OrtValueInfo* scale_tensor_proto = qnn_model_wrapper.GetConstantTensor(scales_tensor.name);
RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(scale_tensor_proto, per_block_uint8_scale));
RETURN_IF_NOT(per_block_uint8_scale.size() == (num_blocks * sizeof(float)),

const OrtTypeInfo* type_info = nullptr;
const auto& ort_api = qnn_model_wrapper.GetOrtApi();
ORT_CXX_RETURN_ON_API_FAIL(ort_api.GetValueInfoTypeInfo(scale_tensor_proto, &type_info));
const OrtTensorTypeAndShapeInfo* tensor_type_and_shape_info = nullptr;
ORT_CXX_RETURN_ON_API_FAIL(ort_api.CastTypeInfoToTensorInfo(type_info, &tensor_type_and_shape_info));
ONNXTensorElementDataType onnx_data_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
ORT_CXX_RETURN_ON_API_FAIL(ort_api.GetTensorElementType(tensor_type_and_shape_info, &onnx_data_type));

RETURN_IF(onnx_data_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT &&
onnx_data_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
"Unsupported scales datatype");
Comment on lines +257 to +267

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const OrtTypeInfo* type_info = nullptr;
const auto& ort_api = qnn_model_wrapper.GetOrtApi();
ORT_CXX_RETURN_ON_API_FAIL(ort_api.GetValueInfoTypeInfo(scale_tensor_proto, &type_info));
const OrtTensorTypeAndShapeInfo* tensor_type_and_shape_info = nullptr;
ORT_CXX_RETURN_ON_API_FAIL(ort_api.CastTypeInfoToTensorInfo(type_info, &tensor_type_and_shape_info));
ONNXTensorElementDataType onnx_data_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
ORT_CXX_RETURN_ON_API_FAIL(ort_api.GetTensorElementType(tensor_type_and_shape_info, &onnx_data_type));
RETURN_IF(onnx_data_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT &&
onnx_data_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
"Unsupported scales datatype");
RETURN_IF(scales_tensor.type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT &&
scale_tensor.type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
"Unsupported scales datatype");


const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_data_type);
RETURN_IF_NOT(per_block_uint8_scale.size() == (num_blocks * elem_byte_size),
Comment thread
minfhong-qti marked this conversation as resolved.
"Scale Initializer Invalid Size");
float* per_block_float_scale_ptr = reinterpret_cast<float*>(per_block_uint8_scale.data());
const std::vector<float> per_block_float_scale(per_block_float_scale_ptr,
per_block_float_scale_ptr + num_blocks);

std::vector<float> per_block_float_scale;
if (onnx_data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
float* per_block_float_scale_ptr = reinterpret_cast<float*>(per_block_uint8_scale.data());
per_block_float_scale = std::vector<float>(per_block_float_scale_ptr, per_block_float_scale_ptr + num_blocks);
} else {
Ort::Float16_t* per_block_fp16_scale_ptr = reinterpret_cast<Ort::Float16_t*>(per_block_uint8_scale.data());
per_block_float_scale.reserve(num_blocks);
Comment thread
minfhong-qti marked this conversation as resolved.
for (int64_t i = 0; i < num_blocks; i++) {
per_block_float_scale.emplace_back(static_cast<float>(per_block_fp16_scale_ptr[i]));
}
}

// 2.3 Quantization Offsets : QNN Support only symmetric quantization with default value of 0
std::vector<int32_t> per_block_int32_offset(num_blocks, 0);
Expand Down
114 changes: 77 additions & 37 deletions onnxruntime/test/providers/qnn/matmulnbits_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,33 @@ namespace test {

constexpr int QBits = 4;

void QuantizeDequantize(std::vector<float>& raw_vals,
template <typename T>
void QuantizeDequantize(std::vector<T>& raw_vals,
std::vector<uint8_t>& quant_vals,
std::vector<float>& scales,
std::vector<T>& scales,
std::vector<uint8_t>* zp,
int32_t N,
int32_t K,
int32_t block_size) {
QuantizeBlockwise<float, QBits>(quant_vals.data(),
scales.data(),
zp != nullptr ? zp->data() : nullptr,
raw_vals.data(),
block_size,
true,
K,
N,
N);
QuantizeBlockwise<T, QBits>(quant_vals.data(),
scales.data(),
zp != nullptr ? zp->data() : nullptr,
raw_vals.data(),
block_size,
true,
K,
N,
N);

// Note that raw_vals is NxK after dequant
DequantizeBlockwise<float, QBits>(raw_vals.data(), // dequantized output
quant_vals.data(), // quantized input
scales.data(), // quantization scales
zp != nullptr ? zp->data() : nullptr, // quantization zero points
block_size, // quantization block size
true, // columnwise quantization
K, // number of rows
N); // number of columns
DequantizeBlockwise<T, QBits>(raw_vals.data(), // dequantized output
quant_vals.data(), // quantized input
scales.data(), // quantization scales
zp != nullptr ? zp->data() : nullptr, // quantization zero points
block_size, // quantization block size
true, // columnwise quantization
K, // number of rows
N); // number of columns
}

struct TestParams4Bits {
Expand All @@ -59,6 +60,7 @@ struct TestParams4Bits {
bool has_bias{false};
};

template <typename DataType>
static void RunMatMul4BitsTest(const TestParams4Bits params,
ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All,
const std::string& backend_name = "gpu",
Expand All @@ -71,13 +73,15 @@ static void RunMatMul4BitsTest(const TestParams4Bits params,
std::vector<std::string> input_names;

RandomValueGenerator random{1234};
std::vector<float> input0_vals(random.Gaussian<float>(AsSpan({params.batch_count, params.M, params.K}),
0.0f,
0.25f));
std::vector<float> input1_f_vals(random.Gaussian<float>(AsSpan({params.K, params.N}), 0.0f, 0.25f));

auto input0_def = TestInputDef<float>({params.batch_count, params.M, params.K}, false, input0_vals);
MakeTestInput<float>(builder, "input0", input0_def);
std::vector<DataType> input0_vals(random.Gaussian<DataType>(AsSpan({params.batch_count, params.M, params.K}),
static_cast<DataType>(0.0f),
static_cast<DataType>(0.25f)));
std::vector<DataType> input1_f_vals(random.Gaussian<DataType>(AsSpan({params.K, params.N}),
static_cast<DataType>(0.0f),
static_cast<DataType>(0.25f)));

auto input0_def = TestInputDef<DataType>({params.batch_count, params.M, params.K}, false, input0_vals);
MakeTestInput<DataType>(builder, "input0", input0_def);
input_names.push_back("input0");

int64_t k_blocks = (params.K + params.block_size - 1) / params.block_size;
Expand All @@ -88,7 +92,7 @@ static void RunMatMul4BitsTest(const TestParams4Bits params,
size_t q_zp_size_in_bytes = static_cast<size_t>(params.N * zero_point_blob_size); // packed as UInt4x2

std::vector<uint8_t> input1_vals(q_data_size_in_bytes);
std::vector<float> scales(q_scale_size);
std::vector<DataType> scales(q_scale_size);
// TODO
// Not sure why zp is not calculated from QuantizeDequantize. Since QNN GPU only support zp=8, hardcode it here
// as workaround.
Expand All @@ -106,8 +110,8 @@ static void RunMatMul4BitsTest(const TestParams4Bits params,
MakeTestInput<uint8_t>(builder, "input1", input1_def);
input_names.push_back("input1");

auto scales_def = TestInputDef<float>({params.N, k_blocks}, true, scales);
MakeTestInput<float>(builder, "scales", scales_def);
auto scales_def = TestInputDef<DataType>({params.N, k_blocks}, true, scales);
MakeTestInput<DataType>(builder, "scales", scales_def);
input_names.push_back("scales");

if (params.has_zero_point) {
Expand Down Expand Up @@ -144,40 +148,76 @@ static void RunMatMul4BitsTest(const TestParams4Bits params,

// QNN GPU only supports FP16 activations and Q4_0 weights, with zero_points = 8.
// Accumulating over more channels accumulates more error, so set a higher abs_error as K grows.
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M1_N128_K512_withZp) {
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M1_N128_K512_withZp) {
TestParams4Bits params;
params.M = 1;
params.N = 128;
params.K = 512;
params.has_zero_point = true;
RunMatMul4BitsTest<float>(params);
}

TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M1_N128_K512_withZp) {
TestParams4Bits params;
params.M = 1;
params.N = 128;
params.K = 512;
params.has_zero_point = true;
RunMatMul4BitsTest(params);
RunMatMul4BitsTest<Ort::Float16_t>(params);
}

// Runs the 4-bit MatMulNBits test with float as the element type,
// M = 1, N = 128, K = 512, and has_zero_point turned off.
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M1_N128_K512) {
  TestParams4Bits test_config;
  test_config.has_zero_point = false;
  test_config.K = 512;
  test_config.N = 128;
  test_config.M = 1;
  RunMatMul4BitsTest<float>(test_config);
}

TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M1_N128_K512) {
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M1_N128_K512) {
TestParams4Bits params;
params.M = 1;
params.N = 128;
params.K = 512;
params.has_zero_point = false;
RunMatMul4BitsTest(params);
RunMatMul4BitsTest<Ort::Float16_t>(params);
}

TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M10_N128_K512_withZp) {
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M10_N128_K512_withZp) {
TestParams4Bits params;
params.M = 10;
params.N = 128;
params.K = 512;
params.has_zero_point = true;
RunMatMul4BitsTest(params);
RunMatMul4BitsTest<float>(params);
}

// Runs the 4-bit MatMulNBits test with Ort::Float16_t as the element type,
// M = 10, N = 128, K = 512, and has_zero_point turned on.
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M10_N128_K512_withZp) {
  TestParams4Bits test_config;
  test_config.has_zero_point = true;
  test_config.K = 512;
  test_config.N = 128;
  test_config.M = 10;
  RunMatMul4BitsTest<Ort::Float16_t>(test_config);
}

// Runs the 4-bit MatMulNBits test with float as the element type,
// M = 10, N = 128, K = 512, and has_zero_point turned off.
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp32_M10_N128_K512) {
  TestParams4Bits test_config;
  test_config.has_zero_point = false;
  test_config.K = 512;
  test_config.N = 128;
  test_config.M = 10;
  RunMatMul4BitsTest<float>(test_config);
}

TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_M10_N128_K512) {
TEST_F(QnnGPUBackendTests, MatMulNBits_Basic_Fp16_M10_N128_K512) {
TestParams4Bits params;
params.M = 10;
params.N = 128;
params.K = 512;
params.has_zero_point = false;
RunMatMul4BitsTest(params);
RunMatMul4BitsTest<Ort::Float16_t>(params);
}
#endif

Expand Down
14 changes: 14 additions & 0 deletions onnxruntime/test/unittest_util/graph_transform_test_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,20 @@ class RandomValueGenerator {
return val;
}

// Gaussian distribution for 16-bit float types.
// Only participates in overload resolution when TFloat16 is Ort::Float16_t or
// Ort::BFloat16_t. Returns SizeFromDims(dims) samples: each one is drawn from a
// std::normal_distribution<float> built from `mean` and `stddev` (both converted
// to float) and then converted to TFloat16.
template <typename TFloat16>
std::enable_if_t<
    std::is_same_v<TFloat16, Ort::Float16_t> || std::is_same_v<TFloat16, Ort::BFloat16_t>,
    std::vector<TFloat16>>
Gaussian(gsl::span<const int64_t> dims, TFloat16 mean, TFloat16 stddev) {
  std::vector<TFloat16> samples(SizeFromDims(dims));
  const float mean_f = static_cast<float>(mean);
  const float stddev_f = static_cast<float>(stddev);
  std::normal_distribution<float> sampler(mean_f, stddev_f);
  // Sampling is done in float; the 16-bit type is only the storage format.
  for (auto& sample : samples) {
    sample = TFloat16(sampler(generator_));
  }
  return samples;
}

// Gaussian distribution for Integer
template <typename TInt>
typename std::enable_if<
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/test/unittest_util/qdq_test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ inline void QuantizeBlockwise(
int columns,
int leading_dimension) {
static_assert(qbits == 4, "Only 4-bit quantization is supported");
static_assert(std::is_same<T, float>::value, "Only float type is supported");
static_assert(std::is_same<T, float>::value || std::is_same<T, Ort::Float16_t>::value, "Only float type is supported");

if (!columnwise) {
throw std::runtime_error("Only column-wise quantization is supported in test utilities");
Expand Down Expand Up @@ -223,7 +223,7 @@ inline void DequantizeBlockwise(
int rows,
int columns) {
static_assert(qbits == 4, "Only 4-bit quantization is supported");
static_assert(std::is_same<T, float>::value, "Only float type is supported");
static_assert(std::is_same<T, float>::value || std::is_same<T, Ort::Float16_t>::value, "Only float type is supported");

if (!columnwise) {
throw std::runtime_error("Only column-wise dequantization is supported in test utilities");
Expand Down