From 92d9b145c6077c031b09c21568476becc96ab870 Mon Sep 17 00:00:00 2001
From: morelos <morelos@devvm4573.ash0.facebook.com>
Date: Fri, 13 Jun 2025 15:49:21 -0700
Subject: [PATCH] [ET] enabling half dtype input for quantization

Pull Request resolved: https://github.com/pytorch/executorch/pull/11479

# Context
Currently the CPU implementation of the quantization operators (`quantize_per_token`, `quantize_per_tensor`, and `quantize_per_channel`) does not support half (fp16) input scalar types. To align with the PyTorch implementation, which accepts fp16 and bf16 inputs, this diff enables half input dtype support for the quantization operators. We will be comparing this implementation against the Vulkan operators.

# Changes
As defined in the ExecuTorch [scalar_type_util.h](https://github.com/pytorch/executorch/blob/053686242c1687f0d51b3bb8befd14b047d7b025/runtime/core/exec_aten/util/scalar_type_util.h#L190) file, support can be enabled simply by switching the type-dispatch macro from `ET_FORALL_FLOAT_TYPES` to `ET_FORALL_FLOATH_TYPES`. With this change the supported input types are Half (fp16), Float (fp32), and Double (fp64).

I have also added more comprehensive tests for the input dtypes, including a double-input test, which did not exist before. Instead of only confirming that all output dtypes are supported, we now also check that all input dtypes are supported.
ghstack-source-id: 290376481
@exported-using-ghexport

Differential Revision: [D76053764](https://our.internmc.facebook.com/intern/diff/D76053764/)
---
 kernels/quantized/cpu/op_quantize.cpp       |  4 +-
 kernels/quantized/test/op_quantize_test.cpp | 65 +++++++++++++++++++++
 2 files changed, 67 insertions(+), 2 deletions(-)
diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp
index 4665c3d665b..d0b7c882f8e 100644
--- a/kernels/quantized/cpu/op_quantize.cpp
+++ b/kernels/quantized/cpu/op_quantize.cpp
@@ -150,7 +150,7 @@ Tensor& quantize_per_tensor_out(
     break;
 
   switch (input.scalar_type()) {
-    ET_FORALL_FLOAT_TYPES(CALCULATE_FLOAT_TYPE);
+    ET_FORALL_FLOATH_TYPES(CALCULATE_FLOAT_TYPE);
     default:
       ET_CHECK_MSG(
           false,
@@ -346,7 +346,7 @@ Tensor& quantize_per_channel_out(
     break;
 
   switch (input.scalar_type()) {
-    ET_FORALL_FLOAT_TYPES(CALCULATE_FLOAT_TYPE);
+    ET_FORALL_FLOATH_TYPES(CALCULATE_FLOAT_TYPE);
     default:
       ET_CHECK_MSG(
           false,
diff --git a/kernels/quantized/test/op_quantize_test.cpp b/kernels/quantized/test/op_quantize_test.cpp
index 704d8d06c5c..5cd17223d80 100644
--- a/kernels/quantized/test/op_quantize_test.cpp
+++ b/kernels/quantized/test/op_quantize_test.cpp
@@ -49,6 +49,32 @@ void test_dtype() {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
+template <ScalarType INPUT_DTYPE> // element type of the tensor being quantized
+void test_input_dtype() {
+  TensorFactory<INPUT_DTYPE> tf_input;
+
+  Tensor input = tf_input.full({3, 5}, 4); // 3x5 input built from the value 4
+  double scale = 0.5;
+  int64_t zero_point = 108;
+  int64_t quant_min = 0;
+  int64_t quant_max = 127;
+
+  TensorFactory<ScalarType::Char> tfo;
+  Tensor out = tfo.zeros({3, 5});
+  // Expected per element: 4 / 0.5 + 108 = 116, inside [quant_min, quant_max].
+  Tensor expected = tfo.full({3, 5}, 116);
+  quantize_per_tensor_out(
+      input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST(OpQuantizeOutTest, AllInputDtypesSupported) {
+  test_input_dtype<ScalarType::Float>(); // fp32
+  test_input_dtype<ScalarType::Half>(); // fp16
+  test_input_dtype<ScalarType::Double>(); // fp64
+}
+
 TEST(OpQuantizeOutTest, AllDtypesSupported) {
   test_dtype<ScalarType::Byte>();
   test_dtype<ScalarType::Char>();
@@ -58,6 +84,45 @@ TEST(OpQuantizeOutTest, AllDtypesSupported) {
   test_dtype<ScalarType::Int>();
 }
 
+TEST(OpQuantizeOutTest, DoubleInputTest) {
+  TensorFactory<ScalarType::Double> tf_double;
+
+  // Input whose scaled value is fractional, so the integer result is inexact.
+  Tensor input = tf_double.full({2, 3}, 3.14159265359);
+  double scale = 0.01;
+  int64_t zero_point = -100;
+  int64_t quant_min = 0;
+  int64_t quant_max = 255;
+
+  TensorFactory<ScalarType::Byte> tfo;
+  Tensor out = tfo.zeros({2, 3});
+  // 3.14159265359 / 0.01 - 100 = 214.159265359, expected to be stored as 214.
+  Tensor expected = tfo.full({2, 3}, 214);
+  quantize_per_tensor_out(
+      input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST(OpQuantizeOutTest, HalfInputTest) {
+  TensorFactory<ScalarType::Half> tf_half;
+
+  Tensor input = tf_half.full({2, 3}, 2.5); // Half (fp16) input tensor
+  double scale = 0.5;
+  int64_t zero_point = 10;
+  int64_t quant_min = -128;
+  int64_t quant_max = 127;
+
+  TensorFactory<ScalarType::Char> tfo;
+  Tensor out = tfo.zeros({2, 3});
+  // Expected per element: 2.5 / 0.5 + 10 = 15, inside [quant_min, quant_max].
+  Tensor expected = tfo.full({2, 3}, 15);
+  quantize_per_tensor_out(
+      input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
 TEST(OpQuantizeOutTest, TensorArgOverload) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Double> tf_double;