From 878419c2b9f583843213f82ead33d0eb43ea0d9e Mon Sep 17 00:00:00 2001
From: morelos <morelos@devvm4573.ash0.facebook.com>
Date: Fri, 13 Jun 2025 15:49:27 -0700
Subject: [PATCH] [ET-VK][Ops] quantize_per_token.default test setup

Pull Request resolved: https://github.com/pytorch/executorch/pull/11367

# Context
To deepen my own understanding of these operators, I needed to create a reference implementation and to build out the Vulkan testing framework, which provides the setup needed to call the Vulkan implementations. I won't explain what the quantize operator actually is in this diff; I will explain it in a future diff where I implement the GLSL shader. The reference implementation is heavily inspired by the CPU implementation and aims to perform similar checks when calculating the tokens and performing the quantization with the given scales and zero points.

# Changes
The main changes are the addition of the reference implementation, which I use for my own learning, and the wrapper functions that will be called once the Vulkan implementation is complete. The wrappers have everything needed for this purpose, including calling the operator by the name under which it is defined in the C++ implementation header, and staging data correctly to the GPU and then back to the CPU, where the comparison is done. I have also included comprehensive failure print statements that print the tensor size along with relevant parameters such as the zero points and scales passed in, and the min and max for quantization.
ghstack-source-id: 290376493
@exported-using-ghexport

Differential Revision: [D75607854](https://our.internmc.facebook.com/intern/diff/D75607854/)
---
 .../vulkan/test/op_tests/quantize_test.cpp    | 384 ++++++++++++++++++
 1 file changed, 384 insertions(+)
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
index 6ff61dac19b..0ac08b65972 100644
--- a/backends/vulkan/test/op_tests/quantize_test.cpp
+++ b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -156,3 +156,387 @@ void check_quantize_args(
       " actual quant_max: ",
       quant_max);
 }
+/*
+ * Reference implementation of quantize_per_token.
+ *
+ * Views `input` as [num_tokens, last_dim] and stores, for every element of
+ * token t, clamp(zero_point[t] + nearbyint(value * (1 / scale[t])), quant_min,
+ * quant_max) converted to `dtype`. Fails a TORCH_CHECK on malformed arguments.
+ */
+at::Tensor quantize_per_token_reference_impl(
+    const at::Tensor& input,
+    const at::Tensor& scale,
+    const at::Tensor& zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype) {
+  TORCH_CHECK(input.dim() >= 1, "input must have at least one dimension");
+  // Every dimension except the last contributes to the token count.
+  const int64_t last_dim = input.size(-1);
+  int64_t num_tokens = 1;
+  for (int64_t d = 0; d < input.dim() - 1; ++d) {
+    num_tokens *= input.size(d);
+  }
+  TORCH_CHECK(num_tokens == scale.numel(), "scale numel mismatch");
+  TORCH_CHECK(num_tokens == zero_point.numel(), "zero_point numel mismatch");
+
+  // Allocate contiguously so the 2-D view below aliases `out`; with a strided
+  // layout reshape() may return a copy and the writes would be lost.
+  at::Tensor out = at::empty_like(input, dtype, at::MemoryFormat::Contiguous);
+  at::Tensor reshaped_input = input.reshape({num_tokens, last_dim});
+  at::Tensor reshaped_out = out.view({num_tokens, last_dim});
+
+  for (int64_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    // Use float for scale since Vulkan doesn't support double
+    const float token_scale = scale[token_idx].item<float>();
+    // Use int for zero_point since Vulkan doesn't support int64_t
+    const int token_zero_point = zero_point[token_idx].item<int>();
+    const float inv_scale = 1.0 / token_scale;
+
+    for (int64_t i = 0; i < last_dim; ++i) {
+      const float value = reshaped_input[token_idx][i].item<float>();
+      int qvalue = token_zero_point + std::nearbyint(inv_scale * value);
+      qvalue = std::max<int64_t>(qvalue, quant_min);
+      qvalue = std::min<int64_t>(qvalue, quant_max);
+
+      if (dtype == at::kByte) {
+        reshaped_out[token_idx][i] = static_cast<uint8_t>(qvalue);
+      } else if (dtype == at::kChar) {
+        reshaped_out[token_idx][i] = static_cast<int8_t>(qvalue);
+      } else if (dtype == at::kShort) {
+        reshaped_out[token_idx][i] = static_cast<int16_t>(qvalue);
+      } else if (dtype == at::kInt) {
+        reshaped_out[token_idx][i] = static_cast<int32_t>(qvalue);
+      } else if (dtype == at::kLong) {
+        reshaped_out[token_idx][i] = static_cast<int64_t>(qvalue);
+      } else {
+        TORCH_CHECK(false, "unsupported output dtype ", dtype);
+      }
+    }
+  }
+
+  return out;
+}
+
+// Forward declaration: the wrapper below calls the implementation, which is
+// defined later in this file.
+void test_vulkan_quantize_per_token_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype,
+    at::ScalarType dtype,
+    const vkcompute::utils::StorageType in_storage,
+    const vkcompute::utils::StorageType out_storage);
+
+// Runs the Vulkan quantize_per_token check twice with identical arguments:
+// first with buffer storage for both input and output, then with 3D texture
+// storage for both.
+void test_vulkan_quantize_per_token(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  // Storage types to exercise, in execution order.
+  const vkcompute::utils::StorageType storage_types[] = {
+      vkcompute::utils::kBuffer,
+      vkcompute::utils::kTexture3D,
+  };
+
+  for (const vkcompute::utils::StorageType storage : storage_types) {
+    // The same storage type is used on both sides of the operator.
+    test_vulkan_quantize_per_token_impl(
+        input_sizes,
+        scales,
+        zero_points,
+        quant_min,
+        quant_max,
+        in_dtype,
+        dtype,
+        storage,
+        storage);
+  }
+}
+
+// Checks quantize_per_token_reference_impl against the CPU operator
+// quantize_per_token_aten on a ramp-valued input of shape `input_sizes`.
+// `scales` and `zero_points` must hold one entry per token (product of all
+// dims but the last). On mismatch the parameters and tensors are printed
+// before the final gtest assertion fails.
+void test_reference_quantize_per_token(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+
+  // Fill with an evenly spaced ramp from 0 to 1; with fewer than two elements
+  // the step is 0 rather than the result of a division by zero.
+  const int64_t numel = input.numel();
+  const float step = numel > 1 ? static_cast<float>(1.0 / (numel - 1)) : 0.0f;
+  auto flat_input = input.flatten();
+  for (int64_t i = 0; i < numel; i++) {
+    flat_input[i] = i * step;
+  }
+
+  // Reshape back to original dimensions
+  input = flat_input.reshape(input_sizes_int64);
+
+  // Calculate number of tokens
+  int64_t num_tokens = 1;
+  for (int64_t i = 0; i < input.dim() - 1; i++) {
+    num_tokens *= input.size(i);
+  }
+
+  // Verify that the number of tokens matches the size of scales and zero_points
+  ASSERT_EQ(num_tokens, static_cast<int64_t>(scales.size()));
+  ASSERT_EQ(num_tokens, static_cast<int64_t>(zero_points.size()));
+
+  // Create scale and zero_point tensors
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output
+  at::Tensor reference_out = quantize_per_token_reference_impl(
+      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);
+
+  // Get implementation output
+  at::Tensor impl_out = torch::executor::native::quantize_per_token_aten(
+      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);
+
+  // Convert both to int so comparison and display use a single dtype
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor impl_int = impl_out.to(at::kInt);
+
+  const bool output_correct = at::equal(reference_int, impl_int);
+  if (!output_correct) {
+    std::cout << "\nFailed with parameters: " << std::endl;
+    std::cout << "  scale(s):";
+    for (const float s : scales) {
+      std::cout << " " << s << " ";
+    }
+    std::cout << std::endl << "  zero_point(s):";
+    for (const int zp : zero_points) {
+      std::cout << " " << zp << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+    std::cout << "input:" << std::endl << input << std::endl;
+    std::cout << "reference:" << std::endl << reference_int << std::endl;
+    std::cout << "my_reference:" << std::endl << impl_int << std::endl;
+  }
+  ASSERT_TRUE(output_correct);
+}
+
+// Builds a single-op Vulkan ComputeGraph for quantize_per_token.default, runs
+// it on random input of shape `input_sizes`, and compares the result with the
+// CPU operator (quantize_per_token_aten). `scales` / `zero_points` need one
+// entry per token (product of all dims but the last). `in_storage` is used for
+// the three input tensors and the GraphConfig storage-type override;
+// `out_storage` for the output tensor. Mismatches are printed before failing.
+void test_vulkan_quantize_per_token_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt,
+    const vkcompute::utils::StorageType in_storage =
+        vkcompute::utils::kTexture3D,
+    const vkcompute::utils::StorageType out_storage =
+        vkcompute::utils::kTexture3D) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  // `i + 1 < size()` keeps the bound from wrapping when input_sizes is empty.
+  int64_t num_tokens = 1;
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+    num_tokens *= input_sizes[i];
+  }
+
+  ASSERT_EQ(num_tokens, static_cast<int64_t>(scales.size()));
+  ASSERT_EQ(num_tokens, static_cast<int64_t>(zero_points.size()));
+
+  // Create input tensor with random values
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output to show what we would compare against
+  at::Tensor reference_out = torch::executor::native::quantize_per_token_aten(
+      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);
+
+  using namespace vkcompute;
+
+  GraphConfig config;
+  config.set_storage_type_override(in_storage);
+  ComputeGraph graph(config);
+
+  IOValueRef r_input = graph.add_input_tensor(
+      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);
+  IOValueRef r_scale = graph.add_input_tensor(
+      scale_tensor.sizes().vec(), vkapi::kFloat, in_storage);
+  IOValueRef r_zero_point = graph.add_input_tensor(
+      zero_point_tensor.sizes().vec(), vkapi::kInt, in_storage);
+
+  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
+  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);
+
+  const ValueRef r_out = graph.add_tensor(
+      input.sizes().vec(), from_at_scalartype(dtype), out_storage);
+
+  VK_GET_OP_FN("quantize_per_token.default")
+  (graph,
+   {
+       r_input.value,
+       r_scale.value,
+       r_zero_point.value,
+       r_quant_min,
+       r_quant_max,
+       r_out,
+   });
+
+  ValueRef staging_out = graph.set_output_tensor(r_out);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  // Copy input data to GPU
+  graph.copy_into_staging(
+      r_input.staging, input.const_data_ptr(), input.numel());
+
+  // Convert scale tensor to float and copy to GPU
+  at::Tensor scale_float = scale_tensor.to(at::kFloat);
+  graph.copy_into_staging(
+      r_scale.staging, scale_float.const_data_ptr(), scale_float.numel());
+
+  // Convert zero_point tensor to int and copy to GPU
+  at::Tensor zero_point_int = zero_point_tensor.to(at::kInt);
+  graph.copy_into_staging(
+      r_zero_point.staging,
+      zero_point_int.const_data_ptr(),
+      zero_point_int.numel());
+
+  // Execute the graph
+  graph.execute();
+
+  // Copy output data back to CPU
+  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
+  graph.copy_from_staging(
+      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());
+
+  // Compare outputs
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor vk_int = vk_out.to(at::kInt);
+
+  const bool output_correct = at::equal(reference_int, vk_int);
+  if (!output_correct) {
+    std::cout << "\nFailed with parameters: " << std::endl;
+    std::cout << "  scale(s):";
+    for (const float s : scales) {
+      std::cout << " " << s << " ";
+    }
+    std::cout << std::endl << "  zero_point(s):";
+    for (const int zp : zero_points) {
+      std::cout << " " << zp << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+    std::cout << "  storage type: "
+              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
+                                                          : "texture")
+              << std::endl;
+
+    std::cout << "input:" << std::endl << input << std::endl;
+    std::cout << "reference:" << std::endl << reference_int << std::endl;
+    std::cout << "vulkan:" << std::endl << vk_int << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+// Reference-vs-CPU check: float input quantized to int8 over [-128, 127].
+// NOTE(review): the second scale is 0, so 1 / scale is infinite for that
+// token in the reference implementation -- confirm this is intentional.
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_float_to_int8) {
+  test_reference_quantize_per_token(
+      /*input_sizes=*/{2, 3, 4}, // 2*3=6 tokens
+      /*scales=*/{0.1, 0, 0.3, 0.1, 0.2, 0.3},
+      /*zero_points=*/{1, 2, 3, 0, -1, -2},
+      /*quant_min=*/-128,
+      /*quant_max=*/127,
+      /*in_dtype=*/at::kFloat,
+      /*dtype=*/at::kChar);
+}
+
+// Reference-vs-CPU check: float input quantized to int32, with the clamp
+// bounds set to the full int32 range. Six tokens of four elements each, one
+// scale / zero_point pair per token.
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_float_to_int32) {
+  test_reference_quantize_per_token(
+      /*input_sizes=*/{2, 3, 4}, // 2*3=6 tokens
+      /*scales=*/{0.1, 0, 0.3, 0.1, 0.2, 0.3},
+      /*zero_points=*/{1, 2, 3, 0, -1, -2},
+      /*quant_min=*/std::numeric_limits<int32_t>::min(),
+      /*quant_max=*/std::numeric_limits<int32_t>::max(),
+      /*in_dtype=*/at::kFloat,
+      /*dtype=*/at::kInt);
+}
+
+// Reference-vs-CPU check: half-precision input quantized to int32, with the
+// clamp bounds set to the full int32 range. Six tokens of four elements each,
+// one scale / zero_point pair per token.
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_half_to_int32) {
+  test_reference_quantize_per_token(
+      /*input_sizes=*/{2, 3, 4}, // 2*3=6 tokens
+      /*scales=*/{0.1, 0, 0.3, 0.1, 0.2, 0.3},
+      /*zero_points=*/{1, 2, 3, 0, -1, -2},
+      /*quant_min=*/std::numeric_limits<int32_t>::min(),
+      /*quant_max=*/std::numeric_limits<int32_t>::max(),
+      /*in_dtype=*/at::kHalf,
+      /*dtype=*/at::kInt);
+}
+
+// Reference-vs-CPU check: half-precision input quantized to uint8 over
+// [0, 255]. Six tokens of four elements each, one scale / zero_point pair per
+// token.
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_half_to_uint8) {
+  test_reference_quantize_per_token(
+      /*input_sizes=*/{2, 3, 4}, // 2*3=6 tokens
+      /*scales=*/{0.1, 0, 0.3, 0.1, 0.2, 0.3},
+      /*zero_points=*/{1, 2, 3, 0, -1, -2},
+      /*quant_min=*/0,
+      /*quant_max=*/255,
+      /*in_dtype=*/at::kHalf,
+      /*dtype=*/at::kByte);
+}