From 086646d4bf917f38069bf8d14e4f6e93e8350352 Mon Sep 17 00:00:00 2001
From: morelos <morelos@devvm4573.ash0.facebook.com>
Date: Fri, 13 Jun 2025 15:49:31 -0700
Subject: [PATCH] [ET-VK][Ops] dequantize_per_token.default test setup

Pull Request resolved: https://github.com/pytorch/executorch/pull/11482

# Context
To deepen my own understanding of these operators, I needed to create a reference implementation and to build out the Vulkan testing framework, which provides the setup required to call the Vulkan implementations. I won't explain what the dequantize operator actually is in this diff; I will do that in a future diff where I implement the GLSL shader. The reference implementation is heavily inspired by the CPU implementation and aims to perform similar checks on the zero points and scales when dequantizing with the given parameters.

# Changes
The main changes are the addition of the reference implementation, which I use for my own learning, and the wrapper functions that will be called once the Vulkan implementation is complete. They include everything needed for this purpose: calling the operator by the name under which it is defined in the C++ implementation header, and staging data to the GPU and back to the CPU, where the comparison is done. I have also included comprehensive failure print statements that print the tensor contents along with relevant parameters such as the zero points and scales passed in.
ghstack-source-id: 290376484
@exported-using-ghexport

Differential Revision: [D76267037](https://our.internmc.facebook.com/intern/diff/D76267037/)
---
 .../vulkan/test/op_tests/dequantize_test.cpp  | 494 ++++++++++++++++++
 1 file changed, 494 insertions(+)
diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp
index 2d12197fb5b..7b155c8f98b 100644
--- a/backends/vulkan/test/op_tests/dequantize_test.cpp
+++ b/backends/vulkan/test/op_tests/dequantize_test.cpp
@@ -243,6 +243,84 @@ at::Tensor dequantize_per_tensor_reference_impl(
   return out.reshape(input.sizes());
 }
 
+/*
+ * Reference implementation of dequantize_per_token: all dims but the last are
+ * flattened into tokens, and token t is dequantized as
+ * (input - zero_point[t]) * scale[t]. quant_min / quant_max are not read.
+ *
+ * Throws std::runtime_error if scale / zero_point do not hold one entry per
+ * token, or if dtype / out_dtype is not handled below.
+ */
+at::Tensor dequantize_per_token_reference_impl(
+    const at::Tensor& input,
+    const at::Tensor& scale,
+    const at::Tensor& zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype,
+    at::ScalarType out_dtype) {
+  // Number of tokens = product of every dim except the last
+  int64_t num_tokens = 1;
+  for (int64_t d = 0; d < input.dim() - 1; d++) {
+    num_tokens *= input.size(d);
+  }
+  const int64_t token_len = input.size(-1);
+
+  // Thrown instead of assert()-ed so the check survives NDEBUG builds
+  if (scale.numel() != num_tokens || zero_point.numel() != num_tokens) {
+    throw std::runtime_error("scale/zero_point size != num_tokens");
+  }
+
+  // Reshape input and output to [num_tokens, last_dim]
+  at::Tensor out = at::empty_like(input, out_dtype);
+  at::Tensor reshaped_input = input.reshape({num_tokens, token_len});
+  at::Tensor reshaped_out = out.reshape({num_tokens, token_len});
+
+  for (int64_t token_idx = 0; token_idx < num_tokens; token_idx++) {
+    const float token_scale = scale[token_idx].item<float>();
+    const int32_t token_zero_point_int32 =
+        static_cast<int32_t>(zero_point[token_idx].item<int64_t>());
+
+    for (int64_t i = 0; i < token_len; i++) {
+      double dequantized_value = 0.0;
+
+      // Following the CPU implementation pattern: (input - zero_point) * scale
+      if (dtype == at::kByte) {
+        uint8_t qvalue = reshaped_input[token_idx][i].item<uint8_t>();
+        dequantized_value = (qvalue - token_zero_point_int32) * token_scale;
+      } else if (dtype == at::kChar) {
+        int8_t qvalue = reshaped_input[token_idx][i].item<int8_t>();
+        dequantized_value = (qvalue - token_zero_point_int32) * token_scale;
+      } else if (dtype == at::kShort) {
+        int16_t qvalue = reshaped_input[token_idx][i].item<int16_t>();
+        dequantized_value = (qvalue - token_zero_point_int32) * token_scale;
+      } else if (dtype == at::kInt) {
+        int32_t qvalue = reshaped_input[token_idx][i].item<int32_t>();
+        dequantized_value = (qvalue - token_zero_point_int32) * token_scale;
+      } else if (dtype == at::kLong) {
+        int64_t qvalue = reshaped_input[token_idx][i].item<int64_t>();
+        dequantized_value = (qvalue - token_zero_point_int32) * token_scale;
+      } else {
+        throw std::runtime_error("Unsupported input dtype");
+      }
+
+      // Store result based on output dtype
+      if (out_dtype == at::kFloat) {
+        reshaped_out[token_idx][i] = static_cast<float>(dequantized_value);
+      } else if (out_dtype == at::kDouble) {
+        reshaped_out[token_idx][i] = dequantized_value;
+      } else if (out_dtype == at::kHalf) {
+        reshaped_out[token_idx][i] = static_cast<c10::Half>(dequantized_value);
+      } else {
+        throw std::runtime_error("Unsupported output dtype");
+      }
+    }
+  }
+
+  // reshape() may return a copy, so hand back what was actually written
+  return reshaped_out.reshape(input.sizes());
+}
+
 // Forward declaration of implementation functions
 void test_vulkan_dequantize_per_tensor_impl(
     const std::vector<int>& input_sizes,
@@ -255,6 +333,17 @@ void test_vulkan_dequantize_per_tensor_impl(
     const vkcompute::utils::StorageType in_storage,
     const vkcompute::utils::StorageType out_storage);
 
+void test_vulkan_dequantize_per_token_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales, // one scale per token
+    const std::vector<int>& zero_points, // one zero point per token
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype, // dtype of the quantized input
+    at::ScalarType out_dtype, // dtype of the dequantized output
+    const vkcompute::utils::StorageType in_storage,
+    const vkcompute::utils::StorageType out_storage);
+
 // Wrapper function to test both buffer and texture storage types
 void test_vulkan_dequantize_per_tensor(
     const std::vector<int>& input_sizes,
@@ -289,6 +378,40 @@ void test_vulkan_dequantize_per_tensor(
       vkcompute::utils::kTexture3D);
 }
 
+/*
+ * Wrapper that runs the Vulkan dequantize_per_token check once per storage
+ * type: first with buffer storage, then with 3D texture storage. Each run
+ * uses the same storage type for both the input and the output tensor, and
+ * all other arguments are forwarded unchanged.
+ */
+void test_vulkan_dequantize_per_token(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype,
+    at::ScalarType out_dtype) {
+  // Storage types under test, in the order they are run
+  const vkcompute::utils::StorageType storage_types[] = {
+      vkcompute::utils::kBuffer,
+      vkcompute::utils::kTexture3D,
+  };
+
+  for (const vkcompute::utils::StorageType storage : storage_types) {
+    test_vulkan_dequantize_per_token_impl(
+        input_sizes,
+        scales,
+        zero_points,
+        quant_min,
+        quant_max,
+        dtype,
+        out_dtype,
+        storage, // in_storage
+        storage); // out_storage
+  }
+}
+
 void test_reference_dequantize_per_tensor(
     const std::vector<int>& input_sizes,
     float scale,
@@ -565,3 +688,374 @@ TEST(
       at::kInt, // input dtype
       at::kHalf); // output dtype
 }
+
+/*
+ * Compares dequantize_per_token_reference_impl against the ATen-facing
+ * dequantize_per_token_aten on a generated input whose values are spread
+ * over [quant_min, quant_max]. scales and zero_points need one entry per
+ * token, where every dim of input_sizes except the last is a token dim.
+ */
+void test_reference_dequantize_per_token(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype,
+    at::ScalarType out_dtype) {
+  check_dequantize_args(quant_min, quant_max, dtype, out_dtype);
+  // Guard first: with an empty shape `size() - 1` wraps around to SIZE_MAX
+  ASSERT_FALSE(input_sizes.empty());
+  int64_t num_tokens = 1;
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+    num_tokens *= input_sizes[i];
+  }
+
+  ASSERT_EQ(static_cast<size_t>(num_tokens), scales.size());
+  ASSERT_EQ(static_cast<size_t>(num_tokens), zero_points.size());
+
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(dtype));
+  const int64_t last_dim = input.size(-1);
+
+  // Fill with a simple pattern: values from quant_min to quant_max in steps.
+  // A float step overshoots quant_max on 32-bit ranges, hence double + clamp.
+  double step = 1.0;
+  if (last_dim > 1) {
+    step = static_cast<double>(quant_max - quant_min) / (last_dim - 1);
+  }
+  at::Tensor reshaped_input = input.reshape({num_tokens, last_dim});
+  for (int64_t token_idx = 0; token_idx < num_tokens; token_idx++) {
+    for (int64_t i = 0; i < last_dim; i++) {
+      int64_t qvalue = static_cast<int64_t>(quant_min + i * step);
+      if (qvalue > quant_max) {
+        qvalue = quant_max; // rounding must not leave the quantized range
+      }
+      if (dtype == at::kByte) {
+        reshaped_input[token_idx][i] = static_cast<uint8_t>(qvalue);
+      } else if (dtype == at::kChar) {
+        reshaped_input[token_idx][i] = static_cast<int8_t>(qvalue);
+      } else if (dtype == at::kShort) {
+        reshaped_input[token_idx][i] = static_cast<int16_t>(qvalue);
+      } else if (dtype == at::kInt) {
+        reshaped_input[token_idx][i] = static_cast<int32_t>(qvalue);
+      } else if (dtype == at::kLong) {
+        reshaped_input[token_idx][i] = static_cast<int64_t>(qvalue);
+      }
+    }
+  }
+
+  // Reshape back to original dimensions
+  input = reshaped_input.reshape(input_sizes_int64);
+
+  // Create scale and zero_point tensors
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output
+  at::Tensor reference_out = dequantize_per_token_reference_impl(
+      input,
+      scale_tensor,
+      zero_point_tensor,
+      quant_min,
+      quant_max,
+      dtype,
+      out_dtype);
+
+  // Get implementation output
+  at::Tensor impl_out = torch::executor::native::dequantize_per_token_aten(
+      input,
+      scale_tensor,
+      zero_point_tensor,
+      quant_min,
+      quant_max,
+      dtype,
+      out_dtype);
+
+  // Compare outputs
+  const bool output_correct = at::allclose(reference_out, impl_out);
+  if (!output_correct) {
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << "  scale(s):";
+    for (size_t i = 0; i < scales.size(); i++) {
+      std::cout << " " << scales[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  zero_point(s):";
+    for (size_t i = 0; i < zero_points.size(); i++) {
+      std::cout << " " << zero_points[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_out << std::endl;
+    std::cout << "implementation:" << std::endl;
+    std::cout << impl_out << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+/*
+ * Runs dequantize_per_token.default through a Vulkan ComputeGraph with the
+ * given storage types and compares the result with dequantize_per_token_aten
+ * on a generated input whose values are spread over [quant_min, quant_max].
+ * scales and zero_points need one entry per token (all dims but the last).
+ */
+void test_vulkan_dequantize_per_token_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype,
+    at::ScalarType out_dtype,
+    const vkcompute::utils::StorageType in_storage,
+    const vkcompute::utils::StorageType out_storage) {
+  check_dequantize_args(quant_min, quant_max, dtype, out_dtype);
+  // Guard first: with an empty shape `size() - 1` wraps around to SIZE_MAX
+  ASSERT_FALSE(input_sizes.empty());
+  int64_t num_tokens = 1;
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+    num_tokens *= input_sizes[i];
+  }
+
+  ASSERT_EQ(static_cast<size_t>(num_tokens), scales.size());
+  ASSERT_EQ(static_cast<size_t>(num_tokens), zero_points.size());
+
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(dtype));
+  const int64_t last_dim = input.size(-1);
+
+  // Fill with a simple pattern: values from quant_min to quant_max in steps.
+  // A float step overshoots quant_max on 32-bit ranges, hence double + clamp.
+  double step = 1.0;
+  if (last_dim > 1) {
+    step = static_cast<double>(quant_max - quant_min) / (last_dim - 1);
+  }
+  at::Tensor reshaped_input = input.reshape({num_tokens, last_dim});
+  for (int64_t token_idx = 0; token_idx < num_tokens; token_idx++) {
+    for (int64_t i = 0; i < last_dim; i++) {
+      int64_t qvalue = static_cast<int64_t>(quant_min + i * step);
+      if (qvalue > quant_max) {
+        qvalue = quant_max; // rounding must not leave the quantized range
+      }
+      if (dtype == at::kByte) {
+        reshaped_input[token_idx][i] = static_cast<uint8_t>(qvalue);
+      } else if (dtype == at::kChar) {
+        reshaped_input[token_idx][i] = static_cast<int8_t>(qvalue);
+      } else if (dtype == at::kShort) {
+        reshaped_input[token_idx][i] = static_cast<int16_t>(qvalue);
+      } else if (dtype == at::kInt) {
+        reshaped_input[token_idx][i] = static_cast<int32_t>(qvalue);
+      } else if (dtype == at::kLong) {
+        reshaped_input[token_idx][i] = static_cast<int64_t>(qvalue);
+      }
+    }
+  }
+
+  // Reshape back to original dimensions
+  input = reshaped_input.reshape(input_sizes_int64);
+
+  // Create scale and zero_point tensors
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output
+  at::Tensor reference_out = torch::executor::native::dequantize_per_token_aten(
+      input,
+      scale_tensor,
+      zero_point_tensor,
+      quant_min,
+      quant_max,
+      dtype,
+      out_dtype);
+
+  // Build Vulkan dequantize_per_token graph
+  using namespace vkcompute;
+
+  GraphConfig config;
+  config.set_storage_type_override(in_storage);
+  ComputeGraph graph(config);
+
+  IOValueRef r_input = graph.add_input_tensor(
+      input.sizes().vec(), from_at_scalartype(dtype), in_storage);
+  IOValueRef r_scale = graph.add_input_tensor(
+      scale_tensor.sizes().vec(), vkapi::kFloat, in_storage);
+  IOValueRef r_zero_point = graph.add_input_tensor(
+      zero_point_tensor.sizes().vec(), vkapi::kInt, in_storage);
+
+  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
+  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);
+
+  const ValueRef r_out = graph.add_tensor(
+      input.sizes().vec(), from_at_scalartype(out_dtype), out_storage);
+
+  VK_GET_OP_FN("dequantize_per_token.default")
+  (graph,
+   {
+       r_input.value,
+       r_scale.value,
+       r_zero_point.value,
+       r_quant_min,
+       r_quant_max,
+       r_out,
+   });
+
+  ValueRef staging_out = graph.set_output_tensor(r_out);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  // Copy input data to GPU
+  graph.copy_into_staging(
+      r_input.staging, input.const_data_ptr(), input.numel());
+
+  // Convert scale tensor to float and copy to GPU
+  at::Tensor scale_float = scale_tensor.to(at::kFloat);
+  graph.copy_into_staging(
+      r_scale.staging, scale_float.const_data_ptr(), scale_float.numel());
+
+  // Convert zero_point tensor to int and copy to GPU
+  at::Tensor zero_point_int = zero_point_tensor.to(at::kInt);
+  graph.copy_into_staging(
+      r_zero_point.staging,
+      zero_point_int.const_data_ptr(),
+      zero_point_int.numel());
+
+  // Execute the graph
+  graph.execute();
+
+  // Copy output data back to CPU
+  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
+  graph.copy_from_staging(
+      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());
+
+  // Compare outputs
+  const bool output_correct = at::allclose(reference_out, vk_out);
+  if (!output_correct) {
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << "  scale(s):";
+    for (size_t i = 0; i < scales.size(); i++) {
+      std::cout << " " << scales[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  zero_point(s):";
+    for (size_t i = 0; i < zero_points.size(); i++) {
+      std::cout << " " << zero_points[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+    std::cout << "  storage type: "
+              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
+                                                          : "texture")
+              << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_out << std::endl;
+    std::cout << "vulkan:" << std::endl;
+    std::cout << vk_out << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+// Test cases for dequantize_per_token
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_reference_dequantize_per_token_uint8_to_float) {
+  const std::vector<int> shape{2, 3, 4}; // 2*3=6 tokens
+  const std::vector<float> per_token_scales{0.1, 0.2, 0.3, 0.4, 0.5, 0.6};
+  const std::vector<int> per_token_zero_points{5, 10, 15, 20, 25, 30};
+  test_reference_dequantize_per_token(
+      shape,
+      per_token_scales,
+      per_token_zero_points,
+      /*quant_min=*/0,
+      /*quant_max=*/255,
+      /*dtype=*/at::kByte,
+      /*out_dtype=*/at::kFloat);
+}
+
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_reference_dequantize_per_token_int8_to_float) {
+  const std::vector<int> shape{2, 2, 5}; // 2*2=4 tokens
+  const std::vector<float> per_token_scales{0.05, 0.1, 0.15, 0.2};
+  const std::vector<int> per_token_zero_points{0, -5, 5, 10};
+  test_reference_dequantize_per_token(
+      shape,
+      per_token_scales,
+      per_token_zero_points,
+      /*quant_min=*/-128,
+      /*quant_max=*/127,
+      /*dtype=*/at::kChar,
+      /*out_dtype=*/at::kFloat);
+}
+
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_reference_dequantize_per_token_int32_to_float) {
+  const std::vector<int> shape{2, 2, 10}; // 2*2=4 tokens
+  const std::vector<float> per_token_scales{0.05, 0.1, 0.15, 0.2};
+  const std::vector<int> per_token_zero_points{0, -5, 5, 10};
+  test_reference_dequantize_per_token(
+      shape,
+      per_token_scales,
+      per_token_zero_points,
+      /*quant_min=*/std::numeric_limits<int32_t>::min(),
+      /*quant_max=*/std::numeric_limits<int32_t>::max(),
+      /*dtype=*/at::kInt,
+      /*out_dtype=*/at::kFloat);
+}
+
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_reference_dequantize_per_token_int8_to_half) {
+  const std::vector<int> shape{4, 1, 5}; // 4*1=4 tokens
+  const std::vector<float> per_token_scales{0.05, 0.1, 0.15, 0.2};
+  const std::vector<int> per_token_zero_points{0, -5, 5, 10};
+  test_reference_dequantize_per_token(
+      shape,
+      per_token_scales,
+      per_token_zero_points,
+      /*quant_min=*/-128,
+      /*quant_max=*/127,
+      /*dtype=*/at::kChar, // int8
+      /*out_dtype=*/at::kHalf);
+}
+
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_reference_dequantize_per_token_int32_to_half) {
+  const std::vector<int> shape{2, 2}; // 2 tokens
+  const std::vector<float> per_token_scales{0.05, 0.1};
+  const std::vector<int> per_token_zero_points{0, -5};
+  test_reference_dequantize_per_token(
+      shape,
+      per_token_scales,
+      per_token_zero_points,
+      /*quant_min=*/std::numeric_limits<int32_t>::min(),
+      /*quant_max=*/std::numeric_limits<int32_t>::max(),
+      /*dtype=*/at::kInt,
+      /*out_dtype=*/at::kHalf);
+}