From e5605ee2db4c4067f3b790ffa9b257d44e74f298 Mon Sep 17 00:00:00 2001
From: morelos <morelos@devvm4573.ash0.facebook.com>
Date: Thu, 3 Jul 2025 11:17:34 -0700
Subject: [PATCH] [ET-VK][testing] Q/DQ/CQP op comprehensive delegate dynamic
 quantization testing

# Context

We need to ensure that most of the Q/DQ/CQP (quantize, dequantize, choose_qparams) operators that were created work in tandem with each other for dynamic quantization.

# Changes

This adds two test cases, one for the per_tensor pipeline and one for the per_token pipeline, to ensure that the full quantization workflow (choose qparams, quantize, dequantize) works as intended.

Differential Revision: [D77746139](https://our.internmc.facebook.com/intern/diff/D77746139/)

[ghstack-poisoned]
---
 backends/vulkan/test/test_vulkan_delegate.py | 84 ++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 04adf183e55..efd6fdb81fe 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1964,3 +1964,87 @@ def forward(self, x):
                     GroupNormModule(num_groups, num_channels),
                     sample_inputs,
                 )
+
+    def test_vulkan_backend_full_quantization_workflow(self):
+        class FullQuantizationWorkflowModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                # Step 1: Choose per-tensor scale/zero_point for x over the int32 range
+                scale, zero_point = torch.ops.quantized_decomposed.choose_qparams.tensor(
+                    x,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,   # int32 max
+                    eps=1e-5,
+                    dtype=torch.int32,
+                )
+
+                # Step 2: Quantize x to int32 with the scale/zero_point from step 1
+                quantized = torch.ops.quantized_decomposed.quantize_per_tensor.tensor(
+                    x,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,   # int32 max
+                    dtype=torch.int32,
+                )
+
+                # Step 3: Dequantize back to float with the same scale/zero_point
+                dequantized = torch.ops.quantized_decomposed.dequantize_per_tensor.tensor(
+                    quantized,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,   # int32 max
+                    dtype=torch.int32,
+                )
+
+                return dequantized
+
+        full_workflow_module = FullQuantizationWorkflowModule()
+        sample_inputs = (torch.rand(size=(2, 3, 4), dtype=torch.float32),)
+
+        # Use a loose tolerance (5e-3): the quantize -> dequantize round trip introduces some error
+        self.lower_module_and_test_output(full_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3)
+
+    def test_vulkan_backend_full_per_token_quantization_workflow(self):
+        class FullPerTokenQuantizationWorkflowModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                # Step 1: Choose per-token asymmetric qparams (no quant range is passed to this op)
+                scale, zero_point = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default(
+                    x,
+                    dtype=torch.int32,
+                )
+
+                # Step 2: Quantize x per token to int32 with the scale/zero_point from step 1
+                quantized = torch.ops.quantized_decomposed.quantize_per_token.default(
+                    x,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,   # int32 max
+                    dtype=torch.int32,
+                )
+
+                # Step 3: Dequantize per token back to float32 with the same scale/zero_point
+                dequantized = torch.ops.quantized_decomposed.dequantize_per_token.default(
+                    quantized,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,   # int32 max
+                    dtype=torch.int32,
+                    output_dtype=torch.float32,
+                )
+
+                return dequantized
+
+        full_per_token_workflow_module = FullPerTokenQuantizationWorkflowModule()
+        sample_inputs = (torch.rand(size=(6, 4), dtype=torch.float32),)
+
+        # Use a loose tolerance (5e-3): the quantize -> dequantize round trip introduces some error
+        self.lower_module_and_test_output(full_per_token_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3)