From e5605ee2db4c4067f3b790ffa9b257d44e74f298 Mon Sep 17 00:00:00 2001
From: morelos
Date: Thu, 3 Jul 2025 11:17:34 -0700
Subject: [PATCH] [ET-VK][testing] Q/DQ/CQP op comprehensive delegate dynamic quantization testing

# Context

We need to ensure that most of the operators that were created work in tandem with each other for dynamic quantization.

# Changes

This creates two test cases to test the per_token and per_tensor pipeline to ensure that the whole full quantization workflow works as intended.

Differential Revision: [D77746139](https://our.internmc.facebook.com/intern/diff/D77746139/)

[ghstack-poisoned]
---
Review notes (free-form text after the `---` separator; not part of the
commit message and not part of the diff):

    * This patch only appends two test methods to
      backends/vulkan/test/test_vulkan_delegate.py; no existing line is
      modified.
    * test_vulkan_backend_full_quantization_workflow wraps, in one
      torch.nn.Module.forward, the chain
        choose_qparams.tensor -> quantize_per_tensor.tensor
        -> dequantize_per_tensor.tensor
      and runs it on a torch.rand float32 input of size (2, 3, 4).
    * test_vulkan_backend_full_per_token_quantization_workflow does the same
      with
        choose_qparams_per_token_asymmetric.default
        -> quantize_per_token.default -> dequantize_per_token.default
      on a torch.rand float32 input of size (6, 4); only this variant passes
      output_dtype=torch.float32 to the dequantize step.
    * Every quantize/dequantize call uses dtype=torch.int32 with
      quant_min=-2147483648 and quant_max=2147483647; the per-tensor
      choose_qparams call additionally passes eps=1e-5.
    * Both tests finish with self.lower_module_and_test_output(...,
      atol=5e-3, rtol=5e-3). That helper is defined outside this patch;
      presumably it lowers the module to the Vulkan delegate and compares
      against the eager result -- confirm against the test class.
    * NOTE(review): both nested modules define an __init__ that only calls
      super().__init__(); it adds nothing over the inherited constructor and
      could be dropped in a follow-up.
    * NOTE(review): the line structure of this file was restored from a
      whitespace-collapsed copy. The added lines follow from Python's block
      structure, but the indentation of the three context lines at the top
      of the hunk is an assumption -- verify against base blob 04adf183e55
      before applying.

 backends/vulkan/test/test_vulkan_delegate.py | 84 ++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 04adf183e55..efd6fdb81fe 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1964,3 +1964,87 @@ def forward(self, x):
                 GroupNormModule(num_groups, num_channels),
                 sample_inputs,
             )
+
+    def test_vulkan_backend_full_quantization_workflow(self):
+        class FullQuantizationWorkflowModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                # Step 1: Choose quantization parameters per tensor
+                scale, zero_point = torch.ops.quantized_decomposed.choose_qparams.tensor(
+                    x,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,  # int32 max
+                    eps=1e-5,
+                    dtype=torch.int32,
+                )
+
+                # Step 2: Quantize using the calculated parameters
+                quantized = torch.ops.quantized_decomposed.quantize_per_tensor.tensor(
+                    x,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,  # int32 max
+                    dtype=torch.int32,
+                )
+
+                # Step 3: Dequantize back to float
+                dequantized = torch.ops.quantized_decomposed.dequantize_per_tensor.tensor(
+                    quantized,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,  # int32 max
+                    dtype=torch.int32,
+                )
+
+                return dequantized
+
+        full_workflow_module = FullQuantizationWorkflowModule()
+        sample_inputs = (torch.rand(size=(2, 3, 4), dtype=torch.float32),)
+
+        # Use higher tolerance since quantization introduces some error
+        self.lower_module_and_test_output(full_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3)
+
+    def test_vulkan_backend_full_per_token_quantization_workflow(self):
+        class FullPerTokenQuantizationWorkflowModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                # Step 1: Choose quantization parameters per token
+                scale, zero_point = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default(
+                    x,
+                    dtype=torch.int32,
+                )
+
+                # Step 2: Quantize using the calculated parameters per token
+                quantized = torch.ops.quantized_decomposed.quantize_per_token.default(
+                    x,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,  # int32 max
+                    dtype=torch.int32,
+                )
+
+                # Step 3: Dequantize back to float per token
+                dequantized = torch.ops.quantized_decomposed.dequantize_per_token.default(
+                    quantized,
+                    scale,
+                    zero_point,
+                    quant_min=-2147483648,  # int32 min
+                    quant_max=2147483647,  # int32 max
+                    dtype=torch.int32,
+                    output_dtype=torch.float32,
+                )
+
+                return dequantized
+
+        full_per_token_workflow_module = FullPerTokenQuantizationWorkflowModule()
+        sample_inputs = (torch.rand(size=(6, 4), dtype=torch.float32),)
+
+        # Use higher tolerance since quantization introduces some error
+        self.lower_module_and_test_output(full_per_token_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3)