From 0e34e301c4770f9d56496b7383740261f7252a4d Mon Sep 17 00:00:00 2001 From: morelos Date: Tue, 17 Jun 2025 10:06:34 -0700 Subject: [PATCH] [ET-VK][Ops] quantization op shaders and impl Pull Request resolved: https://github.com/pytorch/executorch/pull/11369 # Operator Description The quantization operator converts floating-point tensors (fp16/fp32) to lower-precision integer formats (uint8/int8/int32) using affine quantization. This operator supports two quantization modes: - **Per-tensor quantization**: Uses a single scale and zero_point for the entire tensor - **Per-token quantization**: Uses different scale and zero_point values for each "token" (typically rows or channels) The quantization formula is: `quantized_value = clamp(round(input_value / scale) + zero_point, quant_min, quant_max)` **Example**: For a float value `2.5` with `scale=0.1`, `zero_point=128`, `quant_min=0`, `quant_max=255`: - `round(2.5 / 0.1) + 128 = round(25) + 128 = 153` - `clamp(153, 0, 255) = 153` (uint8 output) The quantization parameters serve these purposes: - **scale**: Controls the granularity of quantization (smaller scale = finer precision) - **zero_point**: Maps the floating-point zero to an integer value - **quant_min/quant_max**: Define the valid range for the quantized output type # Shader Algorithm Overview ## Texture Storage Implementation (`quantize_texture.glsl`) The texture-based implementation operates on 3D textures where data is stored in RGBA texel format (4 components per texel): **Per-tensor Mode**: Each compute thread processes one texel position. It loads a 4-component texel from the input texture, and applies quantization to each of the 4 components using shared scale/zero_point. It then writes the quantized 4-component result to the output texture. This mode is straightforward, since every texel uses the same parameters. **Per-token Mode**: We need to calculate the token index from the spatial position; the calculation differs between cases such as 3D and 2D. 
For instance, we might define the token_idx as `z * dims.y + y` for 3D, or just `y` for 2D cases. We then retrieve the per-token scale/zero_point from the texture storage according to the token_idx. We need to do component indexing based on the texel_idx and token_idx: `texel_idx = token_idx / 4`, along with the component id `comp_idx = token_idx % 4` to get the necessary scale/zero_point. We then apply quantization with the corresponding token-specific parameters to the 4 components of the current texel. ## Buffer Storage Implementation (`quantize_buffer.glsl`) The buffer-based implementation operates on linear memory buffers with stride-based indexing: **Per-tensor Mode**: In this case, each compute thread will process one element at its global position. It converts the 3D position to linear buffer indices using stride calculations `tidx_to_bufi(pos, strides)`. It then loads single scalar values from the input buffer and applies quantization using shared scale/zero_point parameters. We then store the quantized result to the output buffer at the corresponding index. **Per-token Mode**: We first calculate the logical tensor position from the linear buffer index through dimension unwrapping. We then determine the token index based on the tensor dimensionality, where `w`, `z`, `y` are the element's coordinates and `sizes` are the output tensor sizes: - 4D: `token_idx = w * (sizes.z * sizes.y) + z * sizes.y + y` - 3D: `token_idx = z * sizes.y + y` - 2D: `token_idx = y` We then directly index into scale/zero_point buffers using token_idx and also apply quantization with the token-specific parameters. # Performance Considerations / Future Improvements The current implementation uses default workgroup sizing. Profiling different local workgroup sizes could improve occupancy and cache utilization. The buffer implementation processes one element per thread; it could be optimized to process multiple elements per thread. NOTE: Currently the only input types supported are **half** (fp16) and **float** (fp32). The only output types supported are **byte** (uint8), **char** (int8), and **int** (int32). 
A future diff plans to implement **double** (fp64) input dtype support. ghstack-source-id: 291010148 @exported-using-ghexport Differential Revision: [D75959064](https://our.internmc.facebook.com/intern/diff/D75959064/) --- .../runtime/graph/ops/glsl/quantize.glslh | 25 ++ .../graph/ops/glsl/quantize_buffer.glsl | 179 ++++++++++++ .../graph/ops/glsl/quantize_buffer.yaml | 18 ++ .../graph/ops/glsl/quantize_texture.glsl | 184 +++++++++++++ .../graph/ops/glsl/quantize_texture.yaml | 18 ++ .../runtime/graph/ops/impl/Quantize.cpp | 258 ++++++++++++++++++ .../vulkan/test/op_tests/quantize_test.cpp | 250 ++++++++++++++++- backends/vulkan/test/op_tests/test_utils.cpp | 3 +- 8 files changed, 926 insertions(+), 9 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/quantize.glslh create mode 100644 backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml create mode 100644 backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml create mode 100644 backends/vulkan/runtime/graph/ops/impl/Quantize.cpp diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh b/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh new file mode 100644 index 00000000000..cde72e41ac7 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef QUANTIZE_GLSLH +#define QUANTIZE_GLSLH + +OUT_T quantize_val(IN_T value, float scale_val, int zero_point_val) { + float inv_scale = 1.0 / scale_val; + + float rounded_float = round(inv_scale * float(value)); + + int qvalue = zero_point_val + int(rounded_float); + + qvalue = max(qvalue, quant_min); + qvalue = min(qvalue, quant_max); + + return OUT_T(qvalue); +} + +#endif // QUANTIZE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl new file mode 100644 index 00000000000..ea0c2f7dce7 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl @@ -0,0 +1,179 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define IN_T ${buffer_scalar_type(IN_DTYPE)} +#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} + +#define ${MODE} + +${define_active_storage_type("buffer")} +${define_required_extensions(IN_DTYPE)} +${define_required_extensions(OUT_DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} + +$if MODE == "per_tensor": + layout(push_constant) uniform restrict Block { + float scale; + int zero_point; + int quant_min; + int quant_max; + }; +$if MODE == "per_token": + ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")} + ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")} + + layout(push_constant) uniform restrict Block { + int num_tokens; + int quant_min; + int quant_max; + }; + +${layout_declare_ubo(B, "int", "out_numel")} +${layout_declare_ubo(B, "ivec4", "t_in_sizes")} +${layout_declare_ubo(B, "ivec4", "t_in_strides")} +${layout_declare_ubo(B, "ivec4", 
"t_out_sizes")} +${layout_declare_ubo(B, "ivec4", "t_out_strides")} + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +#include "quantize.glslh" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); +const lowp ivec4 in_dim_order = unhash_dim_order(in_layout); + +/* + * QUANTIZATION SHADER (BUFFER STORAGE) + * + * This shader converts floating-point tensor values to n-bit integer representations + * using pre-computed quantization parameters (scale and zero_point). The quantization + * maps floating-point values to a discrete integer range while preserving the + * original data distribution as much as possible. + * + * ALGORITHM: + * 1. Load floating-point input value from buffer + * 2. Apply quantization formula: qvalue = round(value / scale) + zero_point + * 3. Clamp result to [quant_min, quant_max] range + * 4. 
Store quantized integer value to output buffer + * + * WORKGROUP CONFIGURATION: + * - Per-Tensor Mode: + * - Global WG Size: {num_elements, 1, 1} (one thread per tensor element) + * - Local WG Size: Default (typically {64, 1, 1} or based on global WG size) + * - Per-Token Mode: + * - Global WG Size: {num_elements, 1, 1} (one thread per tensor element) + * - Local WG Size: Default (typically {64, 1, 1} or based on global WG size) + * + * SUPPORTED CONFIGURATIONS: + * - Per-Tensor Config: Uses linear buffer indexing with stride-based tensor access + * - and supports any tensor layout through stride calculations and dimension ordering + * - Per-Token Config: Assumes width-packed layout (packed_dim = 0) + * - since that is how token index is calculated + * + * QUANTIZATION FORMULA VISUALIZATION: + * For input range [min_val, max_val] mapped to integer range [quant_min, quant_max]: + * + * Floating Point Domain: Integer Domain: + * min_val ────────────────► quant_min + * │ │ + * │ scale = (max_val - min_val) / (quant_max - quant_min) + * │ zero_point = quant_min - round(min_val / scale) + * │ │ + * max_val ────────────────► quant_max + * + * Quantization Process: + * Input: 2.5 (float) + * Step 1: value / scale = 2.5 / 0.1 = 25.0 + * Step 2: round(25.0) + zero_point = 25 + (-128) = -103 + * Step 3: clamp(-103, -128, 127) = -103 + * Output: -103 (int8) + * + * PER-TENSOR QUANTIZATION: + * - Single scale and zero_point values for entire tensor + * - All elements use same quantization parameters + * - Parameters passed as push constants for efficiency + * - Formula: qvalue = clamp(round(value / scale) + zero_point, quant_min, quant_max) + * + * PER-TOKEN QUANTIZATION: + * - Separate scale and zero_point for each token + * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements) + * - Parameters stored in buffer arrays indexed by token_id + * - Each thread calculates its token_id from tensor coordinates + * - Formula: qvalue = 
clamp(round(value / scale[token_id]) + zero_point[token_id], quant_min, quant_max) + */ + +#ifdef per_tensor + +void quantize_per_tensor() { + const int out_bufi = int(gl_GlobalInvocationID.x); + + if (out_bufi >= out_numel) { + return; + } + + const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); + const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); + + IN_T value = t_in[in_bufi]; + OUT_T qvalue = quantize_val(value, scale, zero_point); + + t_out[out_bufi] = qvalue; +} + +#else + +void quantize_per_token() { + const int out_bufi = int(gl_GlobalInvocationID.x); + + if (out_bufi >= out_numel) { + return; + } + + const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); + const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); + + IN_T value = t_in[in_bufi]; + + int token_idx = 0; + + if (t_out_sizes.w > 1) { + // 4D tensor + token_idx = out_tidx.w * (t_out_sizes.z * t_out_sizes.y) + out_tidx.z * t_out_sizes.y + out_tidx.y; + } else if (t_out_sizes.z > 1) { + // 3D tensor + token_idx = out_tidx.z * t_out_sizes.y + out_tidx.y; + } else if (t_out_sizes.y > 1) { + // 2D tensor + token_idx = out_tidx.y; + } + // For 1D tensor, token_idx remains 0 + + token_idx = min(token_idx, num_tokens - 1); + + OUT_T qvalue = quantize_val(value, t_scale[token_idx], t_zero_point[token_idx]); + + t_out[out_bufi] = qvalue; +} + +#endif + +void main() { + quantize_${MODE}(); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml new file mode 100644 index 00000000000..90af2590936 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml @@ -0,0 +1,18 @@ +quantize_buffer: + parameter_names_with_default_values: + IN_DTYPE: float + OUT_DTYPE: int32 + MODE: per_tensor + generate_variant_forall: + IN_DTYPE: + - VALUE: half + - VALUE: float + OUT_DTYPE: + - VALUE: uint8 + - VALUE: int8 + - VALUE: int32 + shader_variants: + - NAME: 
quantize_per_tensor_buffer + MODE: per_tensor + - NAME: quantize_per_token_buffer + MODE: per_token diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl new file mode 100644 index 00000000000..9ba7074f75b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl @@ -0,0 +1,184 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define IN_T ${buffer_scalar_type(IN_DTYPE)} +#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} + +#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} +#define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")} + +#define ${MODE} + +${define_active_storage_type("texture3d")} +${define_required_extensions(IN_DTYPE)} +${define_required_extensions(OUT_DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} + +$if MODE == "per_tensor": + layout(push_constant) uniform restrict Block { + float scale; + int zero_point; + int quant_min; + int quant_max; + }; +$if MODE == "per_token": + ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")} + ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")} + + layout(push_constant) uniform restrict Block { + int num_tokens; + int quant_min; + int quant_max; + }; + +${layout_declare_ubo(B, "ivec3", "t_in_limits")} +${layout_declare_ubo(B, "ivec3", "t_out_limits")} + +#include "indexing_utils.h" +#include "quantize.glslh" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * QUANTIZATION SHADER (TEXTURE STORAGE) + * + * This shader converts floating-point 
tensor values to n-bit integer representations + * using pre-computed quantization parameters (scale and zero_point). The quantization + * maps floating-point values to a discrete integer range while preserving the + * original data distribution as much as possible. + * + * ALGORITHM: + * 1. Load floating-point texel (4 values) from 3D texture + * 2. Apply quantization formula to each component: qvalue = round(value / scale) + zero_point + * 3. Clamp each result to [quant_min, quant_max] range + * 4. Store quantized integer texel to output texture + * + * WORKGROUP CONFIGURATION: + * - Per-Tensor Mode: + * - Global WG Size: {W, H, C/4} for input size (W, H, C) with width-packing + * - Local WG Size: Default (typically {8, 8, 1} or based on global WG size) + * - Per-Token Mode: + * - Global WG Size: {W, H, C/4} for input size (W, H, C) with width-packing + * - Local WG Size: Default (typically {8, 8, 1} or based on global WG size) + * + * SUPPORTED CONFIGURATIONS: + * - Texture Storage: Uses 3D texture indexing with texel-based processing + * - Assumes width-packed layout (packed_dim = 0) in current implementation + * - Handles texel padding for non-multiple-of-4 tensor dimensions + * - For per-token mode: scale/zero_point tensors must use buffer storage + * + * QUANTIZATION FORMULA VISUALIZATION: + * For input range [min_val, max_val] mapped to integer range [quant_min, quant_max]: + * + * Floating Point Domain: Integer Domain: + * min_val ────────────────► quant_min + * │ │ + * │ scale = (max_val - min_val) / (quant_max - quant_min) + * │ zero_point = quant_min - round(min_val / scale) + * │ │ + * max_val ────────────────► quant_max + * + * Texel Quantization Process: + * Input Texel: [2.5, -1.0, 0.5, 3.2] (float4) + * Per-component quantization with scale=0.1, zero_point=-128: + * Component 0: round(2.5 / 0.1) + (-128) = 25 + (-128) = -103 + * Component 1: round(-1.0 / 0.1) + (-128) = -10 + (-128) = -138 → clamp to -128 + * Component 2: round(0.5 / 0.1) + (-128) 
= 5 + (-128) = -123 + * Component 3: round(3.2 / 0.1) + (-128) = 32 + (-128) = -96 + * Output Texel: [-103, -128, -123, -96] (int4) + * + * PER-TENSOR QUANTIZATION: + * - Single scale and zero_point values for entire tensor + * - All texel components use same quantization parameters + * - Parameters passed as push constants for efficiency + * - Each thread processes one texel (4 elements) independently + * - Formula: qvalue[i] = clamp(round(value[i] / scale) + zero_point, quant_min, quant_max) + * + * PER-TOKEN QUANTIZATION: + * - Separate scale and zero_point for each token + * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements) + * - Parameters stored in buffer arrays indexed by token_id + * - Each thread calculates token_id from its 3D texture position + * - Scale/zero_point buffers accessed directly (not as textures) + * - Formula: qvalue[i] = clamp(round(value[i] / scale[token_id]) + zero_point[token_id], quant_min, quant_max) + */ + +#ifdef per_tensor + +void quantize_per_tensor() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, t_in_limits))) { + return; + } + + FVEC4_T intex = load_texel(t_in, pos); + IVEC4_T outtex; + + [[unroll]] for (int i = 0; i < 4; ++i) { + IN_T value = IN_T(intex[i]); + OUT_T qvalue = quantize_val(value, scale, zero_point); + outtex[i] = qvalue; + } + write_texel(t_out, pos, outtex); +} + +#else + +void quantize_per_token() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, t_in_limits))) { + return; + } + + FVEC4_T intex = load_texel(t_in, pos); + + int token_idx = 0; + ivec3 dims = t_in_limits; + + if (dims.z > 1) { + // 3D tensor + token_idx = pos.z * dims.y + pos.y; + } else if (dims.y > 1) { + // 2D tensor + token_idx = pos.y; + } + // For 1D tensor, token_idx remains 0 + + token_idx = min(token_idx, num_tokens - 1); + + // Scale and zero_point are prepacked as buffers, so direct access + float scale_val = 
t_scale[token_idx]; + int zero_point_val = t_zero_point[token_idx]; + + IVEC4_T outtex; + [[unroll]] for (int i = 0; i < 4; ++i) { + IN_T value = IN_T(intex[i]); + OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); + outtex[i] = qvalue; + } + + write_texel(t_out, pos, outtex); +} + +#endif + +void main() { + quantize_${MODE}(); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml new file mode 100644 index 00000000000..042eb0f8196 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml @@ -0,0 +1,18 @@ +quantize_texture: + parameter_names_with_default_values: + IN_DTYPE: float + OUT_DTYPE: int32 + MODE: per_tensor + generate_variant_forall: + IN_DTYPE: + - VALUE: half + - VALUE: float + OUT_DTYPE: + - VALUE: uint8 + - VALUE: int8 + - VALUE: int32 + shader_variants: + - NAME: quantize_per_tensor_texture3d + MODE: per_tensor + - NAME: quantize_per_token_texture3d + MODE: per_token diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp new file mode 100644 index 00000000000..35712d59fb9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp @@ -0,0 +1,258 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include +#include + +namespace vkcompute { + +namespace { + +void resize_quantize_output( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + const ValueRef out = args.at(0).refs.at(0); + const ValueRef in = args.at(1).refs.at(0); + graph->virtual_resize(out, graph->sizes_of(in)); +} + +} // namespace + +void add_quantize_per_tensor_node( + ComputeGraph& graph, + const ValueRef& input, + const ValueRef& scale, + const ValueRef& zero_point, + const ValueRef& quant_min, + const ValueRef& quant_max, + const ValueRef& output) { + std::string kernel_name("quantize_per_tensor"); + add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); + add_dtype_suffix(kernel_name, graph.dtype_of(input)); + add_dtype_suffix(kernel_name, graph.dtype_of(output)); + + float scale_val = static_cast(graph.get_double(scale)); + int zero_point_val = static_cast(graph.get_int(zero_point)); + int quant_min_val = static_cast(graph.get_int(quant_min)); + int quant_max_val = static_cast(graph.get_int(quant_max)); + + vkapi::ParamsBindList param_ubos; + std::vector push_constants; + + if (graph.is_buffer_storage(input)) { + param_ubos = { + graph.numel_ubo(input), + graph.sizes_ubo(input), + graph.strides_ubo(input), + graph.sizes_ubo(output), + graph.strides_ubo(output)}; + push_constants = { + PushConstantDataInfo(&scale_val, sizeof(float)), + PushConstantDataInfo(&zero_point_val, sizeof(int)), + PushConstantDataInfo(&quant_min_val, sizeof(int)), + PushConstantDataInfo(&quant_max_val, sizeof(int)), + }; + } else { + param_ubos = { + graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; + push_constants = { + PushConstantDataInfo(&scale_val, sizeof(float)), + PushConstantDataInfo(&zero_point_val, sizeof(int)), + PushConstantDataInfo(&quant_min_val, sizeof(int)), + PushConstantDataInfo(&quant_max_val, sizeof(int)), + }; + } + + vkapi::SpecVarList spec_vars = { + 
graph.hashed_layout_of(output), + graph.hashed_layout_of(input), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{output, vkapi::kWrite}, {input, vkapi::kRead}}, + // Shader param buffers + param_ubos, + // Push Constants + push_constants, + // Specialization Constants + spec_vars, + // Resize Args + {}, + // Resizing Logic + resize_quantize_output)); +} + +void add_quantize_per_token_node( + ComputeGraph& graph, + const ValueRef& input, + const ValueRef& scale, + const ValueRef& zero_point, + const ValueRef& quant_min, + const ValueRef& quant_max, + const ValueRef& output) { + std::string kernel_name("quantize_per_token"); + add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); + add_dtype_suffix(kernel_name, graph.dtype_of(input)); + add_dtype_suffix(kernel_name, graph.dtype_of(output)); + + int quant_min_val = static_cast(graph.get_int(quant_min)); + int quant_max_val = static_cast(graph.get_int(quant_max)); + + int num_tokens = static_cast(graph.sizes_of(scale)[0]); + + vkapi::ParamsBindList param_ubos; + std::vector push_constants; + + if (graph.is_buffer_storage(input)) { + param_ubos = { + graph.numel_ubo(input), + graph.sizes_ubo(input), + graph.strides_ubo(input), + graph.sizes_ubo(output), + graph.strides_ubo(output), + }; + push_constants = { + PushConstantDataInfo(&num_tokens, sizeof(int)), + PushConstantDataInfo(&quant_min_val, sizeof(int)), + PushConstantDataInfo(&quant_max_val, sizeof(int)), + }; + } else { + param_ubos = { + graph.logical_limits_ubo(input), + graph.logical_limits_ubo(output), + }; + push_constants = { + PushConstantDataInfo(&num_tokens, sizeof(int)), + PushConstantDataInfo(&quant_min_val, sizeof(int)), + PushConstantDataInfo(&quant_max_val, sizeof(int)), + }; + } + + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(output), + graph.hashed_layout_of(input), + 
}; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{output, vkapi::kWrite}, + {input, vkapi::kRead}, + {{scale, zero_point}, vkapi::kRead}}, + // Shader param buffers + param_ubos, + // Push Constants + push_constants, + // Specialization Constants + spec_vars, + // Resize Args + {}, + // Resizing Logic + resize_quantize_output)); +} + +void quantize_per_tensor_impl( + ComputeGraph& graph, + const std::vector& args) { + int arg_idx = 0; + const ValueRef input = args[arg_idx++]; + const ValueRef scale = args[arg_idx++]; + const ValueRef zero_point = args[arg_idx++]; + const ValueRef quant_min = args[arg_idx++]; + const ValueRef quant_max = args[arg_idx++]; + const ValueRef output = args[arg_idx++]; + + // Check tensor types + VK_CHECK_COND(graph.val_is_tensor(input)); + VK_CHECK_COND(graph.val_is_tensor(output)); + + // Verify input is a floating point type + VK_CHECK_COND( + graph.dtype_of(input) == vkapi::kFloat || + graph.dtype_of(input) == vkapi::kHalf); + + add_quantize_per_tensor_node( + graph, input, scale, zero_point, quant_min, quant_max, output); +} + +void quantize_per_token_impl( + ComputeGraph& graph, + const std::vector& args) { + int arg_idx = 0; + const ValueRef input = args[arg_idx++]; + const ValueRef scale = args[arg_idx++]; + const ValueRef zero_point = args[arg_idx++]; + const ValueRef quant_min = args[arg_idx++]; + const ValueRef quant_max = args[arg_idx++]; + const ValueRef output = args[arg_idx++]; + + // Check tensor types + VK_CHECK_COND(graph.val_is_tensor(input)); + VK_CHECK_COND(graph.val_is_tensor(scale)); + VK_CHECK_COND(graph.val_is_tensor(zero_point)); + VK_CHECK_COND(graph.val_is_tensor(output)); + + // Verify input is a floating point type + VK_CHECK_COND( + graph.dtype_of(input) == vkapi::kFloat || + graph.dtype_of(input) == vkapi::kHalf); + + // Check that scale and zero_point have 
buffer storage and width packing + VK_CHECK_COND(graph.is_buffer_storage(scale)); + VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); + VK_CHECK_COND(graph.is_buffer_storage(zero_point)); + VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); + + // Check that tensors with texture storage have standard axis map + if (!graph.is_buffer_storage(input)) { + VK_CHECK_COND(graph.has_standard_axis_map(input)); + } + if (!graph.is_buffer_storage(output)) { + VK_CHECK_COND(graph.has_standard_axis_map(output)); + } + + // Calculate number of tokens (product of all dimensions except the last one) + int64_t num_tokens = 1; + const auto input_sizes = graph.sizes_of(input); + for (size_t i = 0; i < input_sizes.size() - 1; i++) { + num_tokens *= input_sizes[i]; + } + + const auto scale_sizes = graph.sizes_of(scale); + const auto zero_point_sizes = graph.sizes_of(zero_point); + + VK_CHECK_COND(scale_sizes.size() == 1); + VK_CHECK_COND(zero_point_sizes.size() == 1); + VK_CHECK_COND(scale_sizes[0] == num_tokens); + VK_CHECK_COND(zero_point_sizes[0] == num_tokens); + + add_quantize_per_token_node( + graph, input, scale, zero_point, quant_min, quant_max, output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(quantize_per_tensor.default, quantize_per_tensor_impl); + VK_REGISTER_OP(quantize_per_token.default, quantize_per_token_impl); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp index 8b79dc1ce6b..7ea98b14fb2 100644 --- a/backends/vulkan/test/op_tests/quantize_test.cpp +++ b/backends/vulkan/test/op_tests/quantize_test.cpp @@ -21,6 +21,9 @@ #include #include +#include + +float eps = 1e-7; namespace torch { namespace executor { @@ -383,6 +386,8 @@ void test_reference_quantize_per_tensor( // Reshape back to original dimensions input = flat_input.reshape(input_sizes_int64); + scale = scale < eps ? 
eps : scale; + // Get reference output at::Tensor reference_out = quantize_per_tensor_reference_impl( input, scale, zero_point, quant_min, quant_max, dtype); @@ -435,6 +440,8 @@ void test_vulkan_quantize_per_tensor_impl( at::Tensor input = at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); + scale = scale < eps ? eps : scale; + // Get reference output at::Tensor reference_out = torch::executor::native::quantize_per_tensor_aten( input, scale, zero_point, quant_min, quant_max, dtype); @@ -490,7 +497,7 @@ void test_vulkan_quantize_per_tensor_impl( at::Tensor reference_int = reference_out.to(at::kInt); at::Tensor vk_int = vk_out.to(at::kInt); - const bool output_correct = at::equal(reference_int, vk_int); + const bool output_correct = at::allclose(reference_int, vk_int); if (!output_correct) { at::Tensor diffs = at::abs(reference_int - vk_int); @@ -500,6 +507,10 @@ void test_vulkan_quantize_per_tensor_impl( std::cout << " zero_point: " << zero_point << std::endl; std::cout << " quant_min: " << quant_min << std::endl; std::cout << " quant_max: " << quant_max << std::endl; + std::cout << " storage type: " + << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" + : "texture") + << std::endl; std::cout << "input:" << std::endl; std::cout << input << std::endl; @@ -564,9 +575,89 @@ TEST( at::kInt); } +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_tensor_float_to_uint8) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + test_vulkan_quantize_per_tensor( + {5, 3, 2, 4}, // input sizes + 0.01, // scale + 1, // zero_point + 0, // quant_min + 255, // quant_max + at::kFloat, + at::kByte); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_tensor_float_to_int8) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + test_vulkan_quantize_per_tensor( + {5, 3, 2, 4}, // input sizes + 0.01, // scale + 1, // zero_point + -128, // quant_min + 127, // quant_max + at::kFloat, + at::kChar); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_tensor_float_to_int32) { + test_vulkan_quantize_per_tensor( + {5, 3, 2, 4}, // input sizes + 0.01, // scale + 1, // zero_point + -2147483648, // quant_min + 2147483647, // quant_max + at::kFloat, + at::kInt); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_tensor_float_to_int32_small_scale) { + test_vulkan_quantize_per_tensor( + {2, 8, 1, 3}, // input sizes + 0.0, // scale + 20, // zero_point + -2147483648, // quant_min + 2147483647, // quant_max + at::kFloat, + at::kInt); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_tensor_half_to_int8) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_float16_buffers_support()) { + GTEST_SKIP(); + } + test_vulkan_quantize_per_tensor( + {2, 3}, // input sizes + 0.01, // scale + 1, // zero_point + -128, // quant_min + 127, // quant_max + at::kHalf, // input dtype + at::kChar); // output dtype +} + void test_reference_quantize_per_token( const std::vector& input_sizes, - const std::vector& scales, + const std::vector& 
pre_scales, const std::vector& zero_points, int64_t quant_min, int64_t quant_max, @@ -595,9 +686,14 @@ void test_reference_quantize_per_token( } // Verify that the number of tokens matches the size of scales and zero_points - ASSERT_EQ(num_tokens, scales.size()); + ASSERT_EQ(num_tokens, pre_scales.size()); ASSERT_EQ(num_tokens, zero_points.size()); + std::vector scales = pre_scales; + for (auto& s : scales) { + s = s < eps ? eps : s; + } + // Create scale and zero_point tensors at::Tensor scale_tensor = at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); @@ -646,7 +742,7 @@ void test_reference_quantize_per_token( void test_vulkan_quantize_per_token_impl( const std::vector& input_sizes, - const std::vector& scales, + const std::vector& pre_scales, const std::vector& zero_points, int64_t quant_min, int64_t quant_max, @@ -662,9 +758,14 @@ void test_vulkan_quantize_per_token_impl( num_tokens *= input_sizes[i]; } - ASSERT_EQ(num_tokens, scales.size()); + ASSERT_EQ(num_tokens, pre_scales.size()); ASSERT_EQ(num_tokens, zero_points.size()); + std::vector scales = pre_scales; + for (auto& s : scales) { + s = s < eps ? 
eps : s; + } + // Create input tensor with random values std::vector input_sizes_int64( input_sizes.begin(), input_sizes.end()); @@ -688,9 +789,15 @@ void test_vulkan_quantize_per_token_impl( IOValueRef r_input = graph.add_input_tensor( input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), vkapi::kFloat, in_storage); + scale_tensor.sizes().vec(), + vkapi::kFloat, + utils::kBuffer, + utils::kWidthPacked); IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), vkapi::kInt, in_storage); + zero_point_tensor.sizes().vec(), + vkapi::kInt, + utils::kBuffer, + utils::kWidthPacked); const ValueRef r_quant_min = graph.add_scalar(quant_min); const ValueRef r_quant_max = graph.add_scalar(quant_max); @@ -744,7 +851,7 @@ void test_vulkan_quantize_per_token_impl( at::Tensor reference_int = reference_out.to(at::kInt); at::Tensor vk_int = vk_out.to(at::kInt); - const bool output_correct = at::equal(reference_int, vk_int); + const bool output_correct = at::allclose(reference_int, vk_int); if (!output_correct) { at::Tensor diffs = at::abs(reference_int - vk_int); @@ -841,3 +948,130 @@ TEST( at::kHalf, at::kByte); } + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_token_float_to_uint8) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + std::vector scales = { + -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; + std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; + + test_vulkan_quantize_per_token( + {5, 2, 4}, // input sizes (5*2=10 tokens) + scales, + zero_points, + 0, // quant_min + 255, // quant_max + at::kFloat, + at::kByte); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_token_float_to_int8) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + std::vector scales = { + -0.5, 
-0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; + std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; + + test_vulkan_quantize_per_token( + {5, 2, 4}, // input sizes (5*2=10 tokens) + scales, + zero_points, + -128, // quant_min + 127, // quant_max + at::kFloat, + at::kChar); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_token_float_to_int32) { + std::vector scales = { + -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; + std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; + + test_vulkan_quantize_per_token( + {5, 2, 4}, // input sizes (5*2=10 tokens) + scales, + zero_points, + -2147483648, // quant_min + 2147483647, // quant_max + at::kFloat, + at::kInt); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_token_float_to_int32_small_scales) { + std::vector scales = { + 0, + 2.9387358770557188e-39f, + 1.40129846e-45f, + 1.17549435e-38f, + 0.0000000000001}; + std::vector zero_points = {20, -10, 15, 200, 50}; + + test_vulkan_quantize_per_token( + {5, 2}, // input sizes (5 tokens) + scales, + zero_points, + -2147483648, // quant_min + 2147483647, // quant_max + at::kFloat, + at::kInt); +} + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_token_float_to_uint8_many_tokens) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + std::vector scales(18, 0.1); + std::vector zero_points(18, 5); + + // Alternate scale values + for (size_t i = 0; i < scales.size(); i++) { + scales[i] = (i % 2 == 0) ? 
0.3 : -0.5; + } + + test_vulkan_quantize_per_token( + {3, 3, 2, 3}, // input sizes (3*3*2=18 tokens) + scales, + zero_points, + 0, // quant_min + 125, // quant_max + at::kFloat, + at::kByte); +} + +TEST(VulkanQuantizePerTensorTest, test_vulkan_quantize_per_token_half_to_int8) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_float16_buffers_support()) { + GTEST_SKIP(); + } + std::vector scales = {0.1, 0.2}; + std::vector zero_points = {0, 5}; + + test_vulkan_quantize_per_token( + {2, 2}, // input sizes (2 tokens) + scales, + zero_points, + -128, // quant_min + 127, // quant_max + at::kHalf, // input dtype + at::kChar); // output dtype +} diff --git a/backends/vulkan/test/op_tests/test_utils.cpp b/backends/vulkan/test/op_tests/test_utils.cpp index 196f079be2c..c5702abd079 100644 --- a/backends/vulkan/test/op_tests/test_utils.cpp +++ b/backends/vulkan/test/op_tests/test_utils.cpp @@ -94,7 +94,8 @@ vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { case c10::kInt: return vkapi::kInt; case c10::kLong: - return vkapi::kLong; + // No support for 64-bit integers + return vkapi::kInt; case c10::kChar: return vkapi::kChar; case c10::kByte: