diff --git a/backends/vulkan/_passes/remove_redundant_ops.py b/backends/vulkan/_passes/remove_redundant_ops.py index 25bdd34de70..b95733021fc 100644 --- a/backends/vulkan/_passes/remove_redundant_ops.py +++ b/backends/vulkan/_passes/remove_redundant_ops.py @@ -32,6 +32,13 @@ class RemoveRedundantOpsTransform(ExportPass): exir_ops.edge.dim_order_ops._to_dim_order_copy.default, exir_ops.edge.dim_order_ops._clone_dim_order.default, exir_ops.edge.aten.expand_copy.default, + # copy.default(self, src): no-op when src dtype/shape matches self. + exir_ops.edge.aten.copy.default, + } + + # For these ops the meaningful input is args[1] (src), not args[0] (self). + _src_arg1_ops: Set[OpType] = { + exir_ops.edge.aten.copy.default, } def __init__(self) -> None: @@ -41,7 +48,8 @@ def _should_remove(self, node: torch.fx.Node) -> bool: if node.target not in self.redundant_ops: return False - orig_node = node.args[0] + src_arg_idx = 1 if node.target in self._src_arg1_ops else 0 + orig_node = node.args[src_arg_idx] assert isinstance(orig_node, torch.fx.Node) src_dtype = orig_node.meta["val"].dtype @@ -61,7 +69,8 @@ def _remove(self, graph_module: torch.fx.GraphModule) -> None: if not self._should_remove(node): continue - node.replace_all_uses_with(node.args[0]) + src_arg_idx = 1 if node.target in self._src_arg1_ops else 0 + node.replace_all_uses_with(node.args[src_arg_idx]) graph_module.graph.eliminate_dead_code() diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 855df9d2e74..b18bf3b81c6 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -167,6 +167,9 @@ def update_features_impl(op: OpKey): # Guard and assert ops torch.ops.aten._assert_scalar.default, torch.ops.aten.sym_constrain_range_for_size.default, + # copy.default is a no-op when src dtype matches dst dtype; removed by + # RemoveRedundantOpsTransform before execution. + exir_ops.edge.aten.copy.default, ] ) def register_ephemeral_ops(): @@ -231,6 +234,19 @@ def register_clamp(): exir_ops.edge.aten.div.Tensor, exir_ops.edge.aten.div.Tensor_mode, exir_ops.edge.aten.pow.Tensor_Tensor, + ] +) +def register_binaryop_cpp_ops(): + return OpFeatures( + inputs_storage=utils.ANY_STORAGE, + inputs_dtypes=utils.FP_INT_T, + supports_resize=True, + supports_highdim=True, + ) + + +@update_features( + [ exir_ops.edge.aten.eq.Tensor, exir_ops.edge.aten.lt.Tensor, exir_ops.edge.aten.le.Tensor, @@ -238,10 +254,26 @@ def register_clamp(): exir_ops.edge.aten.ge.Tensor, ] ) -def register_binaryop_cpp_ops(): +def register_comparison_ops(): return OpFeatures( inputs_storage=utils.ANY_STORAGE, inputs_dtypes=utils.FP_INT_T, + outputs_dtypes=utils.BOOL_T, + supports_resize=True, + supports_highdim=True, + ) + + +# ============================================================================= +# BinaryOp.cpp (bitwise) +# ============================================================================= + + +@update_features(exir_ops.edge.aten.bitwise_and.Tensor) +def register_bitwise_and(): + return OpFeatures( + inputs_storage=utils.ANY_STORAGE, + inputs_dtypes=utils.BOOL_T, supports_resize=True, supports_highdim=True, ) @@ -673,6 +705,7 @@ def register_argreduce_cpp_ops(): return OpFeatures( inputs_storage=utils.ANY_TEXTURE, inputs_dtypes=utils.FP_T, + outputs_dtypes=utils.INT_T, supports_resize=True, supports_highdim=True, are_node_inputs_supported_fn=is_reduce_node_supported, @@ -1157,6 +1190,58 @@ def register_index_select(): ) +# ============================================================================= +# Where.cpp +# ============================================================================= + + +@update_features(exir_ops.edge.aten.where.self) +def register_where(): + return OpFeatures( + inputs_storage=utils.ANY_STORAGE, + inputs_dtypes=[utils.BOOL_T, utils.FP_T, utils.FP_T], + outputs_dtypes=utils.FP_T, + supports_resize=True, + ) + + +# ============================================================================= +# IndexTensor.cpp +# ============================================================================= + + +@update_features(exir_ops.edge.aten.index.Tensor) +def register_index_tensor(): + def check_index_tensor_node(node: torch.fx.Node) -> bool: + self_arg = node.args[0] + indices = node.args[1] + + # Only support 1D self tensor + if not isinstance(self_arg, torch.fx.Node): + return False + self_val = self_arg.meta.get("val", None) + if self_val is None: + return False + if len(self_val.size()) != 1: + return False + + # Only support exactly one non-None index tensor + if not isinstance(indices, (list, tuple)): + return False + non_none = [idx for idx in indices if idx is not None] + if len(non_none) != 1: + return False + + return True + + return OpFeatures( + inputs_storage=utils.ANY_STORAGE, + inputs_dtypes=utils.FP_INT_T, + supports_resize=True, + are_node_inputs_supported_fn=check_index_tensor_node, + ) + + # ============================================================================= # Arange.cpp # ============================================================================= diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml index ee96b5c05b4..c3d5cd00204 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml @@ -116,3 +116,11 @@ binary_op: - VALUE: half - VALUE: float - VALUE: int32 + - NAME: binary_bitwise_and + OPERATOR: X & Y + generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: uint8 diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.glsl new file mode 100644 index 00000000000..3469bb22fcc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.glsl @@ -0,0 +1,58 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("buffer", DTYPE)} + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} + +layout(std430) buffer; + +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_self", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_index", "int", "buffer")} + +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} +${layout_declare_ubo(B, "BufferMetadata", "index")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Implements aten.index.Tensor for the case where self is 1D and there is +// exactly one index tensor. Each output element is: +// output[...] = self[index[...]] + +void main() { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { + return; + } + + // Convert output buffer index to tensor index + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + + // Read the index value at the same tensor position + const uint index_bufi = tensor_idx_to_linear_idx(index, out_tidx); + const int idx = t_index[index_bufi]; + + // Construct a tensor index for the 1D self tensor. + // In WHCN ordering, a 1D tensor has its elements along dim 0 (width). + TensorIndex self_tidx; + self_tidx.data[0] = uvec4(uint(idx), 0, 0, 0); + self_tidx.data[1] = uvec4(0); + const uint self_bufi = tensor_idx_to_linear_idx(inp, self_tidx); + + t_out[out_bufi] = t_self[self_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.yaml new file mode 100644 index 00000000000..ef79704203f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +index_tensor_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: index_tensor_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl new file mode 100644 index 00000000000..8f8026c0a0c --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("texture3d", DTYPE)} + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +${define_active_storage_type("texture3d")} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "common.glslh" +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_self", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_index", "int", "texture3d")} + +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} +${layout_declare_ubo(B, "TextureMetadata", "index")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Implements aten.index.Tensor for the case where self is 1D and there is +// exactly one index tensor. Each output element is: +// output[...] = self[index[...]] + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + + if (out_of_bounds(out_pos, outp)) { + return; + } + + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + ivec4 idx_texel = texelFetch(t_index, out_pos, 0); + + VEC4_T out_texel = VEC4_T(0); + + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + for (int comp = 0; comp < limit; comp++) { + int idx = idx_texel[comp]; + + // Construct a tensor index for the 1D self tensor. + // In WHCN ordering, a 1D tensor has its elements along dim 0 (width). + TensorIndex4D self_tidx; + self_tidx.data = ivec4(idx, 0, 0, 0); + + TextureElementIndex self_elem = + tensor4d_idx_to_texture_element_idx_simple(inp, self_tidx); + + VEC4_T self_texel = texelFetch(t_self, self_elem.pos, 0); + out_texel[comp] = self_texel[self_elem.comp]; + + out_tidx.data[outp.packed_dim]++; + } + + imageStore(t_out, out_pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.yaml new file mode 100644 index 00000000000..3e274fa177a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +index_tensor_texture: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: index_tensor_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.glsl new file mode 100644 index 00000000000..76e6a6c6238 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.glsl @@ -0,0 +1,75 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type("buffer")} + +layout(std430) buffer; + +#include "indexing.glslh" + +// Output staging buffer: raw int8 data interpreted as int32 for device compat +${layout_declare_tensor(B, "w", "nchw_out", "int", "buffer")} +// Input buffer: packed int8x4 values (each int32 contains 4 packed int8) +${layout_declare_tensor(B, "r", "t_inp", "int", "buffer")} + +// Metadata for input tensor +${layout_declare_ubo(B, "BufferMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} + +void main() { + // One thread per output int32 in the NCHW staging buffer. + // Each output int32 holds 4 consecutive NCHW bytes. + const uint out_int32_idx = gl_GlobalInvocationID.x; + + const uint W = inp.sizes[0][0]; + const uint H = inp.sizes[0][1]; + const uint C = inp.sizes[0][2]; + const uint N = inp.sizes[0][3]; + const uint total_numel = W * H * C * N; + const uint num_out_int32s = (total_numel + 3u) / 4u; + + if (out_int32_idx >= num_out_int32s) { + return; + } + + int output_int32 = 0; + [[unroll]] for (int j = 0; j < 4; ++j) { + const uint nchw_idx = out_int32_idx * 4u + uint(j); + if (nchw_idx >= total_numel) { + break; + } + + // Convert NCHW linear index to tensor4D (WHCN) coordinates. + const uint w = nchw_idx % W; + const uint h = (nchw_idx / W) % H; + const uint c = (nchw_idx / (W * H)) % C; + const uint n = nchw_idx / (W * H * C); + + TensorIndex4D tidx; + tidx.data = ivec4(int(w), int(h), int(c), int(n)); + + // tensor4d_idx_to_buf_idx returns a linear element index where + // element_index / 4 is the int32 slot and element_index % 4 is the byte + // position within that int32. This matches the packing order used by + // nchw_to_int8x4_buffer when writing to the int8x4 buffer. + const int elem_buf_idx = tensor4d_idx_to_buf_idx(inp, tidx, inp_layout); + const int int8_val = + (t_inp[elem_buf_idx / 4] >> ((elem_buf_idx % 4) * 8)) & 0xFF; + + output_int32 |= (int8_val << (j * 8)); + } + + nchw_out[out_int32_idx] = output_int32; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.yaml new file mode 100644 index 00000000000..1ee9728779a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.yaml @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +int8x4_buffer_to_nchw: + parameter_names_with_default_values: + DTYPE: int + shader_variants: + - NAME: int8x4_buffer_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl index b0c07e73637..3a63099e7df 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl @@ -115,7 +115,7 @@ void main() { #endif #ifdef OUTPUT_IS_INDICES - t_out[out_bufi] = int(0); // int(local_accum.idx); + t_out[out_bufi] = int(local_accum.idx); #else t_out[out_bufi] = convert_to_T(local_accum.val); #endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index 281b317e0b5..cab7cf54046 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -1,5 +1,3 @@ -// where.glsl - /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. @@ -8,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ - #version 450 core ${define_required_extensions(STORAGE, DTYPE)} @@ -24,44 +21,50 @@ ${define_active_storage_type(STORAGE)} layout(std430) buffer; +#include "indexing.glslh" + ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_condition", "bool", STORAGE)} ${layout_declare_tensor(B, "r", "t_self", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} - -#include "indexing_utils.h" - $if STORAGE == "buffer": - ${layout_declare_ubo(B, "int", "out_numl")} - ${layout_declare_ubo(B, "ivec4", "out_strides")} - ${layout_declare_ubo(B, "ivec4", "cond_strides")} - ${layout_declare_ubo(B, "ivec4", "self_strides")} - ${layout_declare_ubo(B, "ivec4", "other_strides")} + ${layout_declare_ubo(B, "BufferMetadata", "outp")} + ${layout_declare_ubo(B, "BufferMetadata", "condp")} + ${layout_declare_ubo(B, "BufferMetadata", "selfp")} + ${layout_declare_ubo(B, "BufferMetadata", "otherp")} $else: - ${layout_declare_ubo(B, "ivec3", "out_limits")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + ${layout_declare_ubo(B, "TextureMetadata", "outp")} + ${layout_declare_ubo(B, "TextureMetadata", "condp")} + ${layout_declare_ubo(B, "TextureMetadata", "selfp")} + ${layout_declare_ubo(B, "TextureMetadata", "otherp")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { - int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numl) { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + + TensorIndex cond_tidx = out_tidx; + clamp_tensor_idx(condp, cond_tidx); - const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); - const int self_bufi = tidx_to_bufi(out_tidx, self_strides); - const int other_bufi = tidx_to_bufi(out_tidx, other_strides); + TensorIndex self_tidx = out_tidx; + clamp_tensor_idx(selfp, self_tidx); - COND_T cond = t_condition[cond_bufi] ; + TensorIndex other_tidx = out_tidx; + clamp_tensor_idx(otherp, other_tidx); + + const uint cond_bufi = tensor_idx_to_linear_idx(condp, cond_tidx); + const uint self_bufi = tensor_idx_to_linear_idx(selfp, self_tidx); + const uint other_bufi = tensor_idx_to_linear_idx(otherp, other_tidx); + + COND_T cond = t_condition[cond_bufi]; T v_self = t_self[self_bufi]; T v_other = t_other[other_bufi]; @@ -72,29 +75,49 @@ void main() { } } -#else // !USING_BUFFER +#else // USING_TEXTURE void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { + if (out_of_bounds(out_pos, outp)) { return; } - vec4 cond = load_texel(t_condition, pos); - VEC4_T selftex = load_texel(t_self, pos); - VEC4_T othertex = load_texel(t_other, pos); - - VEC4_T outtex; - - for (int idx = 0; idx < 4; ++idx) { - if (cond[idx] == 1) { - outtex[idx] = selftex[idx]; + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + + VEC4_T outtex = VEC4_T(0); + + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + for (int comp = 0; comp < limit; comp++) { + TensorIndex4D cond_tidx; + cond_tidx.data = min(out_tidx.data, condp.sizes - 1); + TextureElementIndex cond_elem = + tensor4d_idx_to_texture_element_idx_simple(condp, cond_tidx); + uint cond_val = texelFetch(t_condition, cond_elem.pos, 0)[cond_elem.comp]; + + TensorIndex4D self_tidx; + self_tidx.data = min(out_tidx.data, selfp.sizes - 1); + TextureElementIndex self_elem = + tensor4d_idx_to_texture_element_idx_simple(selfp, self_tidx); + VEC4_T self_texel = texelFetch(t_self, self_elem.pos, 0); + + TensorIndex4D other_tidx; + other_tidx.data = min(out_tidx.data, otherp.sizes - 1); + TextureElementIndex other_elem = + tensor4d_idx_to_texture_element_idx_simple(otherp, other_tidx); + VEC4_T other_texel = texelFetch(t_other, other_elem.pos, 0); + + if (cond_val > 0) { + outtex[comp] = self_texel[self_elem.comp]; } else { - outtex[idx] = othertex[idx]; + outtex[comp] = other_texel[other_elem.comp]; } + + out_tidx.data[outp.packed_dim]++; } - write_texel(t_out, pos, outtex); + + imageStore(t_out, out_pos, outtex); } - #endif // !USING_BUFFER +#endif diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 025b483eab7..92c2fa218ec 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -198,6 +198,7 @@ DEFINE_BINARY_OP_FN(lt); DEFINE_BINARY_OP_FN(le); DEFINE_BINARY_OP_FN(gt); DEFINE_BINARY_OP_FN(ge); +DEFINE_BINARY_OP_FN(bitwise_and); REGISTER_OPERATORS { VK_REGISTER_OP(aten.add.Tensor, add); @@ -212,6 +213,7 @@ REGISTER_OPERATORS { VK_REGISTER_OP(aten.le.Tensor, le); VK_REGISTER_OP(aten.gt.Tensor, gt); VK_REGISTER_OP(aten.ge.Tensor, ge); + VK_REGISTER_OP(aten.bitwise_and.Tensor, bitwise_and); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp index 0286889de5c..d052882afde 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp @@ -232,11 +232,12 @@ utils::uvec3 pick_linear_global_wg_with_block_config( BlockConfig::outer_dim_from_packed_int(packed_block_config); const std::vector& sizes = graph->sizes_of(output); - const size_t ndim = sizes.size(); - // Compute number of blocks along inner and outer dimensions - const int64_t inner_size = sizes[ndim - 1 - inner_dim]; - const int64_t outer_size = sizes[ndim - 1 - outer_dim]; + // Use val_at with negative indices to safely access WHCN dimensions. + // val_at returns 1 for out-of-bounds indices, correctly handling tensors + // with fewer than 4 dimensions. WHCN dim d maps to val_at(-(d+1), sizes). + const int64_t inner_size = utils::val_at(-1 - inner_dim, sizes); + const int64_t outer_size = utils::val_at(-1 - outer_dim, sizes); const uint32_t num_inner_blocks = utils::safe_downcast(utils::div_up(inner_size, int64_t(4))); @@ -245,10 +246,10 @@ utils::uvec3 pick_linear_global_wg_with_block_config( // Compute number of planes (product of dimensions not in the block) uint32_t num_planes = 1; - for (size_t i = 0; i < ndim; ++i) { - const int32_t whcn_dim = ndim - 1 - i; - if (whcn_dim != inner_dim && whcn_dim != outer_dim) { - num_planes *= utils::safe_downcast(sizes[i]); + for (int32_t d = 0; d < 4; ++d) { + if (d != inner_dim && d != outer_dim) { + num_planes *= + utils::safe_downcast(utils::val_at(-1 - d, sizes)); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexTensor.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexTensor.cpp new file mode 100644 index 00000000000..b7da1b1ac40 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/IndexTensor.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace vkcompute { + +void resize_index_tensor_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const ValueRef index = args.at(1).refs.at(1); + + std::vector out_sizes = graph->sizes_of(index); + graph->virtual_resize(out, out_sizes); +} + +void add_index_tensor_node( + ComputeGraph& graph, + const ValueRef self, + const ValueRef index, + const ValueRef out) { + std::string kernel_name = "index_tensor"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = { + graph.meta_ubo(out), graph.meta_ubo(self), graph.meta_ubo(index)}; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{self, index}, vkapi::kRead}}, + // Shader params buffers + param_ubos, + // Push Constants + {}, + // Specialization Constants + {}, + // Resize Args + {}, + // Resizing Logic + resize_index_tensor_node)); +} + +void index_tensor(ComputeGraph& graph, const std::vector& args) { + ValueRef self = args[0]; + ValueRef indices_list_ref = args[1]; + ValueRef out = args[2]; + + ValueListPtr indices_list = graph.get_value_list(indices_list_ref); + VK_CHECK_COND( + indices_list->size() == 1, + "index.Tensor: only one index tensor is supported"); + + ValueRef index = indices_list->at(0); + + add_index_tensor_node(graph, self, index, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.index.Tensor, index_tensor); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.cpp new file mode 100644 index 00000000000..eb1d9965f30 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +namespace vkcompute { + +void add_prepack_int8x4_buffer_node( + ComputeGraph& graph, + const ValueRef tensor_data, + const ValueRef tensor) { + VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4); + // TODO(ssjia): Update shaders to handle high-dim tensors + VK_CHECK_COND(graph.dim_of(tensor) <= 4); + + std::string kernel_name = "nchw_to_int8x4_buffer"; + + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.buffer_meta_ubo(tensor)); + + // One thread per texel (each texel = one int32 = 4 packed int8). + // Use padded_numel to account for dimension padding in packed int8 layouts + // (e.g., kPackedInt8_4C with C=3 pads to C=4). + uint32_t num_texels = + utils::safe_downcast(graph.padded_numel_of(tensor) / 4); + utils::uvec3 global_wg_size = {num_texels, 1, 1}; + utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Input and Output + tensor_data, + tensor, + // Parameter Buffers + param_buffers, + // Specialization Constants + {graph.hashed_layout_of(tensor)})); +} + +static utils::uvec3 staging_to_int8x4_buffer_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out_tensor = args.at(0).refs.at(0); + const uint32_t num_texels = + utils::safe_downcast(graph->padded_numel_of(out_tensor) / 4); + return {num_texels, 1, 1}; +} + +void add_staging_to_int8x4_buffer_node( + ComputeGraph& graph, + const ValueRef in_staging, + const ValueRef tensor) { + VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4); + // TODO(ssjia): Update shaders to handle high-dim tensors + VK_CHECK_COND(graph.dim_of(tensor) <= 4); + + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.buffer_meta_ubo(tensor)); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR("nchw_to_int8x4_buffer"), + staging_to_int8x4_buffer_global_wg_size, + default_pick_local_wg_size, + // Input and Output + {{tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}}, + // Parameter Buffers + param_buffers, + // Push Constants + {}, + // Specialization Constants + {graph.hashed_layout_of(tensor)}, + // Resize Args + {}, + // Resizing Logic + nullptr)); +} + +static utils::uvec3 int8x4_buffer_to_staging_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef in_tensor = args.at(1).refs.at(0); + // One thread per output int32 in the NCHW staging buffer. + const int32_t numel = graph->numel_of(in_tensor); + const uint32_t num_out_int32s = + utils::safe_downcast((numel + 3) / 4); + return {num_out_int32s, 1, 1}; +} + +void add_int8x4_buffer_to_staging_node( + ComputeGraph& graph, + const ValueRef tensor, + const ValueRef staging_data) { + VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4); + // TODO(ssjia): Update shaders to handle high-dim tensors + VK_CHECK_COND(graph.dim_of(tensor) <= 4); + + vkapi::ParamsBindList param_buffers; + param_buffers.append(graph.buffer_meta_ubo(tensor)); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR("int8x4_buffer_to_nchw"), + int8x4_buffer_to_staging_global_wg_size, + default_pick_local_wg_size, + // Input and Output + {{staging_data, vkapi::kWrite}, {tensor, vkapi::kRead}}, + // Parameter Buffers + param_buffers, + // Push Constants + {}, + // Specialization Constants + {graph.hashed_layout_of(tensor)}, + // Resize Args + {}, + // Resizing Logic + nullptr)); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h similarity index 65% rename from backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h rename to backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h index 40386551e36..659ed696cd1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h @@ -12,9 +12,19 @@ namespace vkcompute { -void add_staging_to_int8x4_buffer_node( +void add_prepack_int8x4_buffer_node( ComputeGraph& graph, const ValueRef tensor_data, const ValueRef tensor); +void add_staging_to_int8x4_buffer_node( + ComputeGraph& graph, + const ValueRef in_staging, + const ValueRef tensor); + +void add_int8x4_buffer_to_staging_node( + ComputeGraph& graph, + const ValueRef tensor, + const ValueRef staging_data); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp deleted file mode 100644 index 8dc3f8156f8..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -namespace vkcompute { - -void add_staging_to_int8x4_buffer_node( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef tensor) { - VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4); - - std::string kernel_name = "nchw_to_int8x4_buffer"; - - vkapi::ParamsBindList param_buffers; - param_buffers.append(graph.buffer_meta_ubo(tensor)); - - // One thread per texel (each texel = one int32 = 4 packed int8). - // Use padded_numel to account for dimension padding in packed int8 layouts - // (e.g., kPackedInt8_4C with C=3 pads to C=4). - uint32_t num_texels = - utils::safe_downcast(graph.padded_numel_of(tensor) / 4); - utils::uvec3 global_wg_size = {num_texels, 1, 1}; - utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Input and Output - tensor_data, - tensor, - // Parameter Buffers - param_buffers, - // Specialization Constants - {graph.hashed_layout_of(tensor)})); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index adcad9f9817..c418a3681c8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include @@ -27,6 +27,10 @@ void add_staging_to_tensor_node( const ValueRef out_tensor) { VK_CHECK_COND(graph.val_is_staging(in_staging)); + if (graph.dtype_of(out_tensor) == vkapi::kInt8x4) { + return add_staging_to_int8x4_buffer_node(graph, in_staging, out_tensor); + } + vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( graph, out_tensor, @@ -104,6 +108,10 @@ void add_tensor_to_staging_node( const ValueRef out_staging) { VK_CHECK_COND(graph.val_is_staging(out_staging)); + if (graph.dtype_of(in_tensor) == vkapi::kInt8x4) { + return add_int8x4_buffer_to_staging_node(graph, in_tensor, out_staging); + } + vkapi::ShaderInfo shader = get_tensor_to_nchw_shader( graph, in_tensor, @@ -329,7 +337,7 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved( void prepack_op(ComputeGraph& graph, const std::vector& args) { if (graph.dtype_of(args[1]) == vkapi::kInt8x4) { - return add_staging_to_int8x4_buffer_node(graph, args[0], args[1]); + return add_prepack_int8x4_buffer_node(graph, args[0], args[1]); } return add_prepack_standard_node(graph, args[0], args[1]); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp index c1c482d9967..adb7fb1beca 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp @@ -21,43 +21,13 @@ void resize_where_node( const std::vector& extra_args) { (void)extra_args; const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); + const ValueRef self = args.at(1).refs.at(1); - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); + const std::vector self_sizes = graph->sizes_of(self); + graph->virtual_resize(out, self_sizes); } -void add_where_texture_node( - ComputeGraph& graph, - const ValueRef cond, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - std::string kernel_name = "where"; - - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, - // Parameter buffers - {graph.logical_limits_ubo(self)}, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out)}, - // Resize Arguments - {}, - // Resizing Logic - resize_where_node)); -} - -void add_where_buffer_node( +void add_where_node( ComputeGraph& graph, const ValueRef cond, const ValueRef self, @@ -69,11 +39,10 @@ void add_where_buffer_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); vkapi::ParamsBindList ubos = { - graph.numel_ubo(out), - graph.strides_ubo(out), - graph.strides_ubo(cond), - graph.strides_ubo(self), - graph.strides_ubo(other)}; + graph.meta_ubo(out), + graph.meta_ubo(cond), + graph.meta_ubo(self), + graph.meta_ubo(other)}; graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, @@ -87,7 +56,7 @@ void add_where_buffer_node( // Push Constants {}, // Specialization Constants - {graph.hashed_layout_of(out)}, + {}, // Resize Arguments {}, // Resizing Logic @@ -100,11 +69,7 @@ void where(ComputeGraph& graph, const std::vector& args) { const ValueRef self = args[args_i++]; const ValueRef other = args[args_i++]; const ValueRef out = args[args_i++]; - if (graph.is_buffer_storage(out)) { - add_where_buffer_node(graph, cond, self, other, out); - } else { - add_where_texture_node(graph, cond, self, other, out); - } + add_where_node(graph, cond, self, other, out); } REGISTER_OPERATORS { diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp index f5214221359..e3c3e6e2642 100644 --- a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp @@ -8,9 +8,9 @@ #include +#include #include #include -#include namespace vkcompute { @@ -62,7 +62,7 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector& args) { if (input_b_is_int8) { // Input B is a pre-quantized int8 TensorRef; prepack directly into packed // int8x4 format - add_staging_to_int8x4_buffer_node(graph, input_b, packed_int8_input_b); + add_prepack_int8x4_buffer_node(graph, input_b, packed_int8_input_b); } else { // Input B is a float tensor; quantize at runtime add_q8ta_quantize_node( diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp index 86725ca8fb8..f60b113828b 100644 --- a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp @@ -133,10 +133,10 @@ TestCase create_test_case_from_config( std::vector generate_q8ta_add_easy_cases() { std::vector test_cases; - // Single simple configuration for debugging - Q8taBinaryConfig config = { - {1, 16, 16, 16}, // shape: [N, C, H, W] - "ACCU", // test_case_name + std::vector> shapes = { + {1, 16, 16, 16}, // 4D: [N, C, H, W] + {1, 144}, // 2D: exercises block config with ndim < 4 + {1, 90}, // 2D: matches skin_seg model's keypoint/bbox tensor sizes }; // Quantized memory layouts to test @@ -148,20 +148,23 @@ std::vector generate_q8ta_add_easy_cases() { utils::kPackedInt8_4C1W, }; - for (const auto& quant_layout : quant_layouts) { - test_cases.push_back(create_test_case_from_config( - config, - /*storage_type=*/utils::kBuffer, - /*input_dtype=*/vkapi::kFloat, - /*fp_memory_layout=*/utils::kWidthPacked, - quant_layout)); - test_cases.push_back(create_test_case_from_config( - config, - /*fp_storage_type=*/utils::kBuffer, - /*input_dtype=*/vkapi::kFloat, - /*fp_layout=*/utils::kWidthPacked, - quant_layout, - /*const_b=*/true)); + for (const auto& shape : shapes) { + Q8taBinaryConfig config = {shape, "ACCU"}; + for (const auto& quant_layout : quant_layouts) { + test_cases.push_back(create_test_case_from_config( + config, + /*storage_type=*/utils::kBuffer, + /*input_dtype=*/vkapi::kFloat, + /*fp_memory_layout=*/utils::kWidthPacked, + quant_layout)); + test_cases.push_back(create_test_case_from_config( + config, + /*fp_storage_type=*/utils::kBuffer, + /*input_dtype=*/vkapi::kFloat, + /*fp_layout=*/utils::kWidthPacked, + quant_layout, + /*const_b=*/true)); + } } return test_cases; @@ -173,6 +176,20 @@ std::vector generate_q8ta_add_test_cases() { // Shapes to test std::vector> shapes = { + // 1D tensors + {144}, + {90}, + + // 3D tensors + {1, 16, 32}, + {1, 3, 64}, + + // 2D tensors (exercises block config with ndim < 4) + {1, 144}, + {1, 90}, + {1, 4}, + {3, 32}, + // Small test cases for correctness {1, 3, 16, 16}, {1, 8, 32, 32}, diff --git a/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp index e0efd6ea85d..a3ff8c42f86 100644 --- a/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp @@ -104,10 +104,10 @@ TestCase create_test_case_from_config( std::vector generate_q_dq_8bit_easy_cases() { std::vector test_cases; - // Single simple configuration for debugging - QDQ8BitConfig config = { - {1, 16, 16, 16}, // shape: [N, C, H, W] - "ACCU", // test_case_name + std::vector> shapes = { + {1, 16, 16, 16}, // 4D: [N, C, H, W] + {1, 144}, // 2D: exercises block config with ndim < 4 + {1, 90}, // 2D: matches skin_seg model's keypoint/bbox tensor sizes }; // FP memory layouts to test @@ -129,21 +129,24 @@ std::vector generate_q_dq_8bit_easy_cases() { std::vector float_types = {vkapi::kFloat}; // Generate test cases for each combination - for (const auto& fp_layout : fp_layouts) { - for (const auto& quant_layout : quant_layouts) { - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back(create_test_case_from_config( - config, storage_type, input_dtype, fp_layout, quant_layout)); - // For 4W4C layout, also test with legacy implementation - if (quant_layout == utils::kPackedInt8_4W4C) { + for (const auto& shape : shapes) { + QDQ8BitConfig config = {shape, "ACCU"}; + for (const auto& fp_layout : fp_layouts) { + for (const auto& quant_layout : quant_layouts) { + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { test_cases.push_back(create_test_case_from_config( - config, - storage_type, - input_dtype, - fp_layout, - quant_layout, - /*impl_selector=*/"legacy_4w4c")); + config, storage_type, input_dtype, fp_layout, quant_layout)); + // For 4W4C layout, also test with legacy implementation + if (quant_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + storage_type, + input_dtype, + fp_layout, + quant_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } } @@ -159,6 +162,20 @@ std::vector generate_q_dq_8bit_test_cases() { // Shapes to test (no layout specified - will be combined with all layouts) std::vector> shapes = { + // 1D tensors + {144}, + {90}, + + // 2D tensors (exercises block config with ndim < 4) + {1, 144}, + {1, 90}, + {1, 4}, + {3, 32}, + + // 3D tensors + {1, 16, 32}, + {1, 3, 64}, + // Small test cases for correctness {1, 3, 16, 16}, {1, 8, 32, 32}, diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 534462ed179..fe2e4169f05 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -2001,6 +2001,56 @@ def get_where_inputs(): return test_suite +@register_test_suite("aten.bitwise_and.Tensor") +def get_bitwise_and_inputs(): + test_suite = VkTestSuite( + [ + ((M1, M2), (M1, M2)), + ((S, S1, S2), (S, S1, S2)), + ((XS, S, S1, S2), (XS, S, S1, S2)), + ((1, M1), (1, M1)), + ] + ) + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kChannelsPacked", + ] + test_suite.storage_types = [ + "utils::kBuffer", + "utils::kTexture3D", + ] + test_suite.dtypes = ["at::kBool"] + test_suite.data_gen = "make_seq_tensor" + return test_suite + + +@register_test_suite("aten.index.Tensor") +def get_index_tensor_inputs(): + Test = namedtuple("IndexTensorTest", ["self", "indices"]) + + test_cases = [ + # 1D index tensor + Test(self=(M1,), indices=[(S,)]), + Test(self=(M1,), indices=[(M2,)]), + # 2D index tensor + Test(self=(L,), indices=[(S, S1)]), + Test(self=(L,), indices=[(M1, M2)]), + # 3D index tensor + Test(self=(M1,), indices=[(XS, S, S1)]), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kChannelsPacked", + ] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.dtypes = ["at::kFloat"] + test_suite.arg_dtype["indices"] = "at::kInt" + test_suite.arg_data_gen_fn["indices"] = "make_casted_randint_tensor" + return test_suite + + @register_test_suite("aten.pow.Tensor_Scalar") def get_pow_tensor_scalar_inputs(): test_suite = VkTestSuite( diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py index 6ad2f568e91..a78263987a1 100644 --- a/backends/vulkan/test/op_tests/utils/aten_types.py +++ b/backends/vulkan/test/op_tests/utils/aten_types.py @@ -12,6 +12,7 @@ AT_SCALAR = "at::Scalar" AT_TENSOR = "at::Tensor" AT_TENSOR_LIST = "at::TensorList" +OPT_TENSOR_LIST = "c10::List<::std::optional>" BOOL = "bool" DOUBLE = "double" INT = "int64_t" diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index 6a7dc2e5d0a..a09b4d36b18 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -26,6 +26,7 @@ OPT_LAYOUT, OPT_MEMORY_FORMAT, OPT_SCALAR_TYPE, + OPT_TENSOR_LIST, STRING, TENSOR_VECTOR, THREE_TENSOR_TUPLE, @@ -86,7 +87,7 @@ def vk_out(self): ValueRefList = Union[ValueRef, List[ValueRef]] -InableCppType = frozenset([AT_TENSOR, AT_TENSOR_LIST]) +InableCppType = frozenset([AT_TENSOR, AT_TENSOR_LIST, OPT_TENSOR_LIST]) class ComputeGraphGen: @@ -313,7 +314,7 @@ def create_value_decl_for(self, ref: ValueRefList) -> str: # noqa: C901 return ret_str cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef" - if ref.src_cpp_type == AT_TENSOR_LIST: + if ref.src_cpp_type in (AT_TENSOR_LIST, OPT_TENSOR_LIST): ret_str = f"std::vector {ref.name}_io_value_refs;\n" ret_str += f"std::vector {ref.name}_value_refs;\n" return ret_str @@ -409,6 +410,25 @@ def create_value_for( # noqa: C901 ret_str += "}\n" ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" return ret_str + elif ref.src_cpp_type == OPT_TENSOR_LIST: + assert ref.is_in, "OPT_TENSOR_LIST must be an input" + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" + ret_str += f"for (int i=0; i < (int){ref.src_cpp_name}.size(); i++) {{\n" + ret_str += ( + f" IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" + ) + ret_str += f" {ref.src_cpp_name}[i]->sizes().vec(),\n" + ret_str += ( + f" from_at_scalartype({ref.src_cpp_name}[i]->scalar_type())); \n" + ) + ret_str += f" {ref.name}_value_refs.emplace_back(io_value_ref.value);\n" + ret_str += f" {ref.name}_io_value_refs.emplace_back(io_value_ref);\n" + ret_str += "}\n" + ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" + return ret_str elif ref.src_cpp_type == TENSOR_VECTOR: ret_str = "" if include_declarations: @@ -491,7 +511,7 @@ def create_op_call(self) -> str: for aten_arg in self.args: ref = self.refs[aten_arg.name] - if ref.src_cpp_type == AT_TENSOR_LIST: + if ref.src_cpp_type in (AT_TENSOR_LIST, OPT_TENSOR_LIST): # Special case. Underlying tensors are input tensors, but the # container itself is just a normal value. op_create_code += f"{ref.name}, " @@ -553,10 +573,20 @@ def virtual_resize(self, ref: ValueRefList) -> str: ret_str += f"{ref.src_cpp_name}.sizes().vec());\n" elif ref.src_cpp_type == AT_TENSOR_LIST: ret_str = "" - ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n" + ret_str += ( + f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n" + ) ret_str += f" {self.graph}{self.dot}virtual_resize({ref.name}_io_value_refs[i].value, " ret_str += f"{ref.src_cpp_name}[i].sizes().vec());\n" ret_str += "}\n" + elif ref.src_cpp_type == OPT_TENSOR_LIST: + ret_str = "" + ret_str += ( + f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n" + ) + ret_str += f" {self.graph}{self.dot}virtual_resize({ref.name}_io_value_refs[i].value, " + ret_str += f"{ref.src_cpp_name}[i]->sizes().vec());\n" + ret_str += "}\n" else: raise AssertionError(f"{ref.src_cpp_type} not expected") @@ -577,13 +607,26 @@ def copy_into_staging(self, ref: ValueRefList) -> str: ret_str += f"from_at_scalartype({ref.src_cpp_name}.scalar_type()));\n" elif ref.src_cpp_type == AT_TENSOR_LIST: ret_str = "" - ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n" + ret_str += ( + f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n" + ) ret_str += f" {self.graph}{self.dot}maybe_cast_and_copy_into_staging(" ret_str += f"{ref.name}_io_value_refs[i].staging, " ret_str += f"{ref.src_cpp_name}[i].const_data_ptr(), " ret_str += f"{ref.src_cpp_name}[i].numel(), " ret_str += f"from_at_scalartype({ref.src_cpp_name}[i].scalar_type()));\n" ret_str += "}\n" + elif ref.src_cpp_type == OPT_TENSOR_LIST: + ret_str = "" + ret_str += ( + f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n" + ) + ret_str += f" {self.graph}{self.dot}maybe_cast_and_copy_into_staging(" + ret_str += f"{ref.name}_io_value_refs[i].staging, " + ret_str += f"{ref.src_cpp_name}[i]->const_data_ptr(), " + ret_str += f"{ref.src_cpp_name}[i]->numel(), " + ret_str += f"from_at_scalartype({ref.src_cpp_name}[i]->scalar_type()));\n" + ret_str += "}\n" else: raise AssertionError(f"{ref.src_cpp_type} not expected") return ret_str diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py index 15627726173..efd073a0cfb 100644 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -25,6 +25,7 @@ OPT_LAYOUT, OPT_MEMORY_FORMAT, OPT_SCALAR_TYPE, + OPT_TENSOR_LIST, STRING, ) from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite @@ -166,6 +167,12 @@ def create_input_data(self, arg: Argument, data: Any) -> str: # noqa: C901 ret_str += f"{cpp_type} {arg.name} = tensor_vec;\n" return ret_str + "\n" + if cpp_type == OPT_TENSOR_LIST: + ret_str = f"{OPT_TENSOR_LIST} {arg.name};\n" + for elem in data: + ret_str += f"{arg.name}.push_back({self.call_data_gen_fn(arg, elem, False)});\n" + return ret_str + "\n" + if cpp_type == AT_INT_ARRAY_REF: ret_str = f"std::vector {arg.name} = " elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF and str(data) != "None": diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 261d3f72d01..746fa2c5253 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -29,6 +29,8 @@ #include +#include + using namespace vkcompute; using namespace vkcompute::api; @@ -3490,3 +3492,85 @@ void test_dynamic_dispatch(int M, int N) { TEST(VulkanComputeGraphOpsTest, test_dynamic_dispatch_graph) { test_dynamic_dispatch(128, 128); } + +// +// Int8x4 Staging Tests +// + +void test_int8x4_staging_round_trip( + const std::vector& sizes, + const utils::GPUMemoryLayout layout) { + GraphConfig config; + ComputeGraph graph(config); + + const int32_t numel = utils::multiply_integers(sizes); + + // Build graph: + // staging_in (kInt8x4) -> [execute: nchw_to_int8x4_buffer] -> tensor + // (kInt8x4) + // -> [execute: int8x4_buffer_to_nchw] -> staging_out + ValueRef tensor = + graph.add_tensor(sizes, vkapi::kInt8x4, utils::kBuffer, layout); + + ValueRef staging_in = graph.set_input_tensor(tensor); + ValueRef staging_out = graph.set_output_tensor(tensor); + + // staging_buffer_numel_of returns padded_numel / 4 (number of int32 + // elements). Multiply by 4 to get the byte count, which is used to zero-pad + // the input. + const size_t staging_numel = graph.staging_buffer_numel_of(tensor); + // Create NCHW int8 input data zero-padded to the full staging buffer size. + std::vector data_in(staging_numel * 4, 0); + for (int32_t i = 0; i < numel; ++i) { + data_in[i] = static_cast(static_cast(i * 37 + 13)); + } + + graph.prepare(); + // prepack() allocates Vulkan memory for all tensors even when there are no + // prepack nodes; it must be called before execute(). + graph.prepack(); + + // Copy NCHW int8 data into the input staging buffer. The staging buffer has + // kInt8x4 dtype (staging_numel int32 elements), so reinterpret the int8 data + // as int32 for the copy call. + graph.maybe_cast_and_copy_into_staging( + staging_in, + reinterpret_cast(data_in.data()), + staging_numel, + vkapi::kInt8x4); + + graph.execute(); + + // Read back packed int32s from staging. The staging dtype is kInt8x4 (4 + // bytes per element = one packed int32 holding 4 int8 values). + std::vector data_out_packed(staging_numel); + graph.maybe_cast_and_copy_from_staging( + staging_out, data_out_packed.data(), staging_numel, vkapi::kInt8x4); + + // Verify each int8 element matches the round-trip + for (int32_t i = 0; i < numel; ++i) { + const uint8_t byte = static_cast( + static_cast(data_out_packed[i / 4]) >> ((i % 4) * 8)); + const int8_t actual = static_cast(byte); + EXPECT_EQ(actual, data_in[i]) + << "Mismatch at nchw index " << i << " for sizes [" << sizes[0] + << (sizes.size() > 1 ? ", " + std::to_string(sizes[1]) : "") + << (sizes.size() > 2 ? ", " + std::to_string(sizes[2]) : "") + << (sizes.size() > 3 ? ", " + std::to_string(sizes[3]) : "") + << "] layout " << layout; + } +} + +TEST(VulkanComputeGraphTest, test_int8x4_staging_round_trip) { + const std::vector layouts = { + utils::kPackedInt8_4C, + utils::kPackedInt8_4W, + utils::kPackedInt8_4W4C, + utils::kPackedInt8_4C1W, + }; + for (const auto& sizes : standard_sizes_to_test) { + for (const auto layout : layouts) { + test_int8x4_staging_round_trip(sizes, layout); + } + } +}