From 318f8d010b8c2bf0f475b8df3e729d3ce737ae41 Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 25 Mar 2026 16:30:47 -0700 Subject: [PATCH 1/2] [ET-VK] Modernize permute op with safe indexing and unified dispatch Modernize the permute operator to follow current best practices, fixing an Adreno 740 driver crash caused by dynamic UBO indexing in the texture shader. Texture shader changes: - Replace old indexing_utils.h with indexing.glslh - Use TextureMetadata UBOs instead of push constant sizes - Use texture_pos_to_tensor4d_idx_simple() and related helpers - Replace permute_dims[out_packed_dim] with safe_idx() to avoid dynamic indexing of push constant with spec-const-derived index - Use TextureElementIndex pattern for the slow path C++ dispatch changes: - Merge add_permute_node() and add_permute_buffer_node() into a single unified function using graph.meta_ubo() and conditional logic - Remove unused channel_info computation - Move WHCNPermuteDims struct into anonymous namespace - Guard texture path with VK_CHECK_COND(permute_ndim <= 4) Differential Revision: [D98220451](https://our.internmc.facebook.com/intern/diff/D98220451/) ghstack-source-id: 357844381 Pull Request resolved: https://github.com/pytorch/executorch/pull/18511 --- .../graph/ops/glsl/permute_texture.glsl | 100 ++++----- .../vulkan/runtime/graph/ops/impl/Permute.cpp | 192 ++++++------------ 2 files changed, 110 insertions(+), 182 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl index b8f8550baf3..903baabc3cf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl @@ -9,97 +9,97 @@ #version 450 core ${define_required_extensions("texture3d", DTYPE)} -${define_explicit_type_extensions(DTYPE)} #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} ${define_active_storage_type("texture3d")} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "common.glslh" +#include "indexing.glslh" ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j + ivec4 permute_dims; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); +${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")} +const int out_packed_dim = get_packed_dim(out_layout); +const int in_packed_dim = get_packed_dim(in_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// Convert output tensor index to input tensor index based on permutation +// Convert output tensor index to input tensor index based on permutation. +// permute_dims[i] = j means output dim i comes from input dim j. +// We write: in_tidx[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w} +// This uses literal component access on the push constant (safe) and dynamic +// indexing into the local in_tidx variable (also safe). ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { ivec4 in_tidx; - - // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] in_tidx[permute_dims.x] = out_tidx.x; in_tidx[permute_dims.y] = out_tidx.y; in_tidx[permute_dims.z] = out_tidx.z; in_tidx[permute_dims.w] = out_tidx.w; - return in_tidx; } -// Check if we can use the fast path where texels from the input tensor can be -// copied directly into the output tensor. This occurs when the packed dimension -// is preserved in the permutation, i.e. reading a texel from the output tensor -// produces 4 texels along the same dimension as reading a texel from the input -// tensor. -bool can_use_fast_path() { - // Fast path is possible when the packed dimension is preserved in the permutation - // This means permute_dims[out_packed_dim] == in_packed_dim - return permute_dims[out_packed_dim] == in_packed_dim; -} - void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_tidx, out_sizes))) { + if (out_of_bounds(out_pos, outp)) { return; } - if (can_use_fast_path()) { + TensorIndex4D out_tidx = + texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout); + + // Check if packed dimension is preserved in the permutation. Use safe_idx + // to avoid dynamic indexing of push constant with spec-const-derived index. + const bool fast_path = + safe_idx(permute_dims, out_packed_dim) == in_packed_dim; + + if (fast_path) { // Fast path: packed dimension is preserved, so we can copy texels directly - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data); + TensorIndex4D in_tidx; + in_tidx.data = in_tidx_data; - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { + ivec3 in_pos = + tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout); + VEC4_T in_texel = texelFetch(t_in, in_pos, 0); + + imageStore(t_out, out_pos, in_texel); + } else { // Slow path: packed dimension is not preserved, so each element of the - // output texel may be "sourced" from a different texel in the input tensor. - // Therefore each output texel element is processed individually. + // output texel may come from a different texel in the input tensor. VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; + for (int comp = 0; comp < 4; comp++) { + ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data); + TensorIndex4D in_tidx; + in_tidx.data = in_tidx_data; - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); + TextureElementIndex in_elem = + tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout); - out_texel[texel_i] = selected_value; + VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0); + out_texel[comp] = in_texel[in_elem.comp]; - out_tidx[out_packed_dim]++; + out_tidx.data[out_packed_dim]++; } - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + imageStore(t_out, out_pos, out_texel); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index d7b06015b72..8081424cfb7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -11,17 +11,12 @@ #include #include -#include -#include #include #include namespace vkcompute { -using utils::ivec2; -using utils::ivec3; using utils::ivec4; -using utils::uvec4; namespace { @@ -32,13 +27,38 @@ void check_args( const ValueRef out) { (void)permute_dims; VK_CHECK_COND(check_same_packed_dim(graph, in, out)); - - // This implementation doesn't not requires the input tensor to have the same - // dim size as the argument. The code will work as long as the input tensor's - // dim size is shorter than the permute dim array. In this case, the code - // assume size of 1 at the higher dimensions. } +struct WHCNPermuteDims { + int32_t whcn_permute_dims[api::kTensorDimLimit]; + + void initialize(const std::vector& permute_dims) { + const int32_t permute_ndim = permute_dims.size(); + for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { + const int32_t nchw_i = permute_ndim - 1 - whcn_i; + int64_t index_val = permute_dims.at(nchw_i); + if (index_val < 0) { + index_val += permute_ndim; + } + const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; + whcn_permute_dims[whcn_i] = permute_dim_whcn; + } + for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; + whcn_i++) { + whcn_permute_dims[whcn_i] = whcn_i; + } + } + + int32_t pack_into_int32() const { + VK_CHECK_COND(api::kTensorDimLimit <= 8); + int32_t packed = 0; + for (int32_t i = 0; i < api::kTensorDimLimit; i++) { + packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4); + } + return packed; + } +}; + } // namespace void resize_permute_node( @@ -101,15 +121,36 @@ void add_permute_node( const ValueRef out) { check_args(graph, in, permute_dims, out); - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. Reverse the order of the permute array from NCHW order to WHCN order + std::string kernel_name = "permute"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)}; + + std::vector push_constants; + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; + + // WHCN permute dims for the texture path (ivec4, max 4D). + // Declared here so its lifetime extends to the DynamicDispatchNode creation + // where push_constants references it. ivec4 whcn_permute_dims{0, 1, 2, 3}; - { + + if (graph.is_buffer_storage(out)) { + // Buffer path: supports up to kTensorDimLimit dims via WHCNPermuteDims, + // packed into a spec constant int + WHCNPermuteDims whcn_pd; + whcn_pd.initialize(*graph.get_int_list(permute_dims)); + spec_vars.append(whcn_pd.pack_into_int32()); + } else { + // Texture path: compute 4D WHCN permute dims and pass as push constant IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); const int32_t permute_ndim = utils::safe_downcast(permute_dims_ptr->size()); + VK_CHECK_COND( + permute_ndim <= 4, + "Texture storage only supports permute with up to 4 dims"); for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; nchw_i--, whcn_i++) { @@ -119,133 +160,23 @@ void add_permute_node( permute_dim_nchw += permute_ndim; } const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; - whcn_permute_dims[whcn_i] = permute_dim_whcn; } - } - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - vkapi::SpecVarList spec_vars; - - const int32_t out_channels = dim_at(graph.sizes_of(out)); - const int32_t in_channels = dim_at(graph.sizes_of(in)); - - const int32_t packed_dim = graph.packed_dim_of(in); - ivec2 channel_info = {out_channels, in_channels}; - if (packed_dim == WHCN::kChannelsDim) { - channel_info[0] = utils::align_up_4(channel_info[0]); - channel_info[1] = utils::align_up_4(channel_info[1]); + push_constants.push_back( + PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))); } - push_constants = { - graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))}; - - spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; - graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), default_pick_global_wg_size, default_pick_local_wg_size, {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants + param_ubos, push_constants, - // Specialization Constants spec_vars, - // Resize Args {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -struct WHCNPermuteDims { - int32_t whcn_permute_dims[api::kTensorDimLimit]; - - void initialize(const std::vector& permute_dims) { - const int32_t permute_ndim = permute_dims.size(); - for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { - const int32_t nchw_i = permute_ndim - 1 - whcn_i; - int64_t index_val = permute_dims.at(nchw_i); - if (index_val < 0) { - index_val += permute_ndim; - } - const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; - whcn_i++) { - whcn_permute_dims[whcn_i] = whcn_i; - } - } - - int32_t pack_into_int32() const { - // If kTensorDimLimit is increased, we will need to send in an additional - // int. - VK_CHECK_COND(api::kTensorDimLimit <= 8); - // Packs the 8 elements in whcn_permute_dims into a single int32_t. Each - // element is packed into 4 bits. - int32_t packed = 0; - for (int32_t i = 0; i < api::kTensorDimLimit; i++) { - packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4); - } - return packed; - } -}; - -void add_permute_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - WHCNPermuteDims whcn_permute_dims; - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order value to WHCN order value - // 2. Extend the permute array to kTensorDimLimit - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - whcn_permute_dims.initialize(*permute_dims_ptr); - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - whcn_permute_dims.pack_into_int32()}, - // Resize Args - {permute_dims}, - // Resizing Logic resize_permute_node)); } @@ -255,10 +186,7 @@ void permute(ComputeGraph& graph, const std::vector& args) { const ValueRef permute_dims = args.at(idx++); const ValueRef out = args.at(idx++); - if (graph.is_buffer_storage(args[2])) { - return add_permute_buffer_node(graph, in, permute_dims, out); - } - return add_permute_node(graph, in, permute_dims, out); + add_permute_node(graph, in, permute_dims, out); } REGISTER_OPERATORS { From de83a9fbfacdae6b9542a5e1c98fb48c32f3c5b4 Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 25 Mar 2026 16:30:51 -0700 Subject: [PATCH 2/2] [ET-VK] Replace dynamic UBO indexing with safe_idx across shaders The Adreno 740 GPU driver crashes (SIGSEGV in vkCreateComputePipelines) when GLSL shaders dynamically index a UBO-backed ivec4/ivec3 with a specialization-constant-derived value. This was causing the skin segmentation model to crash during pipeline creation on Samsung S23. Fix all instances across 18 shader files by replacing patterns like `meta.sizes[packed_dim]` with `safe_idx(meta.sizes, packed_dim)`, which uses an if/else chain that the driver resolves at pipeline creation time. Changes: - Add safe_idx(ivec3) overload to indexing.glslh - Fix transfer_texture.glsl, slice.glslh, select.glslh (transfer ops) - Fix nchw_to_int8x4_buffer.glsl, full_texture.glsl (staging/utility) - Fix gather, split, index_tensor, where, expand, pad, repeat, arange texture shaders (1-line fixes each) - Fix softmax.glsl, reduce.glsl, reduce2d.glsl, var_texture3d.glsl (reduction shaders with multiple fixes + added indexing.glslh include) - Remove unused ShaderNameUtils.h include from Slice.cpp Differential Revision: [D98220450](https://our.internmc.facebook.com/intern/diff/D98220450/) ghstack-source-id: 357844383 Pull Request resolved: https://github.com/pytorch/executorch/pull/18512 --- .../graph/ops/glsl/arange_texture.glsl | 2 +- .../graph/ops/glsl/expand_texture.glsl | 2 +- .../runtime/graph/ops/glsl/full_texture.glsl | 2 +- .../graph/ops/glsl/gather_texture.glsl | 2 +- .../graph/ops/glsl/index_tensor_texture.glsl | 2 +- .../runtime/graph/ops/glsl/indexing.glslh | 8 ++++++++ .../graph/ops/glsl/nchw_to_int8x4_buffer.glsl | 4 ++-- .../runtime/graph/ops/glsl/pad_texture.glsl | 2 +- .../vulkan/runtime/graph/ops/glsl/reduce.glsl | 13 ++++++------ .../runtime/graph/ops/glsl/reduce2d.glsl | 11 +++++----- .../graph/ops/glsl/repeat_texture.glsl | 2 +- .../runtime/graph/ops/glsl/select.glslh | 2 +- .../vulkan/runtime/graph/ops/glsl/slice.glslh | 2 +- .../runtime/graph/ops/glsl/softmax.glsl | 20 +++++++++---------- .../runtime/graph/ops/glsl/split_texture.glsl | 2 +- .../graph/ops/glsl/transfer_texture.glsl | 2 +- .../runtime/graph/ops/glsl/var_texture3d.glsl | 13 ++++++------ .../vulkan/runtime/graph/ops/glsl/where.glsl | 2 +- .../vulkan/runtime/graph/ops/impl/Slice.cpp | 1 - 19 files changed, 52 insertions(+), 42 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl index e2a22213c05..0a5636b300f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl @@ -45,7 +45,7 @@ void main() { // Compute the value for each element in the texel along the packed dim. VEC4_T outtex = VEC4_T(0); int limit = min( - 4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < limit; comp++) { int elem_idx = out_tidx.data[0]; // W index is the linear element index outtex[comp] = VEC4_T(start + elem_idx * step).x; diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl index c75ccba9f2d..3dfd7213c9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl @@ -45,7 +45,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < 4; comp++) { if (comp >= limit) { break; diff --git a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl index dbd2336e209..0124ac2d0b9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl @@ -40,7 +40,7 @@ void main() { TensorIndex4D tidx = texture_pos_to_tensor4d_idx_simple(outp, pos, out_layout); - const int packed_dim_size = outp.sizes[packed_dim]; + const int packed_dim_size = safe_idx(outp.sizes, packed_dim); int packed_idx = tidx.data[packed_dim]; if (packed_idx + 3 >= packed_dim_size) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl index 6af78867a2e..2f95365ca73 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl @@ -53,7 +53,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < 4; comp++) { TensorIndex4D input_tidx = out_tidx; int gather_idx = idx_texel[comp]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl index 7dc25cda040..90b1c6e5a5b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl @@ -55,7 +55,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { int idx = idx_texel[comp]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 6f01dee0fe3..ddcc6d03de2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -99,6 +99,14 @@ uint safe_idx(const uvec4 v, const int idx) { return v.w; } +// Safe ivec3 component access via if/else chain. Same rationale as safe_idx +// for ivec4. +int safe_idx(const ivec3 v, const int idx) { + if (idx == 0) return v.x; + if (idx == 1) return v.y; + return v.z; +} + // Safe ivec4 component write via if/else chain. Companion to safe_idx for // cases where we need to set a component by a spec-const-derived index. void safe_set(inout ivec4 v, const int idx, const int val) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl index d8f7bdabe53..6b535400554 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl @@ -42,7 +42,7 @@ void main() { texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout); // Bounds check on outer dimension - if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) { + if (tidx.data[outer_dim] >= int(safe_idx(outp.sizes[0], outer_dim))) { return; } @@ -55,7 +55,7 @@ void main() { int packed = 0; [[unroll]] for (int i = 0; i < 4; ++i) { const int elem_inner = tidx.data[inner_dim] + i; - if (elem_inner < int(outp.sizes[0][inner_dim])) { + if (elem_inner < int(safe_idx(outp.sizes[0], inner_dim))) { // Build element coordinates ivec4 elem = tidx.data; elem[inner_dim] = elem_inner; diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl index 35a44485e27..d09c5890ed7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl @@ -52,7 +52,7 @@ void main() { // Tail texels may have fewer than 4 valid elements; leave extras as 0. const int limit = - min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + min(4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); VEC4_T out_texel = VEC4_T(0); diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl index 7a6263d9f55..209440cec6a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl @@ -43,6 +43,7 @@ layout(constant_id = 5) const int group_dim = 1; shared vec4 shared_vecs[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -95,7 +96,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of // the reduction row - for (int i = tid.x; i < tin_sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); } @@ -115,11 +116,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { @@ -145,10 +146,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Only reduce up to the last "complete" texel. The last texel will need to be // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; + const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill; scan_pos[reduce_dim] = 0; vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); @@ -163,7 +164,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) { const vec4 intex = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { accum.x = UPDATE_ACCUM(accum.x, intex[i]); diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl index 98370a9bcde..bd55025f534 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl @@ -44,6 +44,7 @@ layout(constant_id = 6) const int group_dim = 2; shared vec4 shared_vecs[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -68,12 +69,12 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { // First dimension reduction scan_pos[reduce_dim1] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim1]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim1); i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) { // Second dimension reduction scan_pos[reduce_dim2] = 0; - for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) { + for (int j = 0; j < safe_idx(tin_sizes, reduce_dim2); j++, scan_pos[reduce_dim2]++) { accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); } } @@ -93,11 +94,11 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + const bool is_last_texel = + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl index 02f8956ce1f..6661d747876 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl @@ -47,7 +47,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); const int limit = min( - 4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(out_meta.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D in_tidx = out_tidx; in_tidx.data = ivec4( diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 5390e2a4bb2..4fdb48926fa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -69,7 +69,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { int adjusted_index = index; if (index < 0) { - adjusted_index = index + inp.sizes[selected_dim]; + adjusted_index = index + safe_idx(inp.sizes, selected_dim); } // Handle different dimensions for selection diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 0a815c85d66..a7fc94bb5d6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -56,7 +56,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { int adjusted_start = start; if (start < 0) { - adjusted_start = start + inp.sizes[selected_dim]; + adjusted_start = start + safe_idx(inp.sizes, selected_dim); } in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step; diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl index bf7facae761..ce9d2477795 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl @@ -57,7 +57,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; vec4 max_elements = texelFetch(tin, scan_pos, 0); - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, texelFetch(tin, scan_pos, 0)); } @@ -71,7 +71,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; vec4 denominators = vec4(0); - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(texelFetch(tin, scan_pos, 0) - max_elements); } @@ -83,12 +83,12 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { denominators += shared_sum[group_i]; } - const int nspill = mod_4(in_meta.sizes[packed_dim]); + const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim)); const bool is_last_texel = - scan_pos[packed_dim] == (out_meta.limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(out_meta.limits, packed_dim) - 1); scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_elements); const vec4 safe_denom = max(denominators, vec4(1e-37)); @@ -124,8 +124,8 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); int group_i; - const int nspill = mod_4(in_meta.sizes[packed_dim]); - const int reduce_len = in_meta.sizes[packed_dim] - nspill; + const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim)); + const int reduce_len = safe_idx(in_meta.sizes, packed_dim) - nspill; scan_pos[reduce_dim] = tid.x; vec4 max_elements = vec4(-3.402823e+38); @@ -133,7 +133,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, texelFetch(tin, scan_pos, 0)); } - if (scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1 && nspill > 0) { const vec4 intex = texelFetch(tin, scan_pos, 0); for (int i = 0; i < nspill; ++i) { max_elements.x = max(intex[i], max_elements.x); @@ -157,7 +157,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(texelFetch(tin, scan_pos, 0) - max_element); } - if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) { + if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) { const vec4 intex = texelFetch(tin, scan_pos, 0); for (int i = 0; i < nspill; ++i) { denominators.x += exp(intex[i] - max_element); @@ -182,7 +182,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_element); imageStore(tout, scan_pos, op2(numerators, safe_denominator)); } - if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) { + if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) { const vec4 numerator = op1(texelFetch(tin, scan_pos, 0) - max_element); vec4 outtex = op2(numerator, safe_denominator); [[unroll]] for (int i = nspill; i < 4; ++i) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl index 254d4de1af6..17ee0619cb0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl @@ -51,7 +51,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); TensorIndex4D input_tidx = out_tidx; input_tidx.data[split_dim] += split_offset; diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl index 5f2a2097e2c..2d70e645de9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -69,7 +69,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx); diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl index faeac01fcd2..1ee938c58dc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl @@ -48,6 +48,7 @@ shared VEC4_T shared_sum_sq[MAX_NTHREADS]; shared int shared_count[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -73,7 +74,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { int count = 0; scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { VEC4_T val = load_texel(tin, scan_pos); sum += val; @@ -103,11 +104,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); VEC4_T variance = calculate_variance(sum, sum_sq, count); @@ -136,10 +137,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Only reduce up to the last "complete" texel. The last texel will need to be // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; + const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill; VEC4_T sum = VEC4_T(0); VEC4_T sum_sq = VEC4_T(0); @@ -158,7 +159,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) { const VEC4_T val = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { sum.x += val[i]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index cc673ff2001..6982d41dd3a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -96,7 +96,7 @@ void main() { VEC4_T outtex = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D cond_tidx; cond_tidx.data = min(out_tidx.data, condp.sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 67d714d10aa..a35fb65355d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -13,7 +13,6 @@ #include #include -#include namespace vkcompute {