diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl index e2a22213c05..0a5636b300f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl @@ -45,7 +45,7 @@ void main() { // Compute the value for each element in the texel along the packed dim. VEC4_T outtex = VEC4_T(0); int limit = min( - 4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < limit; comp++) { int elem_idx = out_tidx.data[0]; // W index is the linear element index outtex[comp] = VEC4_T(start + elem_idx * step).x; diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl index c75ccba9f2d..3dfd7213c9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl @@ -45,7 +45,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < 4; comp++) { if (comp >= limit) { break; diff --git a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl index dbd2336e209..0124ac2d0b9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl @@ -40,7 +40,7 @@ void main() { TensorIndex4D tidx = texture_pos_to_tensor4d_idx_simple(outp, pos, out_layout); - const int packed_dim_size = outp.sizes[packed_dim]; + const int packed_dim_size = safe_idx(outp.sizes, packed_dim); int packed_idx = tidx.data[packed_dim]; if (packed_idx + 3 >= packed_dim_size) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl index 6af78867a2e..2f95365ca73 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl @@ -53,7 +53,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < 4; comp++) { TensorIndex4D input_tidx = out_tidx; int gather_idx = idx_texel[comp]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl index 7dc25cda040..90b1c6e5a5b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl @@ -55,7 +55,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { int idx = idx_texel[comp]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 6f01dee0fe3..ddcc6d03de2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -99,6 +99,14 @@ uint safe_idx(const uvec4 v, const int idx) { return v.w; } +// Safe ivec3 component access via if/else chain. Same rationale as safe_idx +// for ivec4. +int safe_idx(const ivec3 v, const int idx) { + if (idx == 0) return v.x; + if (idx == 1) return v.y; + return v.z; +} + // Safe ivec4 component write via if/else chain. Companion to safe_idx for // cases where we need to set a component by a spec-const-derived index. void safe_set(inout ivec4 v, const int idx, const int val) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl index d8f7bdabe53..6b535400554 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl @@ -42,7 +42,7 @@ void main() { texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout); // Bounds check on outer dimension - if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) { + if (tidx.data[outer_dim] >= int(safe_idx(outp.sizes[0], outer_dim))) { return; } @@ -55,7 +55,7 @@ void main() { int packed = 0; [[unroll]] for (int i = 0; i < 4; ++i) { const int elem_inner = tidx.data[inner_dim] + i; - if (elem_inner < int(outp.sizes[0][inner_dim])) { + if (elem_inner < int(safe_idx(outp.sizes[0], inner_dim))) { // Build element coordinates ivec4 elem = tidx.data; elem[inner_dim] = elem_inner; diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl index 35a44485e27..d09c5890ed7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl @@ -52,7 +52,7 @@ void main() { // Tail texels may have fewer than 4 valid elements; leave extras as 0. const int limit = - min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + min(4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); VEC4_T out_texel = VEC4_T(0); diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl index b8f8550baf3..903baabc3cf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl @@ -9,97 +9,97 @@ #version 450 core ${define_required_extensions("texture3d", DTYPE)} -${define_explicit_type_extensions(DTYPE)} #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} ${define_active_storage_type("texture3d")} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "common.glslh" +#include "indexing.glslh" ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j + ivec4 permute_dims; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); +${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")} +const int out_packed_dim = get_packed_dim(out_layout); +const int in_packed_dim = get_packed_dim(in_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// Convert output tensor index to input tensor index based on permutation +// Convert output tensor index to input tensor index based on permutation. +// permute_dims[i] = j means output dim i comes from input dim j. +// We write: in_tidx[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w} +// This uses literal component access on the push constant (safe) and dynamic +// indexing into the local in_tidx variable (also safe). ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { ivec4 in_tidx; - - // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] in_tidx[permute_dims.x] = out_tidx.x; in_tidx[permute_dims.y] = out_tidx.y; in_tidx[permute_dims.z] = out_tidx.z; in_tidx[permute_dims.w] = out_tidx.w; - return in_tidx; } -// Check if we can use the fast path where texels from the input tensor can be -// copied directly into the output tensor. This occurs when the packed dimension -// is preserved in the permutation, i.e. reading a texel from the output tensor -// produces 4 texels along the same dimension as reading a texel from the input -// tensor. -bool can_use_fast_path() { - // Fast path is possible when the packed dimension is preserved in the permutation - // This means permute_dims[out_packed_dim] == in_packed_dim - return permute_dims[out_packed_dim] == in_packed_dim; -} - void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_tidx, out_sizes))) { + if (out_of_bounds(out_pos, outp)) { return; } - if (can_use_fast_path()) { + TensorIndex4D out_tidx = + texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout); + + // Check if packed dimension is preserved in the permutation. Use safe_idx + // to avoid dynamic indexing of push constant with spec-const-derived index. + const bool fast_path = + safe_idx(permute_dims, out_packed_dim) == in_packed_dim; + + if (fast_path) { // Fast path: packed dimension is preserved, so we can copy texels directly - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data); + TensorIndex4D in_tidx; + in_tidx.data = in_tidx_data; - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { + ivec3 in_pos = + tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout); + VEC4_T in_texel = texelFetch(t_in, in_pos, 0); + + imageStore(t_out, out_pos, in_texel); + } else { // Slow path: packed dimension is not preserved, so each element of the - // output texel may be "sourced" from a different texel in the input tensor. - // Therefore each output texel element is processed individually. + // output texel may come from a different texel in the input tensor. VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; + for (int comp = 0; comp < 4; comp++) { + ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data); + TensorIndex4D in_tidx; + in_tidx.data = in_tidx_data; - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); + TextureElementIndex in_elem = + tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout); - out_texel[texel_i] = selected_value; + VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0); + out_texel[comp] = in_texel[in_elem.comp]; - out_tidx[out_packed_dim]++; + out_tidx.data[out_packed_dim]++; } - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + imageStore(t_out, out_pos, out_texel); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl index 7a6263d9f55..209440cec6a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl @@ -43,6 +43,7 @@ layout(constant_id = 5) const int group_dim = 1; shared vec4 shared_vecs[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -95,7 +96,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of // the reduction row - for (int i = tid.x; i < tin_sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); } @@ -115,11 +116,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { @@ -145,10 +146,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Only reduce up to the last "complete" texel. The last texel will need to be // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; + const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill; scan_pos[reduce_dim] = 0; vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); @@ -163,7 +164,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) { const vec4 intex = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { accum.x = UPDATE_ACCUM(accum.x, intex[i]); diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl index 98370a9bcde..bd55025f534 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl @@ -44,6 +44,7 @@ layout(constant_id = 6) const int group_dim = 2; shared vec4 shared_vecs[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -68,12 +69,12 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { // First dimension reduction scan_pos[reduce_dim1] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim1]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim1); i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) { // Second dimension reduction scan_pos[reduce_dim2] = 0; - for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) { + for (int j = 0; j < safe_idx(tin_sizes, reduce_dim2); j++, scan_pos[reduce_dim2]++) { accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); } } @@ -93,11 +94,11 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + const bool is_last_texel = + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl index 02f8956ce1f..6661d747876 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl @@ -47,7 +47,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); const int limit = min( - 4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(out_meta.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D in_tidx = out_tidx; in_tidx.data = ivec4( diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 5390e2a4bb2..4fdb48926fa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -69,7 +69,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { int adjusted_index = index; if (index < 0) { - adjusted_index = index + inp.sizes[selected_dim]; + adjusted_index = index + safe_idx(inp.sizes, selected_dim); } // Handle different dimensions for selection diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 0a815c85d66..a7fc94bb5d6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -56,7 +56,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { int adjusted_start = start; if (start < 0) { - adjusted_start = start + inp.sizes[selected_dim]; + adjusted_start = start + safe_idx(inp.sizes, selected_dim); } in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step; diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl index bf7facae761..ce9d2477795 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl @@ -57,7 +57,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; vec4 max_elements = texelFetch(tin, scan_pos, 0); - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, texelFetch(tin, scan_pos, 0)); } @@ -71,7 +71,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; vec4 denominators = vec4(0); - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(texelFetch(tin, scan_pos, 0) - max_elements); } @@ -83,12 +83,12 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { denominators += shared_sum[group_i]; } - const int nspill = mod_4(in_meta.sizes[packed_dim]); + const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim)); const bool is_last_texel = - scan_pos[packed_dim] == (out_meta.limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(out_meta.limits, packed_dim) - 1); scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_elements); const vec4 safe_denom = max(denominators, vec4(1e-37)); @@ -124,8 +124,8 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); int group_i; - const int nspill = mod_4(in_meta.sizes[packed_dim]); - const int reduce_len = in_meta.sizes[packed_dim] - nspill; + const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim)); + const int reduce_len = safe_idx(in_meta.sizes, packed_dim) - nspill; scan_pos[reduce_dim] = tid.x; vec4 max_elements = vec4(-3.402823e+38); @@ -133,7 +133,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, texelFetch(tin, scan_pos, 0)); } - if (scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1 && nspill > 0) { const vec4 intex = texelFetch(tin, scan_pos, 0); for (int i = 0; i < nspill; ++i) { max_elements.x = max(intex[i], max_elements.x); @@ -157,7 +157,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(texelFetch(tin, scan_pos, 0) - max_element); } - if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) { + if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) { const vec4 intex = texelFetch(tin, scan_pos, 0); for (int i = 0; i < nspill; ++i) { denominators.x += exp(intex[i] - max_element); @@ -182,7 +182,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_element); imageStore(tout, scan_pos, op2(numerators, safe_denominator)); } - if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) { + if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) { const vec4 numerator = op1(texelFetch(tin, scan_pos, 0) - max_element); vec4 outtex = op2(numerator, safe_denominator); [[unroll]] for (int i = nspill; i < 4; ++i) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl index 254d4de1af6..17ee0619cb0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl @@ -51,7 +51,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); TensorIndex4D input_tidx = out_tidx; input_tidx.data[split_dim] += split_offset; diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl index 5f2a2097e2c..2d70e645de9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -69,7 +69,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx); diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl index faeac01fcd2..1ee938c58dc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl @@ -48,6 +48,7 @@ shared VEC4_T shared_sum_sq[MAX_NTHREADS]; shared int shared_count[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -73,7 +74,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { int count = 0; scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { VEC4_T val = load_texel(tin, scan_pos); sum += val; @@ -103,11 +104,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); VEC4_T variance = calculate_variance(sum, sum_sq, count); @@ -136,10 +137,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Only reduce up to the last "complete" texel. The last texel will need to be // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; + const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill; VEC4_T sum = VEC4_T(0); VEC4_T sum_sq = VEC4_T(0); @@ -158,7 +159,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) { const VEC4_T val = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { sum.x += val[i]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index cc673ff2001..6982d41dd3a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -96,7 +96,7 @@ void main() { VEC4_T outtex = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D cond_tidx; cond_tidx.data = min(out_tidx.data, condp.sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index d7b06015b72..8081424cfb7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -11,17 +11,12 @@ #include #include -#include -#include #include #include namespace vkcompute { -using utils::ivec2; -using utils::ivec3; using utils::ivec4; -using utils::uvec4; namespace { @@ -32,13 +27,38 @@ void check_args( const ValueRef out) { (void)permute_dims; VK_CHECK_COND(check_same_packed_dim(graph, in, out)); - - // This implementation doesn't not requires the input tensor to have the same - // dim size as the argument. The code will work as long as the input tensor's - // dim size is shorter than the permute dim array. In this case, the code - // assume size of 1 at the higher dimensions. } +struct WHCNPermuteDims { + int32_t whcn_permute_dims[api::kTensorDimLimit]; + + void initialize(const std::vector& permute_dims) { + const int32_t permute_ndim = permute_dims.size(); + for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { + const int32_t nchw_i = permute_ndim - 1 - whcn_i; + int64_t index_val = permute_dims.at(nchw_i); + if (index_val < 0) { + index_val += permute_ndim; + } + const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; + whcn_permute_dims[whcn_i] = permute_dim_whcn; + } + for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; + whcn_i++) { + whcn_permute_dims[whcn_i] = whcn_i; + } + } + + int32_t pack_into_int32() const { + VK_CHECK_COND(api::kTensorDimLimit <= 8); + int32_t packed = 0; + for (int32_t i = 0; i < api::kTensorDimLimit; i++) { + packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4); + } + return packed; + } +}; + } // namespace void resize_permute_node( @@ -101,15 +121,36 @@ void add_permute_node( const ValueRef out) { check_args(graph, in, permute_dims, out); - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. Reverse the order of the permute array from NCHW order to WHCN order + std::string kernel_name = "permute"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)}; + + std::vector push_constants; + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; + + // WHCN permute dims for the texture path (ivec4, max 4D). + // Declared here so its lifetime extends to the DynamicDispatchNode creation + // where push_constants references it. ivec4 whcn_permute_dims{0, 1, 2, 3}; - { + + if (graph.is_buffer_storage(out)) { + // Buffer path: supports up to kTensorDimLimit dims via WHCNPermuteDims, + // packed into a spec constant int + WHCNPermuteDims whcn_pd; + whcn_pd.initialize(*graph.get_int_list(permute_dims)); + spec_vars.append(whcn_pd.pack_into_int32()); + } else { + // Texture path: compute 4D WHCN permute dims and pass as push constant IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); const int32_t permute_ndim = utils::safe_downcast(permute_dims_ptr->size()); + VK_CHECK_COND( + permute_ndim <= 4, + "Texture storage only supports permute with up to 4 dims"); for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; nchw_i--, whcn_i++) { @@ -119,133 +160,23 @@ void add_permute_node( permute_dim_nchw += permute_ndim; } const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; - whcn_permute_dims[whcn_i] = permute_dim_whcn; } - } - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - vkapi::SpecVarList spec_vars; - - const int32_t out_channels = dim_at(graph.sizes_of(out)); - const int32_t in_channels = dim_at(graph.sizes_of(in)); - - const int32_t packed_dim = graph.packed_dim_of(in); - ivec2 channel_info = {out_channels, in_channels}; - if (packed_dim == WHCN::kChannelsDim) { - channel_info[0] = utils::align_up_4(channel_info[0]); - channel_info[1] = utils::align_up_4(channel_info[1]); + push_constants.push_back( + PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))); } - push_constants = { - graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))}; - - spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; - graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), default_pick_global_wg_size, default_pick_local_wg_size, {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants + param_ubos, push_constants, - // Specialization Constants spec_vars, - // Resize Args {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -struct WHCNPermuteDims { - int32_t whcn_permute_dims[api::kTensorDimLimit]; - - void initialize(const std::vector& permute_dims) { - const int32_t permute_ndim = permute_dims.size(); - for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { - const int32_t nchw_i = permute_ndim - 1 - whcn_i; - int64_t index_val = permute_dims.at(nchw_i); - if (index_val < 0) { - index_val += permute_ndim; - } - const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; - whcn_i++) { - whcn_permute_dims[whcn_i] = whcn_i; - } - } - - int32_t pack_into_int32() const { - // If kTensorDimLimit is increased, we will need to send in an additional - // int. - VK_CHECK_COND(api::kTensorDimLimit <= 8); - // Packs the 8 elements in whcn_permute_dims into a single int32_t. Each - // element is packed into 4 bits. - int32_t packed = 0; - for (int32_t i = 0; i < api::kTensorDimLimit; i++) { - packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4); - } - return packed; - } -}; - -void add_permute_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - WHCNPermuteDims whcn_permute_dims; - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order value to WHCN order value - // 2. Extend the permute array to kTensorDimLimit - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - whcn_permute_dims.initialize(*permute_dims_ptr); - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - whcn_permute_dims.pack_into_int32()}, - // Resize Args - {permute_dims}, - // Resizing Logic resize_permute_node)); } @@ -255,10 +186,7 @@ void permute(ComputeGraph& graph, const std::vector& args) { const ValueRef permute_dims = args.at(idx++); const ValueRef out = args.at(idx++); - if (graph.is_buffer_storage(args[2])) { - return add_permute_buffer_node(graph, in, permute_dims, out); - } - return add_permute_node(graph, in, permute_dims, out); + add_permute_node(graph, in, permute_dims, out); } REGISTER_OPERATORS { diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 67d714d10aa..a35fb65355d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -13,7 +13,6 @@ #include #include -#include namespace vkcompute {