From 9363a9caa75bd04ea75bac6a26dfccb92fdd80ad Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 25 Mar 2026 16:30:48 -0700 Subject: [PATCH] [ET-VK] Replace dynamic UBO indexing with safe_idx across shaders The Adreno 740 GPU driver crashes (SIGSEGV in vkCreateComputePipelines) when GLSL shaders dynamically index a UBO-backed ivec4/ivec3 with a specialization-constant-derived value. This was causing the skin segmentation model to crash during pipeline creation on Samsung S23. Fix all instances across 18 shader files by replacing patterns like `meta.sizes[packed_dim]` with `safe_idx(meta.sizes, packed_dim)`, which uses an if/else chain that the driver resolves at pipeline creation time. Changes: - Add safe_idx(ivec3) overload to indexing.glslh - Fix transfer_texture.glsl, slice.glslh, select.glslh (transfer ops) - Fix nchw_to_int8x4_buffer.glsl, full_texture.glsl (staging/utility) - Fix gather, split, index_tensor, where, expand, pad, repeat, arange texture shaders (1-line fixes each) - Fix softmax.glsl, reduce.glsl, reduce2d.glsl, var_texture3d.glsl (reduction shaders with multiple fixes + added indexing.glslh include) - Remove unused ShaderNameUtils.h include from Slice.cpp Differential Revision: [D98220450](https://our.internmc.facebook.com/intern/diff/D98220450/) [ghstack-poisoned] --- .../graph/ops/glsl/arange_texture.glsl | 2 +- .../graph/ops/glsl/expand_texture.glsl | 2 +- .../runtime/graph/ops/glsl/full_texture.glsl | 2 +- .../graph/ops/glsl/gather_texture.glsl | 2 +- .../graph/ops/glsl/index_tensor_texture.glsl | 2 +- .../runtime/graph/ops/glsl/indexing.glslh | 8 ++++++++ .../graph/ops/glsl/nchw_to_int8x4_buffer.glsl | 4 ++-- .../runtime/graph/ops/glsl/pad_texture.glsl | 2 +- .../vulkan/runtime/graph/ops/glsl/reduce.glsl | 13 ++++++------ .../runtime/graph/ops/glsl/reduce2d.glsl | 11 +++++----- .../graph/ops/glsl/repeat_texture.glsl | 2 +- .../runtime/graph/ops/glsl/select.glslh | 2 +- .../vulkan/runtime/graph/ops/glsl/slice.glslh | 2 +- .../runtime/graph/ops/glsl/softmax.glsl | 20 +++++++++---------- .../runtime/graph/ops/glsl/split_texture.glsl | 2 +- .../graph/ops/glsl/transfer_texture.glsl | 2 +- .../runtime/graph/ops/glsl/var_texture3d.glsl | 13 ++++++------ .../vulkan/runtime/graph/ops/glsl/where.glsl | 2 +- .../vulkan/runtime/graph/ops/impl/Slice.cpp | 1 - 19 files changed, 52 insertions(+), 42 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl index e2a22213c05..0a5636b300f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl @@ -45,7 +45,7 @@ void main() { // Compute the value for each element in the texel along the packed dim. VEC4_T outtex = VEC4_T(0); int limit = min( - 4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < limit; comp++) { int elem_idx = out_tidx.data[0]; // W index is the linear element index outtex[comp] = VEC4_T(start + elem_idx * step).x; diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl index c75ccba9f2d..3dfd7213c9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl @@ -45,7 +45,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < 4; comp++) { if (comp >= limit) { break; diff --git a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl index dbd2336e209..0124ac2d0b9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl @@ -40,7 +40,7 @@ void main() { TensorIndex4D tidx = texture_pos_to_tensor4d_idx_simple(outp, pos, out_layout); - const int packed_dim_size = outp.sizes[packed_dim]; + const int packed_dim_size = safe_idx(outp.sizes, packed_dim); int packed_idx = tidx.data[packed_dim]; if (packed_idx + 3 >= packed_dim_size) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl index 6af78867a2e..2f95365ca73 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl @@ -53,7 +53,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < 4; comp++) { TensorIndex4D input_tidx = out_tidx; int gather_idx = idx_texel[comp]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl index 7dc25cda040..90b1c6e5a5b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl @@ -55,7 +55,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { int idx = idx_texel[comp]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 6f01dee0fe3..ddcc6d03de2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -99,6 +99,14 @@ uint safe_idx(const uvec4 v, const int idx) { return v.w; } +// Safe ivec3 component access via if/else chain. Same rationale as safe_idx +// for ivec4. +int safe_idx(const ivec3 v, const int idx) { + if (idx == 0) return v.x; + if (idx == 1) return v.y; + return v.z; +} + // Safe ivec4 component write via if/else chain. Companion to safe_idx for // cases where we need to set a component by a spec-const-derived index. void safe_set(inout ivec4 v, const int idx, const int val) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl index d8f7bdabe53..6b535400554 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl @@ -42,7 +42,7 @@ void main() { texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout); // Bounds check on outer dimension - if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) { + if (tidx.data[outer_dim] >= int(safe_idx(outp.sizes[0], outer_dim))) { return; } @@ -55,7 +55,7 @@ void main() { int packed = 0; [[unroll]] for (int i = 0; i < 4; ++i) { const int elem_inner = tidx.data[inner_dim] + i; - if (elem_inner < int(outp.sizes[0][inner_dim])) { + if (elem_inner < int(safe_idx(outp.sizes[0], inner_dim))) { // Build element coordinates ivec4 elem = tidx.data; elem[inner_dim] = elem_inner; diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl index 35a44485e27..d09c5890ed7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl @@ -52,7 +52,7 @@ void main() { // Tail texels may have fewer than 4 valid elements; leave extras as 0. const int limit = - min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]); + min(4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]); VEC4_T out_texel = VEC4_T(0); diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl index 7a6263d9f55..209440cec6a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl @@ -43,6 +43,7 @@ layout(constant_id = 5) const int group_dim = 1; shared vec4 shared_vecs[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -95,7 +96,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of // the reduction row - for (int i = tid.x; i < tin_sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); } @@ -115,11 +116,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { @@ -145,10 +146,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Only reduce up to the last "complete" texel. The last texel will need to be // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; + const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill; scan_pos[reduce_dim] = 0; vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); @@ -163,7 +164,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) { const vec4 intex = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { accum.x = UPDATE_ACCUM(accum.x, intex[i]); diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl index 98370a9bcde..bd55025f534 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl @@ -44,6 +44,7 @@ layout(constant_id = 6) const int group_dim = 2; shared vec4 shared_vecs[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -68,12 +69,12 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { // First dimension reduction scan_pos[reduce_dim1] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim1]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim1); i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) { // Second dimension reduction scan_pos[reduce_dim2] = 0; - for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) { + for (int j = 0; j < safe_idx(tin_sizes, reduce_dim2); j++, scan_pos[reduce_dim2]++) { accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); } } @@ -93,11 +94,11 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + const bool is_last_texel = + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl index 02f8956ce1f..6661d747876 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl @@ -47,7 +47,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); const int limit = min( - 4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]); + 4, safe_idx(out_meta.sizes, packed_dim) - out_tidx.data[packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D in_tidx = out_tidx; in_tidx.data = ivec4( diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 5390e2a4bb2..4fdb48926fa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -69,7 +69,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { int adjusted_index = index; if (index < 0) { - adjusted_index = index + inp.sizes[selected_dim]; + adjusted_index = index + safe_idx(inp.sizes, selected_dim); } // Handle different dimensions for selection diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 0a815c85d66..a7fc94bb5d6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -56,7 +56,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { int adjusted_start = start; if (start < 0) { - adjusted_start = start + inp.sizes[selected_dim]; + adjusted_start = start + safe_idx(inp.sizes, selected_dim); } in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step; diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl index bf7facae761..ce9d2477795 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl @@ -57,7 +57,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; vec4 max_elements = texelFetch(tin, scan_pos, 0); - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, texelFetch(tin, scan_pos, 0)); } @@ -71,7 +71,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { scan_pos[reduce_dim] = tid.x; vec4 denominators = vec4(0); - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(texelFetch(tin, scan_pos, 0) - max_elements); } @@ -83,12 +83,12 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { denominators += shared_sum[group_i]; } - const int nspill = mod_4(in_meta.sizes[packed_dim]); + const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim)); const bool is_last_texel = - scan_pos[packed_dim] == (out_meta.limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(out_meta.limits, packed_dim) - 1); scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < in_meta.sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_elements); const vec4 safe_denom = max(denominators, vec4(1e-37)); @@ -124,8 +124,8 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); int group_i; - const int nspill = mod_4(in_meta.sizes[packed_dim]); - const int reduce_len = in_meta.sizes[packed_dim] - nspill; + const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim)); + const int reduce_len = safe_idx(in_meta.sizes, packed_dim) - nspill; scan_pos[reduce_dim] = tid.x; vec4 max_elements = vec4(-3.402823e+38); @@ -133,7 +133,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, texelFetch(tin, scan_pos, 0)); } - if (scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1 && nspill > 0) { const vec4 intex = texelFetch(tin, scan_pos, 0); for (int i = 0; i < nspill; ++i) { max_elements.x = max(intex[i], max_elements.x); @@ -157,7 +157,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(texelFetch(tin, scan_pos, 0) - max_element); } - if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) { + if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) { const vec4 intex = texelFetch(tin, scan_pos, 0); for (int i = 0; i < nspill; ++i) { denominators.x += exp(intex[i] - max_element); @@ -182,7 +182,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_element); imageStore(tout, scan_pos, op2(numerators, safe_denominator)); } - if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) { + if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) { const vec4 numerator = op1(texelFetch(tin, scan_pos, 0) - max_element); vec4 outtex = op2(numerator, safe_denominator); [[unroll]] for (int i = nspill; i < 4; ++i) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl index 254d4de1af6..17ee0619cb0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl @@ -51,7 +51,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); TensorIndex4D input_tidx = out_tidx; input_tidx.data[split_dim] += split_offset; diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl index 5f2a2097e2c..2d70e645de9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -69,7 +69,7 @@ void main() { VEC4_T out_texel = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx); diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl index faeac01fcd2..1ee938c58dc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl @@ -48,6 +48,7 @@ shared VEC4_T shared_sum_sq[MAX_NTHREADS]; shared int shared_count[MAX_NTHREADS]; #include "indexing_utils.h" +#include "indexing.glslh" int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; @@ -73,7 +74,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { int count = 0; scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; + for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim); i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { VEC4_T val = load_texel(tin, scan_pos); sum += val; @@ -103,11 +104,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { // Determine if there are any padding elements in the final texel of the // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Detect if this thread is working on the final texels of the packed // dimension, which may have padding elements const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); + scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1); VEC4_T variance = calculate_variance(sum, sum_sq, count); @@ -136,10 +137,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); + const int nspill = mod4(safe_idx(tin_sizes, packed_dim)); // Only reduce up to the last "complete" texel. The last texel will need to be // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; + const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill; VEC4_T sum = VEC4_T(0); VEC4_T sum_sq = VEC4_T(0); @@ -158,7 +159,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { + if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) { const VEC4_T val = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { sum.x += val[i]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index cc673ff2001..6982d41dd3a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -96,7 +96,7 @@ void main() { VEC4_T outtex = VEC4_T(0); int limit = min( - 4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]); + 4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]); for (int comp = 0; comp < limit; comp++) { TensorIndex4D cond_tidx; cond_tidx.data = min(out_tidx.data, condp.sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 67d714d10aa..a35fb65355d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -13,7 +13,6 @@ #include #include -#include namespace vkcompute {