pytorch · meta-codesync · Mar 26, 2026 · Mar 25, 2026
@@ -45,7 +45,7 @@ void main() {
   // Compute the value for each element in the texel along the packed dim.
   VEC4_T outtex = VEC4_T(0);
   int limit = min(
-      4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     int elem_idx = out_tidx.data[0]; // W index is the linear element index
     outtex[comp] = VEC4_T(start + elem_idx * step).x;

@@ -45,7 +45,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
   for (int comp = 0; comp < 4; comp++) {
     if (comp >= limit) {
       break;

@@ -40,7 +40,7 @@ void main() {
 
   TensorIndex4D tidx =
       texture_pos_to_tensor4d_idx_simple(outp, pos, out_layout);
-  const int packed_dim_size = outp.sizes[packed_dim];
+  const int packed_dim_size = safe_idx(outp.sizes, packed_dim);
   int packed_idx = tidx.data[packed_dim];
 
   if (packed_idx + 3 >= packed_dim_size) {

@@ -53,7 +53,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < 4; comp++) {
     TensorIndex4D input_tidx = out_tidx;
     int gather_idx = idx_texel[comp];

@@ -55,7 +55,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     int idx = idx_texel[comp];
 

@@ -99,6 +99,14 @@ uint safe_idx(const uvec4 v, const int idx) {
   return v.w;
 }
 
+// Safe ivec3 component access via if/else chain (same rationale as safe_idx
+// for ivec4): idx 0 yields v.x, idx 1 yields v.y, any other idx yields v.z.
+int safe_idx(const ivec3 v, const int idx) {
+  if (idx == 0) return v.x;
+  if (idx == 1) return v.y;
+  return v.z;
+}
+
 // Safe ivec4 component write via if/else chain. Companion to safe_idx for
 // cases where we need to set a component by a spec-const-derived index.
 void safe_set(inout ivec4 v, const int idx, const int val) {

@@ -42,7 +42,7 @@ void main() {
       texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);
 
   // Bounds check on outer dimension
-  if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
+  if (tidx.data[outer_dim] >= int(safe_idx(outp.sizes[0], outer_dim))) {
     return;
   }
 
@@ -55,7 +55,7 @@ void main() {
   int packed = 0;
   [[unroll]] for (int i = 0; i < 4; ++i) {
     const int elem_inner = tidx.data[inner_dim] + i;
-    if (elem_inner < int(outp.sizes[0][inner_dim])) {
+    if (elem_inner < int(safe_idx(outp.sizes[0], inner_dim))) {
       // Build element coordinates
       ivec4 elem = tidx.data;
       elem[inner_dim] = elem_inner;

@@ -52,7 +52,7 @@ void main() {
 
   // Tail texels may have fewer than 4 valid elements; leave extras as 0.
   const int limit =
-      min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      min(4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
 
   VEC4_T out_texel = VEC4_T(0);
 

@@ -43,6 +43,7 @@ layout(constant_id = 5) const int group_dim = 1;
 shared vec4 shared_vecs[MAX_NTHREADS];
 
 #include "indexing_utils.h"
+#include "indexing.glslh"
 
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
@@ -95,7 +96,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
   scan_pos[reduce_dim] = tid.x;
   // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of
   // the reduction row
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
   }
@@ -115,11 +116,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
     // Determine if there are any padding elements in the final texel of the
     // packed dimension
-    const int nspill = mod4(tin_sizes[packed_dim]);
+    const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
     // Detect if this thread is working on the final texels of the packed
     // dimension, which may have padding elements
     const bool is_last_texel =
-        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+        scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);
 
     // Explicitly set padding elements to 0
     if (is_last_texel && nspill > 0) {
@@ -145,10 +146,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   const int smi = tid_to_smi(tid);
 
   // Number of non-padding elements in the last texel in the reduction row
-  const int nspill = mod4(tin_sizes[packed_dim]);
+  const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
   // Only reduce up to the last "complete" texel. The last texel will need to be
   // handled specially if it has padding elements.
-  const int reduce_len = tin_sizes[packed_dim] - nspill;
+  const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill;
 
   scan_pos[reduce_dim] = 0;
   vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x));
@@ -163,7 +164,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   // For the last texel in the dim, if there are padding elements then each
   // element of the texel needs to be processed individually such that the
   // padding elements are ignored
-  if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) {
+  if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) {
     const vec4 intex = load_texel(tin, scan_pos);
     for (int i = 0; i < nspill; i++) {
       accum.x = UPDATE_ACCUM(accum.x, intex[i]);

@@ -44,6 +44,7 @@ layout(constant_id = 6) const int group_dim = 2;
 shared vec4 shared_vecs[MAX_NTHREADS];
 
 #include "indexing_utils.h"
+#include "indexing.glslh"
 
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
@@ -68,12 +69,12 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) {
 
   // First dimension reduction
   scan_pos[reduce_dim1] = tid.x;
-  for (int i = tid.x; i < tin_sizes[reduce_dim1]; 
+  for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim1);
        i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) {
 
     // Second dimension reduction
     scan_pos[reduce_dim2] = 0;
-    for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) {
+    for (int j = 0; j < safe_idx(tin_sizes, reduce_dim2); j++, scan_pos[reduce_dim2]++) {
       accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
     }
   }
@@ -93,11 +94,11 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) {
 
     // Determine if there are any padding elements in the final texel of the
     // packed dimension
-    const int nspill = mod4(tin_sizes[packed_dim]);
+    const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
     // Detect if this thread is working on the final texels of the packed
     // dimension, which may have padding elements
-    const bool is_last_texel = 
-        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+    const bool is_last_texel =
+        scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);
 
     // Explicitly set padding elements to 0
     if (is_last_texel && nspill > 0) {

@@ -47,7 +47,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   const int limit = min(
-      4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      4, safe_idx(out_meta.sizes, packed_dim) - out_tidx.data[packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     TensorIndex4D in_tidx = out_tidx;
     in_tidx.data = ivec4(

@@ -69,7 +69,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) {
 
   int adjusted_index = index;
   if (index < 0) {
-    adjusted_index = index + inp.sizes[selected_dim];
+    adjusted_index = index + safe_idx(inp.sizes, selected_dim);
   }
 
   // Handle different dimensions for selection

@@ -56,7 +56,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) {
 
   int adjusted_start = start;
   if (start < 0) {
-    adjusted_start = start + inp.sizes[selected_dim];
+    adjusted_start = start + safe_idx(inp.sizes, selected_dim);
   }
 
   in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step;

@@ -57,7 +57,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
   scan_pos[reduce_dim] = tid.x;
   vec4 max_elements = texelFetch(tin, scan_pos, 0);
-  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     max_elements = max(max_elements, texelFetch(tin, scan_pos, 0));
   }
@@ -71,7 +71,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
   scan_pos[reduce_dim] = tid.x;
   vec4 denominators = vec4(0);
-  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     denominators += exp(texelFetch(tin, scan_pos, 0) - max_elements);
   }
@@ -83,12 +83,12 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
     denominators += shared_sum[group_i];
   }
 
-  const int nspill = mod_4(in_meta.sizes[packed_dim]);
+  const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim));
   const bool is_last_texel =
-      scan_pos[packed_dim] == (out_meta.limits[packed_dim] - 1);
+      scan_pos[packed_dim] == (safe_idx(out_meta.limits, packed_dim) - 1);
 
   scan_pos[reduce_dim] = tid.x;
-  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_elements);
     const vec4 safe_denom = max(denominators, vec4(1e-37));
@@ -124,16 +124,16 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   const int smi = tid_to_smi(tid);
   int group_i;
 
-  const int nspill = mod_4(in_meta.sizes[packed_dim]);
-  const int reduce_len = in_meta.sizes[packed_dim] - nspill;
+  const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim));
+  const int reduce_len = safe_idx(in_meta.sizes, packed_dim) - nspill;
 
   scan_pos[reduce_dim] = tid.x;
   vec4 max_elements = vec4(-3.402823e+38);
   for (int i = tid.x * 4; i < reduce_len;
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
     max_elements = max(max_elements, texelFetch(tin, scan_pos, 0));
   }
-  if (scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1 && nspill > 0) {
+  if (scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1 && nspill > 0) {
     const vec4 intex = texelFetch(tin, scan_pos, 0);
     for (int i = 0; i < nspill; ++i) {
       max_elements.x = max(intex[i], max_elements.x);
@@ -157,7 +157,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
     denominators += exp(texelFetch(tin, scan_pos, 0) - max_element);
   }
-  if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) {
+  if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) {
     const vec4 intex = texelFetch(tin, scan_pos, 0);
     for (int i = 0; i < nspill; ++i) {
       denominators.x += exp(intex[i] - max_element);
@@ -182,7 +182,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
     const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_element);
     imageStore(tout, scan_pos, op2(numerators, safe_denominator));
   }
-  if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) {
+  if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) {
     const vec4 numerator = op1(texelFetch(tin, scan_pos, 0) - max_element);
     vec4 outtex = op2(numerator, safe_denominator);
     [[unroll]] for (int i = nspill; i < 4; ++i) {

@@ -51,7 +51,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
 
   TensorIndex4D input_tidx = out_tidx;
   input_tidx.data[split_dim] += split_offset;

@@ -69,7 +69,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx);
 

@@ -48,6 +48,7 @@ shared VEC4_T shared_sum_sq[MAX_NTHREADS];
 shared int shared_count[MAX_NTHREADS];
 
 #include "indexing_utils.h"
+#include "indexing.glslh"
 
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
@@ -73,7 +74,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
   int count = 0;
 
   scan_pos[reduce_dim] = tid.x;
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     VEC4_T val = load_texel(tin, scan_pos);
     sum += val;
@@ -103,11 +104,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
     // Determine if there are any padding elements in the final texel of the
     // packed dimension
-    const int nspill = mod4(tin_sizes[packed_dim]);
+    const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
     // Detect if this thread is working on the final texels of the packed
     // dimension, which may have padding elements
     const bool is_last_texel =
-        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+        scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);
 
     VEC4_T variance = calculate_variance(sum, sum_sq, count);
 
@@ -136,10 +137,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   const int smi = tid_to_smi(tid);
 
   // Number of non-padding elements in the last texel in the reduction row
-  const int nspill = mod4(tin_sizes[packed_dim]);
+  const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
   // Only reduce up to the last "complete" texel. The last texel will need to be
   // handled specially if it has padding elements.
-  const int reduce_len = tin_sizes[packed_dim] - nspill;
+  const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill;
 
   VEC4_T sum = VEC4_T(0);
   VEC4_T sum_sq = VEC4_T(0);
@@ -158,7 +159,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   // For the last texel in the dim, if there are padding elements then each
   // element of the texel needs to be processed individually such that the
   // padding elements are ignored
-  if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) {
+  if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) {
     const VEC4_T val = load_texel(tin, scan_pos);
     for (int i = 0; i < nspill; i++) {
       sum.x += val[i];

@@ -96,7 +96,7 @@ void main() {
   VEC4_T outtex = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     TensorIndex4D cond_tidx;
     cond_tidx.data = min(out_tidx.data, condp.sizes - 1);

@@ -13,7 +13,6 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {