diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl
index e2a22213c05..0a5636b300f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl
@@ -45,7 +45,7 @@ void main() {
   // Compute the value for each element in the texel along the packed dim.
   VEC4_T outtex = VEC4_T(0);
   int limit = min(
-      4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     int elem_idx = out_tidx.data[0]; // W index is the linear element index
     outtex[comp] = VEC4_T(start + elem_idx * step).x;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl
index c75ccba9f2d..3dfd7213c9f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl
@@ -45,7 +45,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
   for (int comp = 0; comp < 4; comp++) {
     if (comp >= limit) {
       break;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
index dbd2336e209..0124ac2d0b9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
@@ -40,7 +40,7 @@ void main() {
 
   TensorIndex4D tidx =
       texture_pos_to_tensor4d_idx_simple(outp, pos, out_layout);
-  const int packed_dim_size = outp.sizes[packed_dim];
+  const int packed_dim_size = safe_idx(outp.sizes, packed_dim);
   int packed_idx = tidx.data[packed_dim];
 
   if (packed_idx + 3 >= packed_dim_size) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl
index 6af78867a2e..2f95365ca73 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/gather_texture.glsl
@@ -53,7 +53,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < 4; comp++) {
     TensorIndex4D input_tidx = out_tidx;
     int gather_idx = idx_texel[comp];
diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl
index 7dc25cda040..90b1c6e5a5b 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl
@@ -55,7 +55,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     int idx = idx_texel[comp];
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
index 6f01dee0fe3..ddcc6d03de2 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -99,6 +99,14 @@ uint safe_idx(const uvec4 v, const int idx) {
   return v.w;
 }
 
+// Safe ivec3 component access via if/else chain (same rationale as safe_idx
+// for ivec4): idx 0 -> v.x, idx 1 -> v.y, any other idx falls through to v.z.
+int safe_idx(const ivec3 v, const int idx) {
+  if (idx == 0) return v.x;
+  if (idx == 1) return v.y;
+  return v.z;
+}
+
 // Safe ivec4 component write via if/else chain. Companion to safe_idx for
 // cases where we need to set a component by a spec-const-derived index.
 void safe_set(inout ivec4 v, const int idx, const int val) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
index d8f7bdabe53..6b535400554 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
@@ -42,7 +42,7 @@ void main() {
       texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);
 
   // Bounds check on outer dimension
-  if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
+  if (tidx.data[outer_dim] >= int(safe_idx(outp.sizes[0], outer_dim))) {
     return;
   }
 
@@ -55,7 +55,7 @@ void main() {
   int packed = 0;
   [[unroll]] for (int i = 0; i < 4; ++i) {
     const int elem_inner = tidx.data[inner_dim] + i;
-    if (elem_inner < int(outp.sizes[0][inner_dim])) {
+    if (elem_inner < int(safe_idx(outp.sizes[0], inner_dim))) {
       // Build element coordinates
       ivec4 elem = tidx.data;
       elem[inner_dim] = elem_inner;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
index 35a44485e27..d09c5890ed7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
@@ -52,7 +52,7 @@ void main() {
 
   // Tail texels may have fewer than 4 valid elements; leave extras as 0.
   const int limit =
-      min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      min(4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
 
   VEC4_T out_texel = VEC4_T(0);
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl
index b8f8550baf3..903baabc3cf 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl
@@ -9,97 +9,97 @@
 #version 450 core
 
 ${define_required_extensions("texture3d", DTYPE)}
-${define_explicit_type_extensions(DTYPE)}
 
 #define PRECISION ${PRECISION}
 
-#define VEC4_T ${texel_type(DTYPE)}
-#define T ${buffer_scalar_type(DTYPE)}
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+#define T ${texel_load_component_type(DTYPE, "texture3d")}
 
 ${define_active_storage_type("texture3d")}
 
+#extension GL_EXT_control_flow_attributes : require
+
 layout(std430) buffer;
 
-#include "indexing_utils.h"
+#include "common.glslh"
+#include "indexing.glslh"
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
 
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "TextureMetadata", "inp")}
+
 layout(push_constant) uniform restrict Block {
-  ivec4 out_sizes;
-  ivec4 in_sizes;
-  ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
+  ivec4 permute_dims;
 };
 
-${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
-const lowp int out_packed_dim = unhash_packed_dim(out_layout);
-
-${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
-const lowp int in_packed_dim = unhash_packed_dim(in_layout);
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
+const int out_packed_dim = get_packed_dim(out_layout);
+const int in_packed_dim = get_packed_dim(in_layout);
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// Convert output tensor index to input tensor index based on permutation
+// Convert output tensor index to input tensor index based on permutation.
+// permute_dims[i] = j means output dim i comes from input dim j, so we write
+// in_tidx[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w}: literal component
+// access on the push constant and dynamic indexing of the local in_tidx (both
+// safe). in_tidx is only fully written if permute_dims permutes {0,1,2,3}.
 ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
   ivec4 in_tidx;
-
-  // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
   in_tidx[permute_dims.x] = out_tidx.x;
   in_tidx[permute_dims.y] = out_tidx.y;
   in_tidx[permute_dims.z] = out_tidx.z;
   in_tidx[permute_dims.w] = out_tidx.w;
-
   return in_tidx;
 }
 
-// Check if we can use the fast path where texels from the input tensor can be
-// copied directly into the output tensor. This occurs when the packed dimension
-// is preserved in the permutation, i.e. reading a texel from the output tensor
-// produces 4 texels along the same dimension as reading a texel from the input
-// tensor.
-bool can_use_fast_path() {
-  // Fast path is possible when the packed dimension is preserved in the permutation
-  // This means permute_dims[out_packed_dim] == in_packed_dim
-  return permute_dims[out_packed_dim] == in_packed_dim;
-}
-
 void main() {
-  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
-  ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
 
-  if (any(greaterThanEqual(out_tidx, out_sizes))) {
+  if (out_of_bounds(out_pos, outp)) {
     return;
   }
 
-  if (can_use_fast_path()) {
+  TensorIndex4D out_tidx =
+      texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);
+
+  // Check if packed dimension is preserved in the permutation. Use safe_idx
+  // to avoid dynamic indexing of push constant with spec-const-derived index.
+  const bool fast_path =
+      safe_idx(permute_dims, out_packed_dim) == in_packed_dim;
+
+  if (fast_path) {
     // Fast path: packed dimension is preserved, so we can copy texels directly
-    ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
-    ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
-    VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+    TensorIndex4D in_tidx;
+    in_tidx.data = out_tidx_to_in_tidx(out_tidx.data);
 
-    write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
-  }
-  else {
+    ivec3 in_pos = tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout);
+    VEC4_T in_texel = texelFetch(t_in, in_pos, 0);
+
+    imageStore(t_out, out_pos, in_texel);
+  } else {
     // Slow path: packed dimension is not preserved, so each element of the
-    // output texel may be "sourced" from a different texel in the input tensor.
-    // Therefore each output texel element is processed individually.
+    // output texel may come from a different texel in the input tensor.
     VEC4_T out_texel = VEC4_T(0);
 
-    for (int texel_i = 0; texel_i < 4; ++texel_i) {
-      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
-      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
-      int element_idx = in_tidx[in_packed_dim] % 4;
+    // Tail texels may have fewer than 4 valid elements; leave extras as 0.
+    const int limit = min(
+        4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
+    for (int comp = 0; comp < limit; comp++) {
+      TensorIndex4D in_tidx;
+      in_tidx.data = out_tidx_to_in_tidx(out_tidx.data);
 
-      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
-      T selected_value = T(in_texel[element_idx]);
+      TextureElementIndex in_elem =
+          tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);
 
-      out_texel[texel_i] = selected_value;
+      VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
+      out_texel[comp] = in_texel[in_elem.comp];
 
-      out_tidx[out_packed_dim]++;
+      out_tidx.data[out_packed_dim]++;
     }
 
-    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
+    imageStore(t_out, out_pos, out_texel);
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl
index 7a6263d9f55..209440cec6a 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl
@@ -43,6 +43,7 @@ layout(constant_id = 5) const int group_dim = 1;
 shared vec4 shared_vecs[MAX_NTHREADS];
 
 #include "indexing_utils.h"
+#include "indexing.glslh"
 
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
@@ -95,7 +96,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
   scan_pos[reduce_dim] = tid.x;
   // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of
   // the reduction row
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
   }
@@ -115,11 +116,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
     // Determine if there are any padding elements in the final texel of the
     // packed dimension
-    const int nspill = mod4(tin_sizes[packed_dim]);
+    const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
     // Detect if this thread is working on the final texels of the packed
     // dimension, which may have padding elements
     const bool is_last_texel =
-        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+        scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);
 
     // Explicitly set padding elements to 0
     if (is_last_texel && nspill > 0) {
@@ -145,10 +146,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   const int smi = tid_to_smi(tid);
 
   // Number of non-padding elements in the last texel in the reduction row
-  const int nspill = mod4(tin_sizes[packed_dim]);
+  const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
   // Only reduce up to the last "complete" texel. The last texel will need to be
   // handled specially if it has padding elements.
-  const int reduce_len = tin_sizes[packed_dim] - nspill;
+  const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill;
 
   scan_pos[reduce_dim] = 0;
   vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x));
@@ -163,7 +164,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   // For the last texel in the dim, if there are padding elements then each
   // element of the texel needs to be processed individually such that the
   // padding elements are ignored
-  if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) {
+  if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) {
     const vec4 intex = load_texel(tin, scan_pos);
     for (int i = 0; i < nspill; i++) {
       accum.x = UPDATE_ACCUM(accum.x, intex[i]);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl
index 98370a9bcde..bd55025f534 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl
@@ -44,6 +44,7 @@ layout(constant_id = 6) const int group_dim = 2;
 shared vec4 shared_vecs[MAX_NTHREADS];
 
 #include "indexing_utils.h"
+#include "indexing.glslh"
 
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
@@ -68,12 +69,12 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   
   // First dimension reduction
   scan_pos[reduce_dim1] = tid.x;
-  for (int i = tid.x; i < tin_sizes[reduce_dim1]; 
+  for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim1);
        i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) {
     
     // Second dimension reduction
     scan_pos[reduce_dim2] = 0;
-    for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) {
+    for (int j = 0; j < safe_idx(tin_sizes, reduce_dim2); j++, scan_pos[reduce_dim2]++) {
       accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
     }
   }
@@ -93,11 +94,11 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) {
     
     // Determine if there are any padding elements in the final texel of the
     // packed dimension
-    const int nspill = mod4(tin_sizes[packed_dim]);
+    const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
     // Detect if this thread is working on the final texels of the packed
     // dimension, which may have padding elements
-    const bool is_last_texel = 
-        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+    const bool is_last_texel =
+        scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);
     
     // Explicitly set padding elements to 0
     if (is_last_texel && nspill > 0) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl
index 02f8956ce1f..6661d747876 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl
@@ -47,7 +47,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   const int limit = min(
-      4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]);
+      4, safe_idx(out_meta.sizes, packed_dim) - out_tidx.data[packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     TensorIndex4D in_tidx = out_tidx;
     in_tidx.data = ivec4(
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
index 5390e2a4bb2..4fdb48926fa 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
@@ -69,7 +69,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) {
 
   int adjusted_index = index;
   if (index < 0) {
-    adjusted_index = index + inp.sizes[selected_dim];
+    adjusted_index = index + safe_idx(inp.sizes, selected_dim);
   }
 
   // Handle different dimensions for selection
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
index 0a815c85d66..a7fc94bb5d6 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
@@ -56,7 +56,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) {
 
   int adjusted_start = start;
   if (start < 0) {
-    adjusted_start = start + inp.sizes[selected_dim];
+    adjusted_start = start + safe_idx(inp.sizes, selected_dim);
   }
 
   in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
index bf7facae761..ce9d2477795 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
@@ -57,7 +57,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
   scan_pos[reduce_dim] = tid.x;
   vec4 max_elements = texelFetch(tin, scan_pos, 0);
-  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     max_elements = max(max_elements, texelFetch(tin, scan_pos, 0));
   }
@@ -71,7 +71,7 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
   scan_pos[reduce_dim] = tid.x;
   vec4 denominators = vec4(0);
-  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     denominators += exp(texelFetch(tin, scan_pos, 0) - max_elements);
   }
@@ -83,12 +83,12 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
     denominators += shared_sum[group_i];
   }
 
-  const int nspill = mod_4(in_meta.sizes[packed_dim]);
+  const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim));
   const bool is_last_texel =
-      scan_pos[packed_dim] == (out_meta.limits[packed_dim] - 1);
+      scan_pos[packed_dim] == (safe_idx(out_meta.limits, packed_dim) - 1);
 
   scan_pos[reduce_dim] = tid.x;
-  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(in_meta.sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_elements);
     const vec4 safe_denom = max(denominators, vec4(1e-37));
@@ -124,8 +124,8 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   const int smi = tid_to_smi(tid);
   int group_i;
 
-  const int nspill = mod_4(in_meta.sizes[packed_dim]);
-  const int reduce_len = in_meta.sizes[packed_dim] - nspill;
+  const int nspill = mod_4(safe_idx(in_meta.sizes, packed_dim));
+  const int reduce_len = safe_idx(in_meta.sizes, packed_dim) - nspill;
 
   scan_pos[reduce_dim] = tid.x;
   vec4 max_elements = vec4(-3.402823e+38);
@@ -133,7 +133,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
     max_elements = max(max_elements, texelFetch(tin, scan_pos, 0));
   }
-  if (scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1 && nspill > 0) {
+  if (scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1 && nspill > 0) {
     const vec4 intex = texelFetch(tin, scan_pos, 0);
     for (int i = 0; i < nspill; ++i) {
       max_elements.x = max(intex[i], max_elements.x);
@@ -157,7 +157,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
     denominators += exp(texelFetch(tin, scan_pos, 0) - max_element);
   }
-  if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) {
+  if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) {
     const vec4 intex = texelFetch(tin, scan_pos, 0);
     for (int i = 0; i < nspill; ++i) {
       denominators.x += exp(intex[i] - max_element);
@@ -182,7 +182,7 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
     const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_element);
     imageStore(tout, scan_pos, op2(numerators, safe_denominator));
   }
-  if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) {
+  if (nspill > 0 && scan_pos[reduce_dim] == safe_idx(out_meta.limits, reduce_dim) - 1) {
     const vec4 numerator = op1(texelFetch(tin, scan_pos, 0) - max_element);
     vec4 outtex = op2(numerator, safe_denominator);
     [[unroll]] for (int i = nspill; i < 4; ++i) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl
index 254d4de1af6..17ee0619cb0 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl
@@ -51,7 +51,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
 
   TensorIndex4D input_tidx = out_tidx;
   input_tidx.data[split_dim] += split_offset;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
index 5f2a2097e2c..2d70e645de9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
@@ -69,7 +69,7 @@ void main() {
   VEC4_T out_texel = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx);
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl
index faeac01fcd2..1ee938c58dc 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl
@@ -48,6 +48,7 @@ shared VEC4_T shared_sum_sq[MAX_NTHREADS];
 shared int shared_count[MAX_NTHREADS];
 
 #include "indexing_utils.h"
+#include "indexing.glslh"
 
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
@@ -73,7 +74,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
   int count = 0;
 
   scan_pos[reduce_dim] = tid.x;
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim);
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     VEC4_T val = load_texel(tin, scan_pos);
     sum += val;
@@ -103,11 +104,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
     // Determine if there are any padding elements in the final texel of the
     // packed dimension
-    const int nspill = mod4(tin_sizes[packed_dim]);
+    const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
     // Detect if this thread is working on the final texels of the packed
     // dimension, which may have padding elements
     const bool is_last_texel =
-        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+        scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);
 
     VEC4_T variance = calculate_variance(sum, sum_sq, count);
 
@@ -136,10 +137,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   const int smi = tid_to_smi(tid);
 
   // Number of non-padding elements in the last texel in the reduction row
-  const int nspill = mod4(tin_sizes[packed_dim]);
+  const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
   // Only reduce up to the last "complete" texel. The last texel will need to be
   // handled specially if it has padding elements.
-  const int reduce_len = tin_sizes[packed_dim] - nspill;
+  const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill;
 
   VEC4_T sum = VEC4_T(0);
   VEC4_T sum_sq = VEC4_T(0);
@@ -158,7 +159,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   // For the last texel in the dim, if there are padding elements then each
   // element of the texel needs to be processed individually such that the
   // padding elements are ignored
-  if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) {
+  if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) {
     const VEC4_T val = load_texel(tin, scan_pos);
     for (int i = 0; i < nspill; i++) {
       sum.x += val[i];
diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl
index cc673ff2001..6982d41dd3a 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl
@@ -96,7 +96,7 @@ void main() {
   VEC4_T outtex = VEC4_T(0);
 
   int limit = min(
-      4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
+      4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
   for (int comp = 0; comp < limit; comp++) {
     TensorIndex4D cond_tidx;
     cond_tidx.data = min(out_tidx.data, condp.sizes - 1);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index d7b06015b72..8081424cfb7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -11,17 +11,12 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-using utils::ivec2;
-using utils::ivec3;
 using utils::ivec4;
-using utils::uvec4;
 
 namespace {
 
@@ -32,13 +27,38 @@ void check_args(
     const ValueRef out) {
   (void)permute_dims;
   VK_CHECK_COND(check_same_packed_dim(graph, in, out));
-
-  // This implementation doesn't not requires the input tensor to have the same
-  // dim size as the argument. The code will work as long as the input tensor's
-  // dim size is shorter than the permute dim array. In this case, the code
-  // assume size of 1 at the higher dimensions.
 }
 
+// WHCN-ordered permutation, one entry per dim up to api::kTensorDimLimit.
+struct WHCNPermuteDims {
+  int32_t whcn_permute_dims[api::kTensorDimLimit];
+  // Converts an NCHW-ordered permutation (negative entries wrap around) to
+  // WHCN order; extra dims map to themselves. Rejects out-of-range input.
+  void initialize(const std::vector<int64_t>& permute_dims) {
+    const int32_t permute_ndim = static_cast<int32_t>(permute_dims.size());
+    VK_CHECK_COND(permute_ndim <= api::kTensorDimLimit);
+    for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) {
+      int64_t index_val = permute_dims.at(permute_ndim - 1 - whcn_i);
+      index_val += (index_val < 0) ? permute_ndim : 0;
+      VK_CHECK_COND(index_val >= 0 && index_val < permute_ndim);
+      whcn_permute_dims[whcn_i] =
+          static_cast<int32_t>(permute_ndim - 1 - index_val);
+    }
+    for (int32_t i = permute_ndim; i < api::kTensorDimLimit; i++) {
+      whcn_permute_dims[i] = i;
+    }
+  }
+  // Packs each entry into 4 bits; entry i occupies bits [4*i, 4*i + 3].
+  int32_t pack_into_int32() const {
+    VK_CHECK_COND(api::kTensorDimLimit <= 8);
+    int32_t packed = 0;
+    for (int32_t i = 0; i < api::kTensorDimLimit; i++) {
+      packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4);
+    }
+    return packed;
+  }
+};
+
 } // namespace
 
 void resize_permute_node(
@@ -101,15 +121,36 @@ void add_permute_node(
     const ValueRef out) {
   check_args(graph, in, permute_dims, out);
 
-  // Convert the permute dims to WHCN dimension order, which is the standard in
-  // our compute shaders. The following transformations are applied.
-  // 1. Change dimension index values from NCHW order valueto WHCN order value
-  // 2. Reverse the order of the permute array from NCHW order to WHCN order
+  std::string kernel_name = "permute";
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)};
+
+  std::vector<PushConstantDataInfo> push_constants;
+  vkapi::SpecVarList spec_vars = {
+      graph.hashed_layout_of(out), graph.hashed_layout_of(in)};
+
+  // WHCN permute dims for the texture path (ivec4, max 4D).
+  // Declared here so its lifetime extends to the DynamicDispatchNode creation
+  // where push_constants references it.
   ivec4 whcn_permute_dims{0, 1, 2, 3};
-  {
+
+  if (graph.is_buffer_storage(out)) {
+    // Buffer path: supports up to kTensorDimLimit dims via WHCNPermuteDims,
+    // packed into a spec constant int
+    WHCNPermuteDims whcn_pd;
+    whcn_pd.initialize(*graph.get_int_list(permute_dims));
+    spec_vars.append(whcn_pd.pack_into_int32());
+  } else {
+    // Texture path: compute 4D WHCN permute dims and pass as push constant
     IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
     const int32_t permute_ndim =
         utils::safe_downcast<int32_t>(permute_dims_ptr->size());
+    VK_CHECK_COND(
+        permute_ndim <= 4,
+        "Texture storage only supports permute with up to 4 dims");
 
     for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0;
          nchw_i--, whcn_i++) {
@@ -119,133 +160,23 @@ void add_permute_node(
         permute_dim_nchw += permute_ndim;
       }
       const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw;
-
       whcn_permute_dims[whcn_i] = permute_dim_whcn;
     }
-  }
 
-  std::string kernel_name = "permute";
-  kernel_name.reserve(kShaderNameReserve);
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
-  add_dtype_suffix(kernel_name, graph.dtype_of(out));
-
-  vkapi::ParamsBindList param_buffers;
-  std::vector<PushConstantDataInfo> push_constants;
-  vkapi::SpecVarList spec_vars;
-
-  const int32_t out_channels = dim_at<kChannel4D>(graph.sizes_of(out));
-  const int32_t in_channels = dim_at<kChannel4D>(graph.sizes_of(in));
-
-  const int32_t packed_dim = graph.packed_dim_of(in);
-  ivec2 channel_info = {out_channels, in_channels};
-  if (packed_dim == WHCN::kChannelsDim) {
-    channel_info[0] = utils::align_up_4(channel_info[0]);
-    channel_info[1] = utils::align_up_4(channel_info[1]);
+    push_constants.push_back(
+        PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims)));
   }
 
-  push_constants = {
-      graph.sizes_pc_of(out),
-      graph.sizes_pc_of(in),
-      PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))};
-
-  spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)};
-
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
       default_pick_global_wg_size,
       default_pick_local_wg_size,
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
-      // Parameter buffers
-      param_buffers,
-      // Push Constants
+      param_ubos,
       push_constants,
-      // Specialization Constants
       spec_vars,
-      // Resize Args
       {permute_dims},
-      // Resizing Logic
-      resize_permute_node));
-}
-
-struct WHCNPermuteDims {
-  int32_t whcn_permute_dims[api::kTensorDimLimit];
-
-  void initialize(const std::vector<int64_t>& permute_dims) {
-    const int32_t permute_ndim = permute_dims.size();
-    for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) {
-      const int32_t nchw_i = permute_ndim - 1 - whcn_i;
-      int64_t index_val = permute_dims.at(nchw_i);
-      if (index_val < 0) {
-        index_val += permute_ndim;
-      }
-      const int32_t permute_dim_whcn = permute_ndim - 1 - index_val;
-      whcn_permute_dims[whcn_i] = permute_dim_whcn;
-    }
-    for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit;
-         whcn_i++) {
-      whcn_permute_dims[whcn_i] = whcn_i;
-    }
-  }
-
-  int32_t pack_into_int32() const {
-    // If kTensorDimLimit is increased, we will need to send in an additional
-    // int.
-    VK_CHECK_COND(api::kTensorDimLimit <= 8);
-    // Packs the 8 elements in whcn_permute_dims into a single int32_t. Each
-    // element is packed into 4 bits.
-    int32_t packed = 0;
-    for (int32_t i = 0; i < api::kTensorDimLimit; i++) {
-      packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4);
-    }
-    return packed;
-  }
-};
-
-void add_permute_buffer_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    const ValueRef permute_dims,
-    const ValueRef out) {
-  check_args(graph, in, permute_dims, out);
-
-  WHCNPermuteDims whcn_permute_dims;
-  // Convert the permute dims to WHCN dimension order, which is the standard in
-  // our compute shaders. The following transformations are applied.
-  // 1. Change dimension index values from NCHW order value to WHCN order value
-  // 2. Extend the permute array to kTensorDimLimit
-  {
-    IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
-    whcn_permute_dims.initialize(*permute_dims_ptr);
-  }
-
-  std::string kernel_name = "permute";
-  kernel_name.reserve(kShaderNameReserve);
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
-  add_dtype_suffix(kernel_name, graph.dtype_of(out));
-
-  vkapi::ParamsBindList param_buffers = {
-      graph.buffer_meta_ubo(out),
-      graph.buffer_meta_ubo(in),
-  };
-
-  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
-      graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      default_pick_global_wg_size,
-      default_pick_local_wg_size,
-      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
-      // Parameter buffers
-      param_buffers,
-      // Push Constants
-      {},
-      // Specialization Constants
-      {graph.hashed_layout_of(out),
-       graph.hashed_layout_of(in),
-       whcn_permute_dims.pack_into_int32()},
-      // Resize Args
-      {permute_dims},
-      // Resizing Logic
       resize_permute_node));
 }
 
@@ -255,10 +186,7 @@ void permute(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   const ValueRef permute_dims = args.at(idx++);
   const ValueRef out = args.at(idx++);
 
-  if (graph.is_buffer_storage(args[2])) {
-    return add_permute_buffer_node(graph, in, permute_dims, out);
-  }
-  return add_permute_node(graph, in, permute_dims, out);
+  add_permute_node(graph, in, permute_dims, out);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
index 67d714d10aa..a35fb65355d 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
@@ -13,7 +13,6 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {