diff --git a/backends/vulkan/_passes/remove_redundant_ops.py b/backends/vulkan/_passes/remove_redundant_ops.py
index 25bdd34de70..b95733021fc 100644
--- a/backends/vulkan/_passes/remove_redundant_ops.py
+++ b/backends/vulkan/_passes/remove_redundant_ops.py
@@ -32,6 +32,13 @@ class RemoveRedundantOpsTransform(ExportPass):
         exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
         exir_ops.edge.dim_order_ops._clone_dim_order.default,
         exir_ops.edge.aten.expand_copy.default,
+        # copy.default(self, src): no-op when src dtype/shape matches self.
+        exir_ops.edge.aten.copy.default,
+    }
+
+    # For these ops the meaningful input is args[1] (src), not args[0] (self).
+    _src_arg1_ops: Set[OpType] = {
+        exir_ops.edge.aten.copy.default,
     }
 
     def __init__(self) -> None:
@@ -41,7 +48,8 @@ def _should_remove(self, node: torch.fx.Node) -> bool:
         if node.target not in self.redundant_ops:
             return False
 
-        orig_node = node.args[0]
+        src_arg_idx = 1 if node.target in self._src_arg1_ops else 0
+        orig_node = node.args[src_arg_idx]
         assert isinstance(orig_node, torch.fx.Node)
 
         src_dtype = orig_node.meta["val"].dtype
@@ -61,7 +69,8 @@ def _remove(self, graph_module: torch.fx.GraphModule) -> None:
             if not self._should_remove(node):
                 continue
 
-            node.replace_all_uses_with(node.args[0])
+            src_arg_idx = 1 if node.target in self._src_arg1_ops else 0
+            node.replace_all_uses_with(node.args[src_arg_idx])
 
         graph_module.graph.eliminate_dead_code()
 
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 855df9d2e74..b18bf3b81c6 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -167,6 +167,9 @@ def update_features_impl(op: OpKey):
         # Guard and assert ops
         torch.ops.aten._assert_scalar.default,
         torch.ops.aten.sym_constrain_range_for_size.default,
+        # copy.default is a no-op when src dtype/shape matches dst; such nodes are
+        # removed by RemoveRedundantOpsTransform before execution.
+        exir_ops.edge.aten.copy.default,
     ]
 )
 def register_ephemeral_ops():
@@ -231,6 +234,19 @@ def register_clamp():
         exir_ops.edge.aten.div.Tensor,
         exir_ops.edge.aten.div.Tensor_mode,
         exir_ops.edge.aten.pow.Tensor_Tensor,
+    ]
+)
+def register_binaryop_cpp_ops():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.FP_INT_T,
+        supports_resize=True,
+        supports_highdim=True,
+    )
+
+
+@update_features(
+    [
         exir_ops.edge.aten.eq.Tensor,
         exir_ops.edge.aten.lt.Tensor,
         exir_ops.edge.aten.le.Tensor,
@@ -238,10 +254,26 @@ def register_clamp():
         exir_ops.edge.aten.ge.Tensor,
     ]
 )
-def register_binaryop_cpp_ops():
+def register_comparison_ops():
     return OpFeatures(
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_T,
+        outputs_dtypes=utils.BOOL_T,
+        supports_resize=True,
+        supports_highdim=True,
+    )
+
+
+# =============================================================================
+# BinaryOp.cpp (bitwise)
+# =============================================================================
+
+
+@update_features(exir_ops.edge.aten.bitwise_and.Tensor)
+def register_bitwise_and():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.BOOL_T,
         supports_resize=True,
         supports_highdim=True,
     )
@@ -673,6 +705,7 @@ def register_argreduce_cpp_ops():
     return OpFeatures(
         inputs_storage=utils.ANY_TEXTURE,
         inputs_dtypes=utils.FP_T,
+        outputs_dtypes=utils.INT_T,
         supports_resize=True,
         supports_highdim=True,
         are_node_inputs_supported_fn=is_reduce_node_supported,
@@ -1157,6 +1190,58 @@ def register_index_select():
     )
 
 
+# =============================================================================
+# Where.cpp
+# =============================================================================
+
+
+@update_features(exir_ops.edge.aten.where.self)
+def register_where():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=[utils.BOOL_T, utils.FP_T, utils.FP_T],
+        outputs_dtypes=utils.FP_T,
+        supports_resize=True,
+    )
+
+
+# =============================================================================
+# IndexTensor.cpp
+# =============================================================================
+
+
+@update_features(exir_ops.edge.aten.index.Tensor)
+def register_index_tensor():
+    def check_index_tensor_node(node: torch.fx.Node) -> bool:
+        self_arg = node.args[0]
+        indices = node.args[1]
+
+        # Only support 1D self tensor
+        if not isinstance(self_arg, torch.fx.Node):
+            return False
+        self_val = self_arg.meta.get("val", None)
+        if self_val is None:
+            return False
+        if len(self_val.size()) != 1:
+            return False
+
+        # Only support exactly one non-None index tensor
+        if not isinstance(indices, (list, tuple)):
+            return False
+        non_none = [idx for idx in indices if idx is not None]
+        if len(non_none) != 1:
+            return False
+
+        return True
+
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.FP_INT_T,
+        supports_resize=True,
+        are_node_inputs_supported_fn=check_index_tensor_node,
+    )
+
+
 # =============================================================================
 # Arange.cpp
 # =============================================================================
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml
index ee96b5c05b4..c3d5cd00204 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml
@@ -116,3 +116,11 @@ binary_op:
           - VALUE: half
           - VALUE: float
           - VALUE: int32
+    - NAME: binary_bitwise_and
+      OPERATOR: X & Y
+      generate_variant_forall:
+        STORAGE:
+          - VALUE: buffer
+          - VALUE: texture3d
+        DTYPE:
+          - VALUE: uint8
diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.glsl
new file mode 100644
index 00000000000..3469bb22fcc
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.glsl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_self", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_index", "int", "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+${layout_declare_ubo(B, "BufferMetadata", "index")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Implements aten.index.Tensor for the case where self is 1D and there is
+// exactly one index tensor. Each output element is:
+//   output[...] = self[index[...]]
+
+void main() {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_of_bounds(out_bufi, outp)) {
+    return;
+  }
+
+  // Convert output buffer index to tensor index
+  TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi);
+
+  // Read the index at the output's tensor position (out takes index's shape)
+  const uint index_bufi = tensor_idx_to_linear_idx(index, out_tidx);
+  const int idx = t_index[index_bufi];
+
+  // Construct a tensor index for the 1D self tensor.
+  // In WHCN ordering, a 1D tensor has its elements along dim 0 (width).
+  TensorIndex self_tidx;
+  self_tidx.data[0] = uvec4(uint(idx), 0, 0, 0);
+  self_tidx.data[1] = uvec4(0);
+  const uint self_bufi = tensor_idx_to_linear_idx(inp, self_tidx);
+
+  t_out[out_bufi] = t_self[self_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.yaml
new file mode 100644
index 00000000000..ef79704203f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_buffer.yaml
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+index_tensor_buffer:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: index_tensor_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl
new file mode 100644
index 00000000000..8f8026c0a0c
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.glsl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "common.glslh"
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_self", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_index", "int", "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "TextureMetadata", "inp")}
+${layout_declare_ubo(B, "TextureMetadata", "index")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Implements aten.index.Tensor for the case where self is 1D and there is
+// exactly one index tensor. Each output element is:
+//   output[...] = self[index[...]]
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  if (out_of_bounds(out_pos, outp)) {
+    return;
+  }
+
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos);
+  ivec4 idx_texel = texelFetch(t_index, out_pos, 0);
+
+  VEC4_T out_texel = VEC4_T(0);
+
+  int limit = min(
+      4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]);
+  for (int comp = 0; comp < limit; comp++) {
+    int idx = idx_texel[comp];
+
+    // Construct a tensor index for the 1D self tensor.
+    // In WHCN ordering, a 1D tensor has its elements along dim 0 (width).
+    TensorIndex4D self_tidx;
+    self_tidx.data = ivec4(idx, 0, 0, 0);
+
+    TextureElementIndex self_elem =
+        tensor4d_idx_to_texture_element_idx_simple(inp, self_tidx);
+
+    VEC4_T self_texel = texelFetch(t_self, self_elem.pos, 0);
+    out_texel[comp] = self_texel[self_elem.comp];
+
+    out_tidx.data[outp.packed_dim]++;
+  }
+
+  imageStore(t_out, out_pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.yaml
new file mode 100644
index 00000000000..3e274fa177a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/index_tensor_texture.yaml
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+index_tensor_texture:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: index_tensor_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.glsl
new file mode 100644
index 00000000000..76e6a6c6238
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.glsl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+// Output staging buffer: raw int8 data viewed as int32 for device compatibility
+${layout_declare_tensor(B, "w", "nchw_out", "int", "buffer")}
+// Input buffer: packed int8x4 values (each int32 contains 4 packed int8)
+${layout_declare_tensor(B, "r", "t_inp", "int", "buffer")}
+
+// Metadata for input tensor
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
+
+void main() {
+  // One thread per output int32 in the NCHW staging buffer.
+  // Each output int32 holds 4 consecutive NCHW bytes.
+  const uint out_int32_idx = gl_GlobalInvocationID.x;
+
+  const uint W = inp.sizes[0][0];
+  const uint H = inp.sizes[0][1];
+  const uint C = inp.sizes[0][2];
+  const uint N = inp.sizes[0][3];
+  const uint total_numel = W * H * C * N;
+  const uint num_out_int32s = (total_numel + 3u) / 4u;
+
+  if (out_int32_idx >= num_out_int32s) {
+    return;
+  }
+
+  int output_int32 = 0;
+  [[unroll]] for (int j = 0; j < 4; ++j) {
+    const uint nchw_idx = out_int32_idx * 4u + uint(j);
+    if (nchw_idx >= total_numel) {
+      break;
+    }
+
+    // Convert NCHW linear index to tensor4D (WHCN) coordinates.
+    const uint w = nchw_idx % W;
+    const uint h = (nchw_idx / W) % H;
+    const uint c = (nchw_idx / (W * H)) % C;
+    const uint n = nchw_idx / (W * H * C);
+
+    TensorIndex4D tidx;
+    tidx.data = ivec4(int(w), int(h), int(c), int(n));
+
+    // tensor4d_idx_to_buf_idx returns a linear element index where
+    // element_index / 4 is the int32 slot and element_index % 4 is the byte
+    // position within that int32. This matches the packing order used by
+    // nchw_to_int8x4_buffer when writing to the int8x4 buffer.
+    const int elem_buf_idx = tensor4d_idx_to_buf_idx(inp, tidx, inp_layout);
+    const int int8_val =
+        (t_inp[elem_buf_idx / 4] >> ((elem_buf_idx % 4) * 8)) & 0xFF;
+
+    output_int32 |= (int8_val << (j * 8));
+  }
+
+  nchw_out[out_int32_idx] = output_int32;
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.yaml
new file mode 100644
index 00000000000..1ee9728779a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/int8x4_buffer_to_nchw.yaml
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+int8x4_buffer_to_nchw:
+  parameter_names_with_default_values:
+    DTYPE: int
+  shader_variants:
+    - NAME: int8x4_buffer_to_nchw
diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl
index b0c07e73637..3a63099e7df 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl
@@ -115,7 +115,7 @@ void main() {
 #endif
 
 #ifdef OUTPUT_IS_INDICES
-    t_out[out_bufi] = int(0); // int(local_accum.idx);
+    t_out[out_bufi] = int(local_accum.idx);
 #else
     t_out[out_bufi] = convert_to_T(local_accum.val);
 #endif
diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl
index 281b317e0b5..cab7cf54046 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl
@@ -1,5 +1,3 @@
-// where.glsl
-
 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
@@ -8,7 +6,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-
 #version 450 core
 
 ${define_required_extensions(STORAGE, DTYPE)}
@@ -24,44 +21,50 @@ ${define_active_storage_type(STORAGE)}
 
 layout(std430) buffer;
 
+#include "indexing.glslh"
+
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_condition", "bool", STORAGE)}
 ${layout_declare_tensor(B, "r", "t_self", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)}
 
-
-#include "indexing_utils.h"
-
 $if STORAGE == "buffer":
-  ${layout_declare_ubo(B, "int", "out_numl")}
-  ${layout_declare_ubo(B, "ivec4", "out_strides")}
-  ${layout_declare_ubo(B, "ivec4", "cond_strides")}
-  ${layout_declare_ubo(B, "ivec4", "self_strides")}
-  ${layout_declare_ubo(B, "ivec4", "other_strides")}
+  ${layout_declare_ubo(B, "BufferMetadata", "outp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "condp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "selfp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "otherp")}
 $else:
-  ${layout_declare_ubo(B, "ivec3", "out_limits")}
-
-${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")}
-
-const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
+  ${layout_declare_ubo(B, "TextureMetadata", "outp")}
+  ${layout_declare_ubo(B, "TextureMetadata", "condp")}
+  ${layout_declare_ubo(B, "TextureMetadata", "selfp")}
+  ${layout_declare_ubo(B, "TextureMetadata", "otherp")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 #ifdef USING_BUFFER
 
 void main() {
-  int out_bufi = int(gl_GlobalInvocationID.x);
-  if (out_bufi >= out_numl) {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_of_bounds(out_bufi, outp)) {
     return;
   }
 
-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
+  TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi);
+
+  TensorIndex cond_tidx = out_tidx;
+  clamp_tensor_idx(condp, cond_tidx);
 
-  const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides);
-  const int self_bufi = tidx_to_bufi(out_tidx, self_strides);
-  const int other_bufi = tidx_to_bufi(out_tidx, other_strides);
+  TensorIndex self_tidx = out_tidx;
+  clamp_tensor_idx(selfp, self_tidx);
 
-  COND_T cond = t_condition[cond_bufi] ;
+  TensorIndex other_tidx = out_tidx;
+  clamp_tensor_idx(otherp, other_tidx);
+
+  const uint cond_bufi = tensor_idx_to_linear_idx(condp, cond_tidx);
+  const uint self_bufi = tensor_idx_to_linear_idx(selfp, self_tidx);
+  const uint other_bufi = tensor_idx_to_linear_idx(otherp, other_tidx);
+
+  COND_T cond = t_condition[cond_bufi];
   T v_self = t_self[self_bufi];
   T v_other = t_other[other_bufi];
 
@@ -72,29 +75,49 @@ void main() {
   }
 }
 
-#else // !USING_BUFFER
+#else // USING_TEXTURE
 
 void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
 
-
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (out_of_bounds(out_pos, outp)) {
     return;
   }
 
-  vec4 cond = load_texel(t_condition, pos);
-  VEC4_T selftex = load_texel(t_self, pos);
-  VEC4_T othertex = load_texel(t_other, pos);
-
-  VEC4_T outtex;
-
-  for (int idx = 0; idx < 4; ++idx) {
-    if (cond[idx] == 1) {
-      outtex[idx] = selftex[idx];
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos);
+
+  VEC4_T outtex = VEC4_T(0);
+
+  int limit = min(
+      4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]);
+  for (int comp = 0; comp < limit; comp++) {
+    TensorIndex4D cond_tidx;
+    cond_tidx.data = min(out_tidx.data, condp.sizes - 1);
+    TextureElementIndex cond_elem =
+        tensor4d_idx_to_texture_element_idx_simple(condp, cond_tidx);
+    uint cond_val = texelFetch(t_condition, cond_elem.pos, 0)[cond_elem.comp];
+
+    TensorIndex4D self_tidx;
+    self_tidx.data = min(out_tidx.data, selfp.sizes - 1);
+    TextureElementIndex self_elem =
+        tensor4d_idx_to_texture_element_idx_simple(selfp, self_tidx);
+    VEC4_T self_texel = texelFetch(t_self, self_elem.pos, 0);
+
+    TensorIndex4D other_tidx;
+    other_tidx.data = min(out_tidx.data, otherp.sizes - 1);
+    TextureElementIndex other_elem =
+        tensor4d_idx_to_texture_element_idx_simple(otherp, other_tidx);
+    VEC4_T other_texel = texelFetch(t_other, other_elem.pos, 0);
+
+    if (cond_val > 0) {
+      outtex[comp] = self_texel[self_elem.comp];
     } else {
-      outtex[idx] = othertex[idx];
+      outtex[comp] = other_texel[other_elem.comp];
     }
+
+    out_tidx.data[outp.packed_dim]++;
   }
-  write_texel(t_out, pos, outtex);
+
+  imageStore(t_out, out_pos, outtex);
 }
- #endif // !USING_BUFFER
+#endif
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
index 025b483eab7..92c2fa218ec 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -198,6 +198,7 @@ DEFINE_BINARY_OP_FN(lt);
 DEFINE_BINARY_OP_FN(le);
 DEFINE_BINARY_OP_FN(gt);
 DEFINE_BINARY_OP_FN(ge);
+DEFINE_BINARY_OP_FN(bitwise_and);
 
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.add.Tensor, add);
@@ -212,6 +213,7 @@ REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.le.Tensor, le);
   VK_REGISTER_OP(aten.gt.Tensor, gt);
   VK_REGISTER_OP(aten.ge.Tensor, ge);
+  VK_REGISTER_OP(aten.bitwise_and.Tensor, bitwise_and);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
index 0286889de5c..d052882afde 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
@@ -232,11 +232,12 @@ utils::uvec3 pick_linear_global_wg_with_block_config(
       BlockConfig::outer_dim_from_packed_int(packed_block_config);
 
   const std::vector<int64_t>& sizes = graph->sizes_of(output);
-  const size_t ndim = sizes.size();
 
-  // Compute number of blocks along inner and outer dimensions
-  const int64_t inner_size = sizes[ndim - 1 - inner_dim];
-  const int64_t outer_size = sizes[ndim - 1 - outer_dim];
+  // Use val_at with negative indices to safely access WHCN dimensions.
+  // val_at returns 1 for out-of-bounds indices, correctly handling tensors
+  // with fewer than 4 dimensions. WHCN dim d maps to val_at(-(d+1), sizes).
+  const int64_t inner_size = utils::val_at(-1 - inner_dim, sizes);
+  const int64_t outer_size = utils::val_at(-1 - outer_dim, sizes);
 
   const uint32_t num_inner_blocks =
       utils::safe_downcast<uint32_t>(utils::div_up(inner_size, int64_t(4)));
@@ -245,10 +246,10 @@ utils::uvec3 pick_linear_global_wg_with_block_config(
 
   // Compute number of planes (product of dimensions not in the block)
   uint32_t num_planes = 1;
-  for (size_t i = 0; i < ndim; ++i) {
-    const int32_t whcn_dim = ndim - 1 - i;
-    if (whcn_dim != inner_dim && whcn_dim != outer_dim) {
-      num_planes *= utils::safe_downcast<uint32_t>(sizes[i]);
+  for (int32_t d = 0; d < 4; ++d) {
+    if (d != inner_dim && d != outer_dim) {
+      num_planes *=
+          utils::safe_downcast<uint32_t>(utils::val_at(-1 - d, sizes));
     }
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexTensor.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexTensor.cpp
new file mode 100644
index 00000000000..b7da1b1ac40
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/IndexTensor.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void resize_index_tensor_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef index = args.at(1).refs.at(1);
+
+  std::vector<int64_t> out_sizes = graph->sizes_of(index);
+  graph->virtual_resize(out, out_sizes);
+}
+
+void add_index_tensor_node(
+    ComputeGraph& graph,
+    const ValueRef self,
+    const ValueRef index,
+    const ValueRef out) {
+  std::string kernel_name = "index_tensor";
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  vkapi::ParamsBindList param_ubos = {
+      graph.meta_ubo(out), graph.meta_ubo(self), graph.meta_ubo(index)};
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {{self, index}, vkapi::kRead}},
+      // Shader params buffers
+      param_ubos,
+      // Push Constants
+      {},
+      // Specialization Constants
+      {},
+      // Resize Args
+      {},
+      // Resizing Logic
+      resize_index_tensor_node));
+}
+
+void index_tensor(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  ValueRef self = args[0];
+  ValueRef indices_list_ref = args[1];
+  ValueRef out = args[2];
+
+  ValueListPtr indices_list = graph.get_value_list(indices_list_ref);
+  VK_CHECK_COND(
+      indices_list->size() == 1,
+      "index.Tensor: only one index tensor is supported");
+
+  ValueRef index = indices_list->at(0);
+
+  add_index_tensor_node(graph, self, index, out);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.index.Tensor, index_tensor);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.cpp
new file mode 100644
index 00000000000..eb1d9965f30
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void add_prepack_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const ValueRef tensor) {
+  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
+  // TODO(ssjia): Update shaders to handle high-dim tensors
+  VK_CHECK_COND(graph.dim_of(tensor) <= 4);
+
+  std::string kernel_name = "nchw_to_int8x4_buffer";
+
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(tensor));
+
+  // One thread per texel (each texel = one int32 = 4 packed int8).
+  // Use padded_numel to account for dimension padding in packed int8 layouts
+  // (e.g., kPackedInt8_4C with C=3 pads to C=4).
+  uint32_t num_texels =
+      utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
+  utils::uvec3 global_wg_size = {num_texels, 1, 1};
+  utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_wg_size,
+      local_wg_size,
+      // Input and Output
+      tensor_data,
+      tensor,
+      // Parameter Buffers
+      param_buffers,
+      // Specialization Constants
+      {graph.hashed_layout_of(tensor)}));
+}
+
+static utils::uvec3 staging_to_int8x4_buffer_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef out_tensor = args.at(0).refs.at(0);
+  const uint32_t num_texels =
+      utils::safe_downcast<uint32_t>(graph->padded_numel_of(out_tensor) / 4);
+  return {num_texels, 1, 1};
+}
+
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef in_staging,
+    const ValueRef tensor) {
+  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
+  // TODO(ssjia): Update shaders to handle high-dim tensors
+  VK_CHECK_COND(graph.dim_of(tensor) <= 4);
+
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(tensor));
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR("nchw_to_int8x4_buffer"),
+      staging_to_int8x4_buffer_global_wg_size,
+      default_pick_local_wg_size,
+      // Input and Output
+      {{tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}},
+      // Parameter Buffers
+      param_buffers,
+      // Push Constants
+      {},
+      // Specialization Constants
+      {graph.hashed_layout_of(tensor)},
+      // Resize Args
+      {},
+      // Resizing Logic
+      nullptr));
+}
+
+static utils::uvec3 int8x4_buffer_to_staging_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef in_tensor = args.at(1).refs.at(0);
+  // One thread per output int32 in the NCHW staging buffer.
+  const int32_t numel = graph->numel_of(in_tensor);
+  const uint32_t num_out_int32s =
+      utils::safe_downcast<uint32_t>((numel + 3) / 4);
+  return {num_out_int32s, 1, 1};
+}
+
+void add_int8x4_buffer_to_staging_node(
+    ComputeGraph& graph,
+    const ValueRef tensor,
+    const ValueRef staging_data) {
+  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
+  // TODO(ssjia): Update shaders to handle high-dim tensors
+  VK_CHECK_COND(graph.dim_of(tensor) <= 4);
+
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(tensor));
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR("int8x4_buffer_to_nchw"),
+      int8x4_buffer_to_staging_global_wg_size,
+      default_pick_local_wg_size,
+      // Input and Output
+      {{staging_data, vkapi::kWrite}, {tensor, vkapi::kRead}},
+      // Parameter Buffers
+      param_buffers,
+      // Push Constants
+      {},
+      // Specialization Constants
+      {graph.hashed_layout_of(tensor)},
+      // Resize Args
+      {},
+      // Resizing Logic
+      nullptr));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h
similarity index 65%
rename from backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
rename to backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h
index 40386551e36..659ed696cd1 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h
@@ -12,9 +12,19 @@
 
 namespace vkcompute {
 
-void add_staging_to_int8x4_buffer_node(
+void add_prepack_int8x4_buffer_node(
     ComputeGraph& graph,
     const ValueRef tensor_data,
     const ValueRef tensor);
 
+void add_staging_to_int8x4_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef in_staging,
+    const ValueRef tensor);
+
+void add_int8x4_buffer_to_staging_node(
+    ComputeGraph& graph,
+    const ValueRef tensor,
+    const ValueRef staging_data);
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
deleted file mode 100644
index 8dc3f8156f8..00000000000
--- a/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
-
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
-
-namespace vkcompute {
-
-void add_staging_to_int8x4_buffer_node(
-    ComputeGraph& graph,
-    const ValueRef tensor_data,
-    const ValueRef tensor) {
-  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);
-
-  std::string kernel_name = "nchw_to_int8x4_buffer";
-
-  vkapi::ParamsBindList param_buffers;
-  param_buffers.append(graph.buffer_meta_ubo(tensor));
-
-  // One thread per texel (each texel = one int32 = 4 packed int8).
-  // Use padded_numel to account for dimension padding in packed int8 layouts
-  // (e.g., kPackedInt8_4C with C=3 pads to C=4).
-  uint32_t num_texels =
-      utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
-  utils::uvec3 global_wg_size = {num_texels, 1, 1};
-  utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
-
-  graph.prepack_nodes().emplace_back(new PrepackNode(
-      graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_wg_size,
-      local_wg_size,
-      // Input and Output
-      tensor_data,
-      tensor,
-      // Parameter Buffers
-      param_buffers,
-      // Specialization Constants
-      {graph.hashed_layout_of(tensor)}));
-}
-
-} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index adcad9f9817..c418a3681c8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -12,7 +12,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 
@@ -27,6 +27,10 @@ void add_staging_to_tensor_node(
     const ValueRef out_tensor) {
   VK_CHECK_COND(graph.val_is_staging(in_staging));
 
+  if (graph.dtype_of(out_tensor) == vkapi::kInt8x4) {
+    return add_staging_to_int8x4_buffer_node(graph, in_staging, out_tensor);
+  }
+
   vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
       graph,
       out_tensor,
@@ -104,6 +108,10 @@ void add_tensor_to_staging_node(
     const ValueRef out_staging) {
   VK_CHECK_COND(graph.val_is_staging(out_staging));
 
+  if (graph.dtype_of(in_tensor) == vkapi::kInt8x4) {
+    return add_int8x4_buffer_to_staging_node(graph, in_tensor, out_staging);
+  }
+
   vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
       graph,
       in_tensor,
@@ -329,7 +337,7 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
 
 void prepack_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   if (graph.dtype_of(args[1]) == vkapi::kInt8x4) {
-    return add_staging_to_int8x4_buffer_node(graph, args[0], args[1]);
+    return add_prepack_int8x4_buffer_node(graph, args[0], args[1]);
   }
   return add_prepack_standard_node(graph, args[0], args[1]);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp
index c1c482d9967..adb7fb1beca 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp
@@ -21,43 +21,13 @@ void resize_where_node(
     const std::vector<ValueRef>& extra_args) {
   (void)extra_args;
   const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef in = args.at(1).refs.at(0);
+  const ValueRef self = args.at(1).refs.at(1);
 
-  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
-  graph->virtual_resize(out, in_sizes);
+  const std::vector<int64_t> self_sizes = graph->sizes_of(self);
+  graph->virtual_resize(out, self_sizes);
 }
 
-void add_where_texture_node(
-    ComputeGraph& graph,
-    const ValueRef cond,
-    const ValueRef self,
-    const ValueRef other,
-    const ValueRef out) {
-  std::string kernel_name = "where";
-
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
-  add_dtype_suffix(kernel_name, graph.dtype_of(out));
-
-  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
-      graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      default_pick_global_wg_size,
-      default_pick_local_wg_size,
-      // Inputs and Outputs
-      {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}},
-      // Parameter buffers
-      {graph.logical_limits_ubo(self)},
-      // Push Constants
-      {},
-      // Specialization Constants
-      {graph.hashed_layout_of(out)},
-      // Resize Arguments
-      {},
-      // Resizing Logic
-      resize_where_node));
-}
-
-void add_where_buffer_node(
+void add_where_node(
     ComputeGraph& graph,
     const ValueRef cond,
     const ValueRef self,
@@ -69,11 +39,10 @@ void add_where_buffer_node(
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   vkapi::ParamsBindList ubos = {
-      graph.numel_ubo(out),
-      graph.strides_ubo(out),
-      graph.strides_ubo(cond),
-      graph.strides_ubo(self),
-      graph.strides_ubo(other)};
+      graph.meta_ubo(out),
+      graph.meta_ubo(cond),
+      graph.meta_ubo(self),
+      graph.meta_ubo(other)};
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
@@ -87,7 +56,7 @@ void add_where_buffer_node(
       // Push Constants
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out)},
+      {},
       // Resize Arguments
       {},
       // Resizing Logic
@@ -100,11 +69,7 @@ void where(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   const ValueRef self = args[args_i++];
   const ValueRef other = args[args_i++];
   const ValueRef out = args[args_i++];
-  if (graph.is_buffer_storage(out)) {
-    add_where_buffer_node(graph, cond, self, other, out);
-  } else {
-    add_where_texture_node(graph, cond, self, other, out);
-  }
+  add_where_node(graph, cond, self, other, out);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
index f5214221359..e3c3e6e2642 100644
--- a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -8,9 +8,9 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
 
 namespace vkcompute {
 
@@ -62,7 +62,7 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   if (input_b_is_int8) {
     // Input B is a pre-quantized int8 TensorRef; prepack directly into packed
     // int8x4 format
-    add_staging_to_int8x4_buffer_node(graph, input_b, packed_int8_input_b);
+    add_prepack_int8x4_buffer_node(graph, input_b, packed_int8_input_b);
   } else {
     // Input B is a float tensor; quantize at runtime
     add_q8ta_quantize_node(
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
index 86725ca8fb8..f60b113828b 100644
--- a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -133,10 +133,10 @@ TestCase create_test_case_from_config(
 std::vector<TestCase> generate_q8ta_add_easy_cases() {
   std::vector<TestCase> test_cases;
 
-  // Single simple configuration for debugging
-  Q8taBinaryConfig config = {
-      {1, 16, 16, 16}, // shape: [N, C, H, W]
-      "ACCU", // test_case_name
+  std::vector<std::vector<int64_t>> shapes = {
+      {1, 16, 16, 16}, // 4D: [N, C, H, W]
+      {1, 144}, // 2D: exercises block config with ndim < 4
+      {1, 90}, // 2D: matches skin_seg model's keypoint/bbox tensor sizes
   };
 
   // Quantized memory layouts to test
@@ -148,20 +148,23 @@ std::vector<TestCase> generate_q8ta_add_easy_cases() {
       utils::kPackedInt8_4C1W,
   };
 
-  for (const auto& quant_layout : quant_layouts) {
-    test_cases.push_back(create_test_case_from_config(
-        config,
-        /*storage_type=*/utils::kBuffer,
-        /*input_dtype=*/vkapi::kFloat,
-        /*fp_memory_layout=*/utils::kWidthPacked,
-        quant_layout));
-    test_cases.push_back(create_test_case_from_config(
-        config,
-        /*fp_storage_type=*/utils::kBuffer,
-        /*input_dtype=*/vkapi::kFloat,
-        /*fp_layout=*/utils::kWidthPacked,
-        quant_layout,
-        /*const_b=*/true));
+  for (const auto& shape : shapes) {
+    Q8taBinaryConfig config = {shape, "ACCU"};
+    for (const auto& quant_layout : quant_layouts) {
+      test_cases.push_back(create_test_case_from_config(
+          config,
+          /*storage_type=*/utils::kBuffer,
+          /*input_dtype=*/vkapi::kFloat,
+          /*fp_memory_layout=*/utils::kWidthPacked,
+          quant_layout));
+      test_cases.push_back(create_test_case_from_config(
+          config,
+          /*fp_storage_type=*/utils::kBuffer,
+          /*input_dtype=*/vkapi::kFloat,
+          /*fp_layout=*/utils::kWidthPacked,
+          quant_layout,
+          /*const_b=*/true));
+    }
   }
 
   return test_cases;
@@ -173,6 +176,20 @@ std::vector<TestCase> generate_q8ta_add_test_cases() {
 
   // Shapes to test
   std::vector<std::vector<int64_t>> shapes = {
+      // 1D tensors
+      {144},
+      {90},
+
+      // 3D tensors
+      {1, 16, 32},
+      {1, 3, 64},
+
+      // 2D tensors (exercises block config with ndim < 4)
+      {1, 144},
+      {1, 90},
+      {1, 4},
+      {3, 32},
+
       // Small test cases for correctness
       {1, 3, 16, 16},
       {1, 8, 32, 32},
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp
index e0efd6ea85d..a3ff8c42f86 100644
--- a/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp
+++ b/backends/vulkan/test/custom_ops/test_q8ta_qdq.cpp
@@ -104,10 +104,10 @@ TestCase create_test_case_from_config(
 std::vector<TestCase> generate_q_dq_8bit_easy_cases() {
   std::vector<TestCase> test_cases;
 
-  // Single simple configuration for debugging
-  QDQ8BitConfig config = {
-      {1, 16, 16, 16}, // shape: [N, C, H, W]
-      "ACCU", // test_case_name
+  std::vector<std::vector<int64_t>> shapes = {
+      {1, 16, 16, 16}, // 4D: [N, C, H, W]
+      {1, 144}, // 2D: exercises block config with ndim < 4
+      {1, 90}, // 2D: matches skin_seg model's keypoint/bbox tensor sizes
   };
 
   // FP memory layouts to test
@@ -129,21 +129,24 @@ std::vector<TestCase> generate_q_dq_8bit_easy_cases() {
   std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
 
   // Generate test cases for each combination
-  for (const auto& fp_layout : fp_layouts) {
-    for (const auto& quant_layout : quant_layouts) {
-      for (const auto& storage_type : storage_types) {
-        for (const auto& input_dtype : float_types) {
-          test_cases.push_back(create_test_case_from_config(
-              config, storage_type, input_dtype, fp_layout, quant_layout));
-          // For 4W4C layout, also test with legacy implementation
-          if (quant_layout == utils::kPackedInt8_4W4C) {
+  for (const auto& shape : shapes) {
+    QDQ8BitConfig config = {shape, "ACCU"};
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& quant_layout : quant_layouts) {
+        for (const auto& storage_type : storage_types) {
+          for (const auto& input_dtype : float_types) {
             test_cases.push_back(create_test_case_from_config(
-                config,
-                storage_type,
-                input_dtype,
-                fp_layout,
-                quant_layout,
-                /*impl_selector=*/"legacy_4w4c"));
+                config, storage_type, input_dtype, fp_layout, quant_layout));
+            // For 4W4C layout, also test with legacy implementation
+            if (quant_layout == utils::kPackedInt8_4W4C) {
+              test_cases.push_back(create_test_case_from_config(
+                  config,
+                  storage_type,
+                  input_dtype,
+                  fp_layout,
+                  quant_layout,
+                  /*impl_selector=*/"legacy_4w4c"));
+            }
           }
         }
       }
@@ -159,6 +162,20 @@ std::vector<TestCase> generate_q_dq_8bit_test_cases() {
 
   // Shapes to test (no layout specified - will be combined with all layouts)
   std::vector<std::vector<int64_t>> shapes = {
+      // 1D tensors
+      {144},
+      {90},
+
+      // 2D tensors (exercises block config with ndim < 4)
+      {1, 144},
+      {1, 90},
+      {1, 4},
+      {3, 32},
+
+      // 3D tensors
+      {1, 16, 32},
+      {1, 3, 64},
+
       // Small test cases for correctness
       {1, 3, 16, 16},
       {1, 8, 32, 32},
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 534462ed179..fe2e4169f05 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -2001,6 +2001,56 @@ def get_where_inputs():
     return test_suite
 
 
+@register_test_suite("aten.bitwise_and.Tensor")
+def get_bitwise_and_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((M1, M2), (M1, M2)),
+            ((S, S1, S2), (S, S1, S2)),
+            ((XS, S, S1, S2), (XS, S, S1, S2)),
+            ((1, M1), (1, M1)),
+        ]
+    )
+    test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kChannelsPacked",
+    ]
+    test_suite.storage_types = [
+        "utils::kBuffer",
+        "utils::kTexture3D",
+    ]
+    test_suite.dtypes = ["at::kBool"]
+    test_suite.data_gen = "make_seq_tensor"
+    return test_suite
+
+
+@register_test_suite("aten.index.Tensor")
+def get_index_tensor_inputs():
+    Test = namedtuple("IndexTensorTest", ["self", "indices"])
+
+    test_cases = [
+        # 1D index tensor
+        Test(self=(M1,), indices=[(S,)]),
+        Test(self=(M1,), indices=[(M2,)]),
+        # 2D index tensor
+        Test(self=(L,), indices=[(S, S1)]),
+        Test(self=(L,), indices=[(M1, M2)]),
+        # 3D index tensor
+        Test(self=(M1,), indices=[(XS, S, S1)]),
+    ]
+
+    test_suite = VkTestSuite([tuple(tc) for tc in test_cases])
+    test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kChannelsPacked",
+    ]
+    test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
+    test_suite.dtypes = ["at::kFloat"]
+    test_suite.arg_dtype["indices"] = "at::kInt"
+    test_suite.arg_data_gen_fn["indices"] = "make_casted_randint_tensor"
+    return test_suite
+
+
 @register_test_suite("aten.pow.Tensor_Scalar")
 def get_pow_tensor_scalar_inputs():
     test_suite = VkTestSuite(
diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py
index 6ad2f568e91..a78263987a1 100644
--- a/backends/vulkan/test/op_tests/utils/aten_types.py
+++ b/backends/vulkan/test/op_tests/utils/aten_types.py
@@ -12,6 +12,7 @@
 AT_SCALAR = "at::Scalar"
 AT_TENSOR = "at::Tensor"
 AT_TENSOR_LIST = "at::TensorList"
+OPT_TENSOR_LIST = "c10::List<::std::optional<at::Tensor>>"
 BOOL = "bool"
 DOUBLE = "double"
 INT = "int64_t"
diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py
index 6a7dc2e5d0a..a09b4d36b18 100644
--- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py
+++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py
@@ -26,6 +26,7 @@
     OPT_LAYOUT,
     OPT_MEMORY_FORMAT,
     OPT_SCALAR_TYPE,
+    OPT_TENSOR_LIST,
     STRING,
     TENSOR_VECTOR,
     THREE_TENSOR_TUPLE,
@@ -86,7 +87,7 @@ def vk_out(self):
 
 ValueRefList = Union[ValueRef, List[ValueRef]]
 
-InableCppType = frozenset([AT_TENSOR, AT_TENSOR_LIST])
+InableCppType = frozenset([AT_TENSOR, AT_TENSOR_LIST, OPT_TENSOR_LIST])
 
 
 class ComputeGraphGen:
@@ -313,7 +314,7 @@ def create_value_decl_for(self, ref: ValueRefList) -> str:  # noqa: C901
             return ret_str
 
         cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef"
-        if ref.src_cpp_type == AT_TENSOR_LIST:
+        if ref.src_cpp_type in (AT_TENSOR_LIST, OPT_TENSOR_LIST):
             ret_str = f"std::vector<IOValueRef> {ref.name}_io_value_refs;\n"
             ret_str += f"std::vector<ValueRef> {ref.name}_value_refs;\n"
             return ret_str
@@ -409,6 +410,25 @@ def create_value_for(  # noqa: C901
             ret_str += "}\n"
             ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n"
             return ret_str
+        elif ref.src_cpp_type == OPT_TENSOR_LIST:
+            assert ref.is_in, "OPT_TENSOR_LIST must be an input"
+            ret_str = ""
+            if include_declarations:
+                ret_str += f"std::vector<IOValueRef> {ref.name}_io_value_refs;\n"
+                ret_str += f"std::vector<ValueRef> {ref.name}_value_refs;\n"
+            ret_str += f"for (int i=0; i < (int){ref.src_cpp_name}.size(); i++) {{\n"
+            ret_str += (
+                f"  IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n"
+            )
+            ret_str += f"      {ref.src_cpp_name}[i]->sizes().vec(),\n"
+            ret_str += (
+                f"      from_at_scalartype({ref.src_cpp_name}[i]->scalar_type())); \n"
+            )
+            ret_str += f"  {ref.name}_value_refs.emplace_back(io_value_ref.value);\n"
+            ret_str += f"  {ref.name}_io_value_refs.emplace_back(io_value_ref);\n"
+            ret_str += "}\n"
+            ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n"
+            return ret_str
         elif ref.src_cpp_type == TENSOR_VECTOR:
             ret_str = ""
             if include_declarations:
@@ -491,7 +511,7 @@ def create_op_call(self) -> str:
 
         for aten_arg in self.args:
             ref = self.refs[aten_arg.name]
-            if ref.src_cpp_type == AT_TENSOR_LIST:
+            if ref.src_cpp_type in (AT_TENSOR_LIST, OPT_TENSOR_LIST):
                 # Special case. Underlying tensors are input tensors, but the
                 # container itself is just a normal value.
                 op_create_code += f"{ref.name}, "
@@ -553,10 +573,20 @@ def virtual_resize(self, ref: ValueRefList) -> str:
             ret_str += f"{ref.src_cpp_name}.sizes().vec());\n"
         elif ref.src_cpp_type == AT_TENSOR_LIST:
             ret_str = ""
-            ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n"
+            ret_str += (
+                f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n"
+            )
             ret_str += f"  {self.graph}{self.dot}virtual_resize({ref.name}_io_value_refs[i].value, "
             ret_str += f"{ref.src_cpp_name}[i].sizes().vec());\n"
             ret_str += "}\n"
+        elif ref.src_cpp_type == OPT_TENSOR_LIST:
+            ret_str = ""
+            ret_str += (
+                f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n"
+            )
+            ret_str += f"  {self.graph}{self.dot}virtual_resize({ref.name}_io_value_refs[i].value, "
+            ret_str += f"{ref.src_cpp_name}[i]->sizes().vec());\n"
+            ret_str += "}\n"
         else:
             raise AssertionError(f"{ref.src_cpp_type} not expected")
 
@@ -577,13 +607,26 @@ def copy_into_staging(self, ref: ValueRefList) -> str:
             ret_str += f"from_at_scalartype({ref.src_cpp_name}.scalar_type()));\n"
         elif ref.src_cpp_type == AT_TENSOR_LIST:
             ret_str = ""
-            ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n"
+            ret_str += (
+                f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n"
+            )
             ret_str += f"  {self.graph}{self.dot}maybe_cast_and_copy_into_staging("
             ret_str += f"{ref.name}_io_value_refs[i].staging, "
             ret_str += f"{ref.src_cpp_name}[i].const_data_ptr(), "
             ret_str += f"{ref.src_cpp_name}[i].numel(), "
             ret_str += f"from_at_scalartype({ref.src_cpp_name}[i].scalar_type()));\n"
             ret_str += "}\n"
+        elif ref.src_cpp_type == OPT_TENSOR_LIST:
+            ret_str = ""
+            ret_str += (
+                f"for (int i=0; i < (int){ref.name}_io_value_refs.size(); i++) {{\n"
+            )
+            ret_str += f"  {self.graph}{self.dot}maybe_cast_and_copy_into_staging("
+            ret_str += f"{ref.name}_io_value_refs[i].staging, "
+            ret_str += f"{ref.src_cpp_name}[i]->const_data_ptr(), "
+            ret_str += f"{ref.src_cpp_name}[i]->numel(), "
+            ret_str += f"from_at_scalartype({ref.src_cpp_name}[i]->scalar_type()));\n"
+            ret_str += "}\n"
         else:
             raise AssertionError(f"{ref.src_cpp_type} not expected")
         return ret_str
diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py
index 15627726173..efd073a0cfb 100644
--- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py
+++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py
@@ -25,6 +25,7 @@
     OPT_LAYOUT,
     OPT_MEMORY_FORMAT,
     OPT_SCALAR_TYPE,
+    OPT_TENSOR_LIST,
     STRING,
 )
 from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite
@@ -166,6 +167,12 @@ def create_input_data(self, arg: Argument, data: Any) -> str:  # noqa: C901
             ret_str += f"{cpp_type} {arg.name} = tensor_vec;\n"
             return ret_str + "\n"
 
+        if cpp_type == OPT_TENSOR_LIST:
+            ret_str = f"{OPT_TENSOR_LIST} {arg.name};\n"
+            for elem in data:
+                ret_str += f"{arg.name}.push_back({self.call_data_gen_fn(arg, elem, False)});\n"
+            return ret_str + "\n"
+
         if cpp_type == AT_INT_ARRAY_REF:
             ret_str = f"std::vector<int64_t> {arg.name} = "
         elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF and str(data) != "None":
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 261d3f72d01..746fa2c5253 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -29,6 +29,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/DispatchNode.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Int8x4Staging.h>
+
 using namespace vkcompute;
 using namespace vkcompute::api;
 
@@ -3490,3 +3492,85 @@ void test_dynamic_dispatch(int M, int N) {
 TEST(VulkanComputeGraphOpsTest, test_dynamic_dispatch_graph) {
   test_dynamic_dispatch(128, 128);
 }
+
+//
+// Int8x4 Staging Tests
+//
+
+void test_int8x4_staging_round_trip(
+    const std::vector<int64_t>& sizes,
+    const utils::GPUMemoryLayout layout) {
+  GraphConfig config;
+  ComputeGraph graph(config);
+
+  const int32_t numel = utils::multiply_integers(sizes);
+
+  // Build graph:
+  //   staging_in (kInt8x4)
+  //     -> [execute: nchw_to_int8x4_buffer] -> tensor (kInt8x4)
+  //     -> [execute: int8x4_buffer_to_nchw] -> staging_out
+  ValueRef tensor =
+      graph.add_tensor(sizes, vkapi::kInt8x4, utils::kBuffer, layout);
+
+  ValueRef staging_in = graph.set_input_tensor(tensor);
+  ValueRef staging_out = graph.set_output_tensor(tensor);
+
+  // staging_buffer_numel_of returns padded_numel / 4 (number of int32
+  // elements). Multiply by 4 to get the byte count, which sizes the
+  // zero-initialized input vector below.
+  const size_t staging_numel = graph.staging_buffer_numel_of(tensor);
+  // Create NCHW int8 input data zero-padded to the full staging buffer size.
+  std::vector<int8_t> data_in(staging_numel * 4, 0);
+  for (int32_t i = 0; i < numel; ++i) {
+    data_in[i] = static_cast<int8_t>(static_cast<uint8_t>(i * 37 + 13));
+  }
+
+  graph.prepare();
+  // prepack() allocates Vulkan memory for all tensors even when there are no
+  // prepack nodes; it must be called before execute().
+  graph.prepack();
+
+  // Copy NCHW int8 data into the input staging buffer. The staging buffer has
+  // kInt8x4 dtype (staging_numel int32 elements), so reinterpret the int8 data
+  // as int32 for the copy call.
+  graph.maybe_cast_and_copy_into_staging(
+      staging_in,
+      reinterpret_cast<const int32_t*>(data_in.data()),
+      staging_numel,
+      vkapi::kInt8x4);
+
+  graph.execute();
+
+  // Read back packed int32s from staging. The staging dtype is kInt8x4 (4
+  // bytes per element = one packed int32 holding 4 int8 values).
+  std::vector<int32_t> data_out_packed(staging_numel);
+  graph.maybe_cast_and_copy_from_staging(
+      staging_out, data_out_packed.data(), staging_numel, vkapi::kInt8x4);
+
+  // Verify that each int8 element survives the round trip unchanged
+  for (int32_t i = 0; i < numel; ++i) {
+    const uint8_t byte = static_cast<uint8_t>(
+        static_cast<uint32_t>(data_out_packed[i / 4]) >> ((i % 4) * 8));
+    const int8_t actual = static_cast<int8_t>(byte);
+    EXPECT_EQ(actual, data_in[i])
+        << "Mismatch at nchw index " << i << " for sizes [" << sizes[0]
+        << (sizes.size() > 1 ? ", " + std::to_string(sizes[1]) : "")
+        << (sizes.size() > 2 ? ", " + std::to_string(sizes[2]) : "")
+        << (sizes.size() > 3 ? ", " + std::to_string(sizes[3]) : "")
+        << "] layout " << layout;
+  }
+}
+
+TEST(VulkanComputeGraphTest, test_int8x4_staging_round_trip) {
+  const std::vector<utils::GPUMemoryLayout> layouts = {
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4C1W,
+  };
+  for (const auto& sizes : standard_sizes_to_test) {
+    for (const auto layout : layouts) {
+      test_int8x4_staging_round_trip(sizes, layout);
+    }
+  }
+}