From 9268767ea11e527ed22872d0370cf99136f822f0 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:09 -0700
Subject: [PATCH 01/13] [ET-VK][ez] Fix IndexError in Vulkan partitioner
 DtypeSetList/TensorRepSetList

Pull Request resolved: https://github.com/pytorch/executorch/pull/18048

The `__getitem__` methods of `DtypeSetList` and `TensorRepSetList` in
`utils.py` could raise an `IndexError` when the index is greater than or
equal to the length of the list. This can happen when partitioning ops
whose number of inputs or outputs exceeds the number of entries in the
dtype/tensor-rep specification list. Fix by returning an empty set in
this case, matching the intent of the existing broadcasting logic.
ghstack-source-id: 353546684
@exported-using-ghexport

Differential Revision: [D95970163](https://our.internmc.facebook.com/intern/diff/D95970163/)
---
 backends/vulkan/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
index 2a3c3910c48..c17f9332e0c 100644
--- a/backends/vulkan/utils.py
+++ b/backends/vulkan/utils.py
@@ -91,6 +91,8 @@ def __getitem__(self, idx: int) -> DtypeSet:
         # Broadcasting: single set applies to all positions
         if idx > 0 and len(self.vals) == 1:
             return self.vals[0]
+        if idx >= len(self.vals):
+            return set()
         return self.vals[idx]
 
     def is_empty(self) -> bool:
@@ -1227,8 +1229,9 @@ def __len__(self):
     def __getitem__(self, idx: int) -> TensorRepSet:
         if idx > 0 and len(self) == 1:
             return self.vals[0]
-        else:
-            return self.vals[idx]
+        if idx >= len(self.vals):
+            return set()
+        return self.vals[idx]
 
     def __setitem__(self, idx: int, val: TensorRepSet) -> None:
         if idx > 0 and len(self.vals) == 1:

From 18a0efba59a233151eac154fc2d2d7f83ee42606 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:11 -0700
Subject: [PATCH 02/13] [ET-VK] Add Vulkan backend support for Parakeet runner
 and export

Pull Request resolved: https://github.com/pytorch/executorch/pull/18049

Add Vulkan build support for the Parakeet runner: llm-debug-vulkan preset
in root CMakePresets.json, parakeet-vulkan presets in the Parakeet
CMakePresets.json, vulkan_backend linkage in CMakeLists.txt, and a
`make parakeet-vulkan` Makefile target.

Add _create_vulkan_partitioners() and wire it into lower_to_executorch()
so that `--backend vulkan` is accepted by export_parakeet_tdt.py.
ghstack-source-id: 353546680
@exported-using-ghexport

Differential Revision: [D95970157](https://our.internmc.facebook.com/intern/diff/D95970157/)
---
 CMakePresets.json                             | 38 +++++++++++++++++++
 Makefile                                      | 12 +++++-
 examples/models/parakeet/CMakeLists.txt       |  5 +++
 examples/models/parakeet/CMakePresets.json    | 34 +++++++++++++++++
 .../models/parakeet/export_parakeet_tdt.py    | 36 ++++++++++++++++--
 examples/models/parakeet/main.cpp             |  2 +-
 6 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index ca4da226ba1..4d8b70f08b2 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -217,6 +217,21 @@
         "rhs": "Darwin"
       }
     },
+    {
+      "name": "llm-debug-vulkan",
+      "displayName": "LLM debug build with Vulkan",
+      "inherits": [
+        "llm-debug"
+      ],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_VULKAN": "ON"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
+    },
     {
       "name": "llm-metal-stats",
       "displayName": "LLM Metal build with stats collection and logging",
@@ -354,6 +369,15 @@
       ],
       "jobs": 0
     },
+    {
+      "name": "llm-debug-vulkan-install",
+      "displayName": "Build and install LLM extension debug artifacts (Vulkan)",
+      "configurePreset": "llm-debug-vulkan",
+      "targets": [
+        "install"
+      ],
+      "jobs": 0
+    },
     {
       "name": "llm-metal-stats-install",
       "displayName": "Build and install LLM extension artifacts with Metal stats",
@@ -449,6 +473,20 @@
         }
       ]
     },
+    {
+      "name": "llm-debug-vulkan",
+      "displayName": "Configure, build and install ExecuTorch LLM extension with Vulkan enabled (Debug)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "llm-debug-vulkan"
+        },
+        {
+          "type": "build",
+          "name": "llm-debug-vulkan-install"
+        }
+      ]
+    },
     {
       "name": "llm-metal-stats",
       "displayName": "Configure, build and install ExecuTorch LLM extension with Metal stats and logging",
diff --git a/Makefile b/Makefile
index 6b8ea37e7b2..9dcfa26027e 100644
--- a/Makefile
+++ b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -109,6 +109,7 @@ help:
 	@echo "  parakeet-cuda-debug - Build Parakeet runner with CUDA backend (debug mode)"
 	@echo "  parakeet-cpu        - Build Parakeet runner with CPU backend"
 	@echo "  parakeet-metal      - Build Parakeet runner with Metal backend (macOS only)"
+	@echo "  parakeet-vulkan     - Build Parakeet runner with Vulkan backend"
 	@echo "  dinov2-cuda         - Build DINOv2 runner with CUDA backend"
 	@echo "  dinov2-cuda-debug   - Build DINOv2 runner with CUDA backend (debug mode)"
 	@echo "  sortformer-cuda     - Build Sortformer runner with CUDA backend"
@@ -221,6 +222,15 @@ parakeet-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/parakeet/parakeet_runner"
 
+parakeet-vulkan:
+	@echo "==> Building and installing ExecuTorch with Vulkan..."
+	cmake --workflow --preset llm-debug-vulkan
+	@echo "==> Building Parakeet runner with Vulkan..."
+	cd examples/models/parakeet && cmake --workflow --preset parakeet-vulkan
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/parakeet/parakeet_runner"
+
 dinov2-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
diff --git a/examples/models/parakeet/CMakeLists.txt b/examples/models/parakeet/CMakeLists.txt
index ec52a596af2..9354afe5f86 100644
--- a/examples/models/parakeet/CMakeLists.txt
+++ b/examples/models/parakeet/CMakeLists.txt
@@ -91,6 +91,11 @@ if(EXECUTORCH_BUILD_METAL)
   executorch_target_link_options_shared_lib(metal_backend)
 endif()
 
+if(EXECUTORCH_BUILD_VULKAN)
+  list(APPEND link_libraries vulkan_backend)
+  executorch_target_link_options_shared_lib(vulkan_backend)
+endif()
+
 add_executable(parakeet_runner main.cpp timestamp_utils.cpp tokenizer_utils.cpp)
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(parakeet_runner)
diff --git a/examples/models/parakeet/CMakePresets.json b/examples/models/parakeet/CMakePresets.json
index ccb4f4fcdd2..afcfd99491c 100644
--- a/examples/models/parakeet/CMakePresets.json
+++ b/examples/models/parakeet/CMakePresets.json
@@ -55,6 +55,19 @@
                 "type": "equals",
                 "rhs": "Darwin"
             }
+        },
+        {
+            "name": "parakeet-vulkan",
+            "displayName": "Parakeet runner (Vulkan)",
+            "inherits": ["parakeet-base"],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_VULKAN": "ON"
+            },
+            "condition": {
+                "type": "inList",
+                "string": "${hostSystemName}",
+                "list": ["Linux", "Windows"]
+            }
         }
     ],
     "buildPresets": [
@@ -85,6 +98,13 @@
             "configurePreset": "parakeet-metal",
             "configuration": "Release",
             "targets": ["parakeet_runner"]
+        },
+        {
+            "name": "parakeet-vulkan",
+            "displayName": "Build Parakeet runner (Vulkan)",
+            "configurePreset": "parakeet-vulkan",
+            "configuration": "Release",
+            "targets": ["parakeet_runner"]
         }
     ],
     "workflowPresets": [
@@ -143,6 +163,20 @@
                     "name": "parakeet-metal"
                 }
             ]
+        },
+        {
+            "name": "parakeet-vulkan",
+            "displayName": "Configure and build Parakeet runner (Vulkan)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "parakeet-vulkan"
+                },
+                {
+                    "type": "build",
+                    "name": "parakeet-vulkan"
+                }
+            ]
         }
     ]
 }
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
index 6747880cd9e..f3ed0d2b070 100644
--- a/examples/models/parakeet/export_parakeet_tdt.py
+++ b/examples/models/parakeet/export_parakeet_tdt.py
@@ -9,7 +9,6 @@
 
 import torch
 import torchaudio
-
 from executorch.examples.models.parakeet.quantize import quantize_model_
 from executorch.exir import (
     EdgeCompileConfig,
@@ -560,7 +559,25 @@ def _create_cuda_partitioners(programs, is_windows=False):
     return partitioner, updated_programs
 
 
-def lower_to_executorch(programs, metadata=None, backend="portable"):
+def _create_vulkan_partitioners(programs, vulkan_force_fp16=False):
+    """Create Vulkan partitioners for all programs except preprocessor."""
+    from executorch.backends.vulkan.partitioner.vulkan_partitioner import (
+        VulkanPartitioner,
+    )
+
+    print("\nLowering to ExecuTorch with Vulkan...")
+    partitioner = {}
+    for key in programs.keys():
+        if key == "preprocessor":
+            partitioner[key] = []
+        else:
+            partitioner[key] = [VulkanPartitioner({"force_fp16": vulkan_force_fp16})]
+    return partitioner, programs
+
+
+def lower_to_executorch(
+    programs, metadata=None, backend="portable", vulkan_force_fp16=False
+):
     if backend == "xnnpack":
         partitioner, programs = _create_xnnpack_partitioners(programs)
     elif backend == "metal":
@@ -569,6 +586,10 @@ def lower_to_executorch(programs, metadata=None, backend="portable"):
         partitioner, programs = _create_cuda_partitioners(
             programs, is_windows=(backend == "cuda-windows")
         )
+    elif backend == "vulkan":
+        partitioner, programs = _create_vulkan_partitioners(
+            programs, vulkan_force_fp16=vulkan_force_fp16
+        )
     else:
         print("\nLowering to ExecuTorch...")
         partitioner = []
@@ -607,7 +628,7 @@ def main():
         "--backend",
         type=str,
         default="xnnpack",
-        choices=["portable", "xnnpack", "metal", "cuda", "cuda-windows"],
+        choices=["portable", "xnnpack", "metal", "cuda", "cuda-windows", "vulkan"],
         help="Backend for acceleration (default: xnnpack)",
     )
     parser.add_argument(
@@ -672,6 +693,8 @@ def main():
         help="Group size for embedding quantization (default: 0 = per-axis)",
     )
 
+    parser.add_argument("--vulkan_force_fp16", action="store_true")
+
     args = parser.parse_args()
 
     # Validate dtype
@@ -719,7 +742,12 @@ def main():
         qembedding_group_size=args.qembedding_group_size,
     )
 
-    et = lower_to_executorch(programs, metadata=metadata, backend=args.backend)
+    et = lower_to_executorch(
+        programs,
+        metadata=metadata,
+        backend=args.backend,
+        vulkan_force_fp16=args.vulkan_force_fp16,
+    )
 
     pte_path = os.path.join(args.output_dir, "model.pte")
     print(f"\nSaving ExecuTorch program to: {pte_path}")
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
index 2941484bea6..87768cec38b 100644
--- a/examples/models/parakeet/main.cpp
+++ b/examples/models/parakeet/main.cpp
@@ -489,7 +489,7 @@ int main(int argc, char** argv) {
       static_cast<long long>(pred_hidden),
       static_cast<long long>(sample_rate),
       window_stride,
-      encoder_subsampling_factor);
+      static_cast<long long>(encoder_subsampling_factor));
 
   ET_LOG(Info, "Running TDT greedy decode...");
   auto decoded_tokens = greedy_decode_executorch(

From 4856d2967d90432a57a5c3ae1cbb92af0be3f343 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:12 -0700
Subject: [PATCH 03/13] [ET-VK] Fix output offset calculation and add symint
 support to ComputeGraph

Fix output argument indexing in VulkanBackend::execute() and extend
ComputeGraph to transparently handle symint values.

The output loop previously computed the args index as `i + num_inputs`,
which breaks when non-tensor arguments (e.g. symints) sit between the
tensor inputs and outputs in the args array. Fix by computing the offset
from the end: `args.size() - num_outputs`.

ComputeGraph changes add symint support so that operators can read symint
values uniformly:
- `extract_scalar<T>()` now handles SymInt values, allowing operators to
  call extract_scalar on arguments that may be either plain ints or
  symints without special-casing.
- `read_symint()` falls back to reading plain Int values, so values
  stored as Int (rather than SymInt objects) can be read uniformly.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18050
ghstack-source-id: 353546683
@exported-using-ghexport

Differential Revision: [D95970167](https://our.internmc.facebook.com/intern/diff/D95970167/)
---
 backends/vulkan/runtime/VulkanBackend.cpp      | 8 ++++----
 backends/vulkan/runtime/graph/ComputeGraph.cpp | 3 +++
 backends/vulkan/runtime/graph/ComputeGraph.h   | 3 +++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index d4eeb9b1dd4..3b18915eae5 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -671,6 +671,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     ComputeGraph* compute_graph = static_cast<ComputeGraph*>(handle);
 
     const size_t num_inputs = compute_graph->inputs().size();
+    const size_t num_outputs = compute_graph->outputs().size();
     bool should_propagate_resize = false;
 #ifdef ET_EVENT_TRACER_ENABLED
     runtime::EventTracer* event_tracer = context.event_tracer();
@@ -770,14 +771,13 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
             "ETVK_COPY_OUTPUTS",
             /* delegate_debug_id = */ -1);
 #endif // ET_EVENT_TRACER_ENABLED
-    for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
-      const size_t o = i + num_inputs;
+    const size_t output_offset = args.size() - num_outputs;
+    for (size_t i = 0; i < num_outputs; i++) {
+      const size_t o = output_offset + i;
       const ValueRef oref = compute_graph->outputs()[i].value;
       if (compute_graph->val_is_tensor(oref)) {
         VK_CHECK_COND(args[o]->isTensor());
         maybe_resize_output(compute_graph, i, args[o]->toTensor());
-        // args holds inputs directly followed by outputs, so the i'th output
-        // for compute_graph corresponds to the o'th arg
         compute_graph->maybe_cast_and_copy_from_staging(
             compute_graph->outputs()[i].staging,
             args[o]->toTensor().mutable_data_ptr(),
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index bb2df30a174..4b435ae6215 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -725,6 +725,9 @@ void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) {
 }
 
 int32_t ComputeGraph::read_symint(const ValueRef idx) {
+  if (values_.at(idx).isInt()) {
+    return static_cast<int32_t>(values_.at(idx).toInt());
+  }
   return get_symint(idx)->get();
 }
 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 5ce84dd705b..9935b9be51b 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -573,6 +573,9 @@ class ComputeGraph final {
     if (value.isBool()) {
       return static_cast<T>(value.toBool());
     }
+    if (value.isSymInt()) {
+      return utils::safe_downcast<T>(read_symint(idx));
+    }
     VK_THROW("Cannot extract scalar from Value with type ", value.type());
   }
 

From 6f63f59fd0dff4484dcbe9f068534ee3df8e786b Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:14 -0700
Subject: [PATCH 04/13] [ET-VK] Modernize constant_pad_nd

Modernize constant_pad_nd to support ANY_STORAGE (both buffer and
texture). Migrate shaders to BufferMetadata/TextureMetadata with
indexing.glslh and unify dispatch into a single add_constant_pad_nd_node
function using DynamicDispatchNode.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18051
ghstack-source-id: 353546682
@exported-using-ghexport

Differential Revision: [D95970168](https://our.internmc.facebook.com/intern/diff/D95970168/)
---
 backends/vulkan/op_registry.py                |  3 +-
 .../runtime/graph/ops/glsl/pad_buffer.glsl    | 60 +++++++++++++
 .../{pad_channel.yaml => pad_buffer.yaml}     |  7 +-
 .../runtime/graph/ops/glsl/pad_channel.glsl   | 80 -----------------
 .../graph/ops/glsl/pad_height_width.glsl      | 50 -----------
 .../runtime/graph/ops/glsl/pad_texture.glsl   | 86 +++++++++++++++++++
 ...pad_height_width.yaml => pad_texture.yaml} |  7 +-
 .../vulkan/runtime/graph/ops/impl/Pad.cpp     | 70 +++++++--------
 backends/vulkan/test/op_tests/cases.py        | 15 +++-
 9 files changed, 193 insertions(+), 185 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{pad_channel.yaml => pad_buffer.yaml} (67%)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{pad_height_width.yaml => pad_texture.yaml} (65%)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index af2389d72f9..fbd2d0cc7cb 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1298,8 +1298,9 @@ def register_arange():
 @update_features(exir_ops.edge.aten.constant_pad_nd.default)
 def register_constant_pad_nd():
     return OpFeatures(
-        inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
+        supports_resize=True,
     )
 
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl
new file mode 100644
index 00000000000..ea5222c74b3
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+${layout_declare_ubo(B, "ivec4", "pad_per_dim")}
+${layout_declare_ubo(B, "float", "fill_value")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+
+void main() {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_of_bounds(out_bufi, outp)) {
+    return;
+  }
+
+  TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi, out_layout);
+
+  // Subtract pad offsets per dimension to get input tensor index.
+  // Unsigned underflow (when output index < pad offset) wraps to a large
+  // value that fails the out_of_bounds check below.
+  TensorIndex in_tidx = out_tidx;
+  [[unroll]] for (int d = 0; d < 4; d++) {
+    in_tidx.data[0][d] -= uint(pad_per_dim[d]);
+  }
+
+  if (out_of_bounds(in_tidx, inp)) {
+    t_out[out_bufi] = T(fill_value);
+    return;
+  }
+
+  const uint in_bufi = tensor_idx_to_linear_idx(inp, in_tidx);
+  t_out[out_bufi] = t_in[in_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml
similarity index 67%
rename from backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml
index 91306bd4cbf..8271ab2e64c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml
@@ -1,9 +1,6 @@
-pad_channel:
+pad_buffer:
   parameter_names_with_default_values:
-    NDIM: 3
     DTYPE: float
-    PACKING: C_packed
-    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -11,4 +8,4 @@ pad_channel:
       - VALUE: int32
       - VALUE: uint8
   shader_variants:
-    - NAME: pad_channel
+    - NAME: pad_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl
deleted file mode 100644
index 8c01ebef897..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl
+++ /dev/null
@@ -1,80 +0,0 @@
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "in_sizes")}
-${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")}
-${layout_declare_ubo(5, "float", "fill_value")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim);
-
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
-    return;
-  }
-
-  VEC4_T outtex = VEC4_T(fill_value);
-  // mask_z/y/x is used to determine whether need to fecth data from input tensor
-  bool mask_z = (idx.z + 3) < pad_front || idx.z > (pad_front + in_sizes.z - 1);
-  bool mask_y = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1;
-  bool mask_x = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1;
-
-  if (!mask_z && mask_y && mask_x) {
-    // channel_mask is to determine the situation that when padding channel dimension,
-    // in one texel, some elements are filled vaule and some value are from input tensor
-    ivec4 c_ind = ivec4(idx.z) + ivec4(0, 1, 2, 3);
-    ivec4 channel_mask = ivec4(lessThan(c_ind, ivec4(pad_front))) + ivec4(greaterThan(c_ind, ivec4(pad_front + in_sizes.z - 1)));
-
-    ivec4 in_idx = idx;
-    in_idx.x -= pad_left;
-    in_idx.y -= pad_top;
-    in_idx.z -= divup4(pad_front) * 4;
-    const int shift = pad_front % 4;
-    VEC4_T cur_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0);
-    VEC4_T next_in_texel;
-    // When shift is not 0, we need to read 2 texels from input tensor to write into output
-    // for example:
-    // input texel is [[1 2 3 4], [5 6 x x]] and front_pad = 2
-    // output texel is [[p p 1 2], [3 4 5 6]], where p is the filled value then need to fetch 2 texels to fill [3 4 5 6].
-    if (shift != 0) {
-      in_idx.z += 4;
-      next_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0);
-    } else {
-      next_in_texel = cur_in_texel;
-    }
-
-    VEC4_T inter_texel;
-    for (int i = 0; i < 4; i++) {
-      if (i < shift) {
-        inter_texel[i] = cur_in_texel[4-shift+i];
-      } else {
-        inter_texel[i] = next_in_texel[i-shift];
-      }
-    }
-    outtex = inter_texel * (VEC4_T(1) - channel_mask) + outtex * channel_mask;
-  }
-
-  int packed_idx = idx[packed_dim];
-  const int packed_dim_size = out_sizes[packed_dim];
-  if (packed_idx + 3 >= packed_dim_size) {
-    ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3);
-    VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size)));
-    outtex = outtex * valid_idx;
-  }
-
-  imageStore(t_out, pos, outtex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl
deleted file mode 100644
index c5b2c692bdc..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl
+++ /dev/null
@@ -1,50 +0,0 @@
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "in_sizes")}
-${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")}
-${layout_declare_ubo(5, "float", "fill_value")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim);
-
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
-    return;
-  }
-
-  bool mask_height = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1;
-  bool mask_width = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1;
-
-  VEC4_T outtex = VEC4_T(fill_value);
-  if (mask_height && mask_width) {
-    ivec4 in_idx = idx;
-    in_idx.x -= pad_left;
-    in_idx.y -= pad_top;
-    outtex = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0);
-  }
-
-  int packed_idx = idx[packed_dim];
-  const int packed_dim_size = out_sizes[packed_dim];
-  if (packed_idx + 3 >= packed_dim_size) {
-    ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3);
-    VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size)));
-    outtex = outtex * valid_idx;
-  }
-
-  imageStore(t_out, pos, outtex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
new file mode 100644
index 00000000000..12d9fedde03
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+#define T ${texel_load_component_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "common.glslh"
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "TextureMetadata", "inp")}
+${layout_declare_ubo(B, "ivec4", "pad_per_dim")}
+${layout_declare_ubo(B, "float", "fill_value")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+const int packed_dim = get_packed_dim(out_layout);
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  if (out_of_bounds(out_pos, outp)) {
+    return;
+  }
+
+  // Convert the thread position to output tensor indices in element space.
+  // out_tidx.data[packed_dim] is the element index of the first component in
+  // this texel; the remaining three dims are scalar element indices.
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos);
+
+  // Tail texels may have fewer than 4 valid elements; leave extras as 0.
+  const int limit =
+      min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+
+  VEC4_T out_texel = VEC4_T(0);
+
+  // Process each of the (up to 4) elements in this output texel independently.
+  // For each element: subtract pad offsets to obtain the input element index,
+  // then copy from the input if in-bounds or write fill_value if in the padding
+  // region.
+  [[unroll]] for (int comp = 0; comp < limit; comp++) {
+    TensorIndex4D in_tidx = out_tidx;
+    in_tidx.data[packed_dim] += comp;
+    in_tidx.data[0] -= pad_per_dim[0];
+    in_tidx.data[1] -= pad_per_dim[1];
+    in_tidx.data[2] -= pad_per_dim[2];
+    in_tidx.data[3] -= pad_per_dim[3];
+
+    // Signed underflow (output index < pad) produces a negative value that
+    // fails the >= 0 check, correctly identifying the padding region.
+    if (in_tidx.data[0] >= 0 && in_tidx.data[0] < inp.sizes[0] &&
+        in_tidx.data[1] >= 0 && in_tidx.data[1] < inp.sizes[1] &&
+        in_tidx.data[2] >= 0 && in_tidx.data[2] < inp.sizes[2] &&
+        in_tidx.data[3] >= 0 && in_tidx.data[3] < inp.sizes[3]) {
+      TextureElementIndex elem =
+          tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx);
+      VEC4_T in_texel = texelFetch(t_in, elem.pos, 0);
+      out_texel[comp] = T(in_texel[elem.comp]);
+    } else {
+      out_texel[comp] = T(fill_value);
+    }
+  }
+
+  imageStore(t_out, out_pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml
similarity index 65%
rename from backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml
index 2eb57291bb2..f2a40d289bf 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml
@@ -1,9 +1,6 @@
-pad_height_width:
+pad_texture:
   parameter_names_with_default_values:
-    NDIM: 3
     DTYPE: float
-    PACKING: C_packed
-    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -11,4 +8,4 @@ pad_height_width:
       - VALUE: int32
       - VALUE: uint8
   shader_variants:
-    - NAME: pad_height_width
+    - NAME: pad_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
index d225af05633..7f872512c05 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
@@ -16,26 +16,24 @@
 
 namespace vkcompute {
 
-struct PadParam final {
-  int32_t left;
-  int32_t top;
-  int32_t front;
-};
-
-PadParam creat_pad_param(const std::vector<int64_t>& pad) {
-  if (pad.size() == 2) {
-    return PadParam{static_cast<int32_t>(pad[0]), 0, 0};
-  } else if (pad.size() == 4) {
-    return PadParam{
-        static_cast<int32_t>(pad[0]), static_cast<int32_t>(pad[2]), 0};
-  } else if (pad.size() == 6) {
-    return PadParam{
-        static_cast<int32_t>(pad[0]),
-        static_cast<int32_t>(pad[2]),
-        static_cast<int32_t>(pad[4])};
-  } else {
-    VK_THROW("invalid pad form");
+utils::ivec4 create_pad_per_dim(const std::vector<int64_t>& pad) {
+  // pad contains pairs of (before, after) values for each dimension, starting
+  // from the innermost (W). BufferMetadata/TextureMetadata use WHCN order, so
+  // map pad[0]->W, pad[2]->H, pad[4]->C, pad[6]->N.
+  utils::ivec4 pad_per_dim{0, 0, 0, 0};
+  if (pad.size() >= 2) {
+    pad_per_dim[0] = static_cast<int32_t>(pad[0]);
+  }
+  if (pad.size() >= 4) {
+    pad_per_dim[1] = static_cast<int32_t>(pad[2]);
+  }
+  if (pad.size() >= 6) {
+    pad_per_dim[2] = static_cast<int32_t>(pad[4]);
+  }
+  if (pad.size() >= 8) {
+    pad_per_dim[3] = static_cast<int32_t>(pad[6]);
   }
+  return pad_per_dim;
 }
 
 void resize_constant_pad_node(
@@ -59,40 +57,32 @@ void add_constant_pad_nd_node(
     ComputeGraph& graph,
     const ValueRef& in,
     const ValueRef& pad,
-    const ValueRef& fill_value,
+    const ValueRef& fill_value_ref,
     const ValueRef& out) {
-  const float fill_value_val = graph.extract_scalar<float>(fill_value);
+  const float fill_value_val = graph.extract_scalar<float>(fill_value_ref);
   const IntListPtr pad_vec = graph.get_int_list(pad);
+  const utils::ivec4 pad_per_dim = create_pad_per_dim(*pad_vec);
 
-  std::string kernel_name = "";
-  const PadParam pad_param = creat_pad_param(*pad_vec);
-
-  if (pad_vec->size() <= 4) {
-    kernel_name = "pad_height_width";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, graph.dtype_of(out));
-  } else {
-    kernel_name = "pad_channel";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, graph.dtype_of(out));
-  }
+  std::string kernel_name = "pad";
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
       default_pick_global_wg_size,
       default_pick_local_wg_size,
-      // Inputs and Outputs
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
-      // Shader params buffers
-      {graph.sizes_ubo(out),
-       graph.sizes_ubo(in),
-       graph.create_params_buffer(pad_param),
+      // Parameter buffers
+      {graph.meta_ubo(out),
+       graph.meta_ubo(in),
+       graph.create_params_buffer(pad_per_dim),
        graph.create_params_buffer(fill_value_val)},
       // Push Constants
       {},
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(out)},
       // Resize Args
       {pad},
       // Resizing Logic
@@ -100,7 +90,7 @@ void add_constant_pad_nd_node(
 }
 
 void constant_pad_nd(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]);
+  add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 1ecf1c677ed..ad77cf509ae 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -11,7 +11,6 @@
 
 from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite
 
-
 # Prime numbers dim sizes for testing
 XL = 113
 L = 89
@@ -1898,14 +1897,22 @@ def get_constant_pad_nd_inputs():
         [
             ([S1, S2], [1, 1], 24.0),
             ([M, M1, M2], [2, 2], 23.2),
-            ([L, M, M1, M2], [3, 5], 12.2),
+            ([S2, M, M1, M2], [3, 5], 12.2),
             ([S1, S2], [1, 1, 1, 1], 24.0),
             ([M, M1, M2], [2, 2, 2, 2], 23.2),
-            ([L, M, M1, M2], [3, 5, 3, 5], 12.2),
+            ([S2, M, M1, M2], [3, 5, 3, 5], 12.2),
             ([M, M1, M2], [1, 2, 3, 4, 5, 6], 23.2),
-            ([L, M, M1, M2], [3, 3, 3, 3, 3, 3], 12.2),
+            ([S2, M, M1, M2], [3, 3, 3, 3, 3, 3], 12.2),
         ]
     )
+    test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kChannelsPacked",
+    ]
+    test_suite.storage_types = [
+        "utils::kTexture3D",
+        "utils::kBuffer",
+    ]
     return test_suite
 
 

From ad54f364eae4aed223814ca2f949dfee51adeeb7 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:16 -0700
Subject: [PATCH 05/13] [ET-VK] Modernize arange and full

Modernize arange and full operators to support ANY_STORAGE. Add separate
buffer and texture shader variants using BufferMetadata/TextureMetadata
with indexing.glslh. Unify dispatch with add_storage_type_suffix and
DynamicDispatchNode. Add symint support via read_symint_list for dynamic
output sizes.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18052
ghstack-source-id: 353546693
@exported-using-ghexport

Differential Revision: [D95970169](https://our.internmc.facebook.com/intern/diff/D95970169/)
---
 backends/vulkan/op_registry.py                |  4 +-
 .../vulkan/runtime/graph/ops/glsl/arange.glsl | 39 -------------
 .../runtime/graph/ops/glsl/arange_buffer.glsl | 38 +++++++++++++
 .../glsl/{arange.yaml => arange_buffer.yaml}  | 10 ++--
 .../graph/ops/glsl/arange_texture.glsl        | 55 +++++++++++++++++++
 .../graph/ops/glsl/arange_texture.yaml        | 16 ++++++
 .../runtime/graph/ops/glsl/full_buffer.glsl   | 36 ++++++++++++
 .../runtime/graph/ops/glsl/full_buffer.yaml   | 18 ++++++
 .../ops/glsl/{full.glsl => full_texture.glsl} | 28 ++++++----
 .../ops/glsl/{full.yaml => full_texture.yaml} |  6 +-
 .../vulkan/runtime/graph/ops/impl/Arange.cpp  |  9 +--
 .../vulkan/runtime/graph/ops/impl/Full.cpp    |  7 ++-
 backends/vulkan/test/op_tests/cases.py        |  7 +++
 13 files changed, 203 insertions(+), 70 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/arange.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/arange_buffer.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{arange.yaml => arange_buffer.yaml} (78%)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/arange_texture.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/full_buffer.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/full_buffer.yaml
 rename backends/vulkan/runtime/graph/ops/glsl/{full.glsl => full_texture.glsl} (54%)
 rename backends/vulkan/runtime/graph/ops/glsl/{full.yaml => full_texture.yaml} (88%)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index fbd2d0cc7cb..7e3b2f0f77b 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1285,7 +1285,7 @@ def check_index_tensor_node(node: torch.fx.Node) -> bool:
 @update_features(exir_ops.edge.aten.arange.start_step)
 def register_arange():
     return OpFeatures(
-        inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_T,
     )
 
@@ -1321,7 +1321,7 @@ def register_constant_pad_nd():
 )
 def register_full_cpp_ops():
     return OpFeatures(
-        inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
     )
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange.glsl
deleted file mode 100644
index 8b1841888ad..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/arange.glsl
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_ubo(1, "ivec4", "sizes")}
-${layout_declare_ubo(2, "float", "start")}
-${layout_declare_ubo(3, "float", "step")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim);
-
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
-    return;
-  }
-
-  VEC4_T outtex = VEC4_T(start + pos.x * step, 0, 0, 0);
-
-  imageStore(t_out, pos, outtex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange_buffer.glsl
new file mode 100644
index 00000000000..906ed91a466
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/arange_buffer.glsl
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "float", "start")}
+${layout_declare_ubo(B, "float", "step")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_of_bounds(out_bufi, outp)) {
+    return;
+  }
+
+  t_out[out_bufi] = T(start + out_bufi * step);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange.yaml b/backends/vulkan/runtime/graph/ops/glsl/arange_buffer.yaml
similarity index 78%
rename from backends/vulkan/runtime/graph/ops/glsl/arange.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/arange_buffer.yaml
index 37b2027db85..d53d0387788 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/arange.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/arange_buffer.yaml
@@ -4,16 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-arange:
+arange_buffer:
   parameter_names_with_default_values:
-    NDIM: 3
-    DTYPE: int32
-    STORAGE: texture3d
-    PACKING: C_packed
+    DTYPE: float
+    STORAGE: buffer
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
       - VALUE: int32
   shader_variants:
-    - NAME: arange
+    - NAME: arange_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl
new file mode 100644
index 00000000000..677b8213997
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.glsl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "float", "start")}
+${layout_declare_ubo(B, "float", "step")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+const int packed_dim = get_packed_dim(out_layout);
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  if (out_of_bounds(out_pos, outp)) {
+    return;
+  }
+
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos);
+
+  // arange output is 1D, so the W dimension holds the element index.
+  // Compute the value for each element in the texel along the packed dim.
+  VEC4_T outtex = VEC4_T(0);
+  int limit = min(
+      4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+  for (int comp = 0; comp < limit; comp++) {
+    int elem_idx = out_tidx.data[0]; // W index is the linear element index
+    outtex[comp] = VEC4_T(start + elem_idx * step).x;
+    out_tidx.data[packed_dim]++;
+  }
+
+  imageStore(t_out, out_pos, outtex);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.yaml
new file mode 100644
index 00000000000..2cd9255d754
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/arange_texture.yaml
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+arange_texture:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int32
+  shader_variants:
+    - NAME: arange_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/full_buffer.glsl
new file mode 100644
index 00000000000..729baec324d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/full_buffer.glsl
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "float", "fill_value")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_of_bounds(out_bufi, outp)) {
+    return;
+  }
+
+  t_out[out_bufi] = T(fill_value);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/full_buffer.yaml
new file mode 100644
index 00000000000..7b1af0ab9c6
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/full_buffer.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+full_buffer:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int32
+      - VALUE: uint8
+  shader_variants:
+    - NAME: full_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.glsl b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
similarity index 54%
rename from backends/vulkan/runtime/graph/ops/glsl/full.glsl
rename to backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
index 81f1f182cdf..5295e72ce5d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/full.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
@@ -8,35 +8,39 @@
 
 #version 450 core
 
-#define PRECISION ${PRECISION}
+${define_required_extensions("texture3d", DTYPE)}
 
-#define VEC4_T ${texel_type(DTYPE)}
+#define PRECISION ${PRECISION}
 
-#define POS ${get_pos[NDIM]("pos")}
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
 
-#include "indexing_utils.h"
+${define_active_storage_type("texture3d")}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_ubo(B, "ivec4", "sizes")}
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
 ${layout_declare_ubo(B, "float", "fill_value")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+const int packed_dim = get_packed_dim(out_layout);
 
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim);
 
-  if (any(greaterThanEqual(idx, sizes))) {
+  if (out_of_bounds(pos, outp)) {
     return;
   }
 
   VEC4_T outtex = VEC4_T(fill_value);
-  const int packed_dim_size = sizes[packed_dim];
-  int packed_idx = idx[packed_dim];
+
+  TensorIndex4D tidx = texture_pos_to_tensor4d_idx_simple(outp, pos);
+  const int packed_dim_size = outp.sizes[packed_dim];
+  int packed_idx = tidx.data[packed_dim];
 
   if (packed_idx + 3 >= packed_dim_size) {
     ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3);
@@ -44,5 +48,5 @@ void main() {
     outtex = outtex * valid_idx;
   }
 
-  imageStore(t_out, POS, outtex);
+  imageStore(t_out, pos, outtex);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full_texture.yaml
similarity index 88%
rename from backends/vulkan/runtime/graph/ops/glsl/full.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/full_texture.yaml
index 5d7a983cae3..80e87dde5fd 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/full_texture.yaml
@@ -4,11 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-full:
+full_texture:
   parameter_names_with_default_values:
-    NDIM: 3
     DTYPE: float
-    PACKING: C_packed
     STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
@@ -17,4 +15,4 @@ full:
       - VALUE: int32
       - VALUE: uint8
   shader_variants:
-    - NAME: full
+    - NAME: full_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp b/backends/vulkan/runtime/graph/ops/impl/Arange.cpp
index 3171fbeb488..bf6345c0f16 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Arange.cpp
@@ -44,13 +44,13 @@ void check_arange_input(
     const ValueRef start,
     const ValueRef end,
     const ValueRef step) {
-  if (!graph.val_is_none(start) && !graph.val_is_int(end)) {
+  if (!graph.val_is_none(start) && !graph.val_is_int(start)) {
     VK_THROW("arange: start must be int!");
   }
   if (!graph.val_is_none(end) && !graph.val_is_int(end)) {
     VK_THROW("arange: end must be int!");
   }
-  if (!graph.val_is_none(step) && !graph.val_is_int(end)) {
+  if (!graph.val_is_none(step) && !graph.val_is_int(step)) {
     VK_THROW("arange: step must be int!");
   }
 }
@@ -85,6 +85,7 @@ void add_arange_node(
 
   std::string kernel_name("arange");
   kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
@@ -95,13 +96,13 @@ void add_arange_node(
       // Inputs and Outputs
       {{out, vkapi::kWrite}},
       // Shader params buffers
-      {graph.sizes_ubo(out),
+      {graph.meta_ubo(out),
        graph.create_params_buffer(start_val),
        graph.create_params_buffer(step_val)},
       // Push Constants
       {},
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(out)},
       // Resize Args
       {start, end, step},
       // Resizing Logic
diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp
index fe2676e91e0..5458fdce7df 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Full.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Full.cpp
@@ -25,7 +25,7 @@ void resize_full_node(
   if (graph->val_is_tensor(extra_args.at(0))) {
     out_sizes = graph->sizes_of(extra_args.at(0));
   } else {
-    out_sizes = *graph->get_int_list(extra_args.at(0));
+    out_sizes = graph->extract_int_or_symint_list(extra_args.at(0));
   }
 
   graph->virtual_resize(out, out_sizes);
@@ -41,6 +41,7 @@ void add_full_node(
   std::string kernel_name("full");
   kernel_name.reserve(kShaderNameReserve);
 
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
@@ -51,11 +52,11 @@ void add_full_node(
       // Inputs and Outputs
       {{out, vkapi::kWrite}},
       // Shader params buffers
-      {graph.sizes_ubo(out), graph.create_params_buffer(fill_value_val)},
+      {graph.meta_ubo(out), graph.create_params_buffer(fill_value_val)},
       // Push Constants
       {},
       // Specialization Constants
-      {graph.packed_dim_of(out)},
+      {graph.hashed_layout_of(out)},
       // Resize Args
       {size_or_in},
       // Resizing Logic
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index ad77cf509ae..081dde4a619 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -839,6 +839,7 @@ def get_full_inputs():
             ([L, M, M1, M2], 2.72),
         ]
     )
+    test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     return test_suite
 
 
@@ -873,6 +874,7 @@ def get_ones_inputs():
             ([L, M, M1, M2]),
         ]
     )
+    test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     return test_suite
 
 
@@ -1886,8 +1888,13 @@ def get_arange_inputs():
     )
 
     test_suite.layouts = [
+        "utils::kWidthPacked",
         "utils::kChannelsPacked",
     ]
+    test_suite.storage_types = [
+        "utils::kTexture3D",
+        "utils::kBuffer",
+    ]
     return test_suite
 
 

From 91c5dc7ed9ac9f33418c2fd6372eaf99081779de Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:18 -0700
Subject: [PATCH 06/13] [ET-VK] Modernize expand_copy

Modernize expand_copy to support ANY_STORAGE. Add buffer shader variant
using BufferMetadata with indexing.glslh. Unify dispatch with
add_storage_type_suffix and DynamicDispatchNode. Add resize function and
symint support for dynamic target sizes.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18053
ghstack-source-id: 353546690
@exported-using-ghexport

Differential Revision: [D95970162](https://our.internmc.facebook.com/intern/diff/D95970162/)
---
 backends/vulkan/op_registry.py                |  2 +-
 .../graph/ops/glsl/expand_texture.glsl        | 70 +++++++++++++++++++
 .../graph/ops/glsl/expand_texture.yaml        | 11 +++
 .../vulkan/runtime/graph/ops/impl/Expand.cpp  | 48 ++++++++++---
 backends/vulkan/test/op_tests/cases.py        |  1 +
 5 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/expand_texture.yaml

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 7e3b2f0f77b..190074f998a 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1120,7 +1120,7 @@ def register_gather():
 @update_features(exir_ops.edge.aten.expand_copy.default)
 def register_expand_copy():
     return OpFeatures(
-        inputs_storage=utils.ANY_BUFFER,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=False,
         supports_highdim=True,
diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl
new file mode 100644
index 00000000000..6409e83eaa1
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.glsl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+#define T ${texel_load_component_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_outp", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_inp", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "TextureMetadata", "inp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+const int packed_dim = get_packed_dim(out_layout);
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  if (out_of_bounds(out_pos, outp)) {
+    return;
+  }
+
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos);
+
+  VEC4_T out_texel = VEC4_T(0);
+
+  int limit = min(
+      4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
+  for (int comp = 0; comp < 4; comp++) {
+    if (comp >= limit) {
+      break;
+    }
+
+    // Map output tensor index to input tensor index using modulo
+    TensorIndex4D inp_tidx;
+    inp_tidx.data.x = out_tidx.data.x % inp.sizes.x;
+    inp_tidx.data.y = out_tidx.data.y % inp.sizes.y;
+    inp_tidx.data.z = out_tidx.data.z % inp.sizes.z;
+    inp_tidx.data.w = out_tidx.data.w % inp.sizes.w;
+
+    TextureElementIndex inp_elem =
+        tensor4d_idx_to_texture_element_idx_simple(inp, inp_tidx);
+
+    VEC4_T inp_texel = texelFetch(t_inp, inp_elem.pos, 0);
+    out_texel[comp] = inp_texel[inp_elem.comp];
+
+    out_tidx.data[packed_dim]++;
+  }
+
+  imageStore(t_outp, out_pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.yaml
new file mode 100644
index 00000000000..461b39b11bf
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/expand_texture.yaml
@@ -0,0 +1,11 @@
+expand_texture:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int32
+      - VALUE: uint8
+  shader_variants:
+    - NAME: expand_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Expand.cpp b/backends/vulkan/runtime/graph/ops/impl/Expand.cpp
index 1623a26b2a1..6308d333eaf 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Expand.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Expand.cpp
@@ -16,7 +16,39 @@
 
 namespace vkcompute {
 
-void add_expand_buffer_node(
+void resize_expand_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef in = args.at(1).refs.at(0);
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef size_ref = extra_args.at(0);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+  const std::vector<int64_t> target_sizes =
+      graph->extract_int_or_symint_list(size_ref);
+
+  VK_CHECK_COND(
+      target_sizes.size() >= in_sizes.size(),
+      "expand: target sizes must have at least as many dims as input");
+  VK_CHECK_COND(
+      !target_sizes.empty(), "expand: target sizes must not be empty");
+
+  const size_t dim_offset = target_sizes.size() - in_sizes.size();
+  std::vector<int64_t> out_sizes(target_sizes.size());
+  for (size_t i = 0; i < target_sizes.size(); i++) {
+    if (target_sizes[i] == -1 && i >= dim_offset) {
+      out_sizes[i] = in_sizes[i - dim_offset];
+    } else if (target_sizes[i] == -1) {
+      out_sizes[i] = 1;
+    } else {
+      out_sizes[i] = target_sizes[i];
+    }
+  }
+  graph->virtual_resize(out, out_sizes);
+}
+
+void add_expand_node(
     ComputeGraph& graph,
     const ValueRef in,
     const ValueRef size,
@@ -27,8 +59,8 @@ void add_expand_buffer_node(
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   vkapi::ParamsBindList param_buffers = {
-      graph.buffer_meta_ubo(out),
-      graph.buffer_meta_ubo(in),
+      graph.meta_ubo(out),
+      graph.meta_ubo(in),
   };
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
@@ -42,11 +74,11 @@ void add_expand_buffer_node(
       // Push Constants
       {},
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(out)},
       // Resize Args
       {size},
       // Resizing Logic
-      nullptr));
+      resize_expand_node));
 }
 
 void expand(ComputeGraph& graph, const std::vector<ValueRef>& args) {
@@ -57,11 +89,7 @@ void expand(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   (void)implicit;
   const ValueRef out = args.at(idx++);
 
-  if (graph.is_buffer_storage(out)) {
-    return add_expand_buffer_node(graph, in, size, out);
-  }
-
-  VK_THROW("Expand operator only supports buffer storage");
+  add_expand_node(graph, in, size, out);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 081dde4a619..2ec0d721f4c 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -2031,6 +2031,7 @@ def get_expand_inputs():
     )
     test_suite.storage_types = [
         "utils::kBuffer",
+        "utils::kTexture3D",
     ]
     test_suite.layouts = [
         "utils::kWidthPacked",

From 2fa6b5c911b8c14edfe5ee3ac3dde53f0bedb2be Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:19 -0700
Subject: [PATCH 07/13] [ET-VK] Modernize softmax and log_softmax

Modernize softmax and log_softmax to support ANY_STORAGE. Migrate both
buffer and texture shaders from indexing_utils.h to indexing.glslh with
BufferMetadata/TextureMetadata UBOs. Merge separate texture and buffer
dispatch functions into a unified add_softmax_node using
add_storage_type_suffix and graph.meta_ubo().

Pull Request resolved: https://github.com/pytorch/executorch/pull/18054
ghstack-source-id: 353546688
@exported-using-ghexport

Differential Revision: [D95970171](https://our.internmc.facebook.com/intern/diff/D95970171/)
---
 backends/vulkan/op_registry.py                |   2 +-
 .../runtime/graph/ops/glsl/softmax.glsl       | 152 ++++--------------
 .../runtime/graph/ops/glsl/softmax.yaml       |   4 +-
 .../graph/ops/glsl/softmax_buffer.glsl        | 124 ++++++++++++++
 .../graph/ops/glsl/softmax_buffer.yaml        |  21 +++
 .../vulkan/runtime/graph/ops/impl/Softmax.cpp |  93 +++++++----
 backends/vulkan/test/op_tests/cases.py        |   2 +
 7 files changed, 241 insertions(+), 157 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.yaml

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 190074f998a..45ce1359649 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -327,7 +327,7 @@ def check_to_copy_node(node: torch.fx.Node) -> bool:
 )
 def register_softmax_cpp_ops():
     return OpFeatures(
-        inputs_storage=utils.ANY_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
     )
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
index 3176d0142bb..bf7facae761 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
@@ -20,94 +20,49 @@ ${define_active_storage_type(STORAGE)}
 
 layout(std430) buffer;
 
+#include "indexing.glslh"
+
 ${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)}
 
-layout(push_constant) uniform restrict Block {
-  ivec4 tin_sizes;
-  ivec3 tout_limits;
-};
+${layout_declare_ubo(B, "TextureMetadata", "in_meta")}
+${layout_declare_ubo(B, "TextureMetadata", "out_meta")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = 0;
-layout(constant_id = 4) const int reduce_dim = 0;
-layout(constant_id = 5) const int group_dim = 1;
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+const int packed_dim = get_packed_dim(out_layout);
 
-// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of
-// threads that will co-operate to compute one reduction output. There may be
-// multiple groups computing distinct reduction outputs within one work group.
-#define NWORKERS 4
+${layout_declare_spec_const(C, "int", "reduce_dim", "0")}
+${layout_declare_spec_const(C, "int", "group_dim", "1")}
 
-// Sets an upper limit on the total size of a work group based on how many
-// elements are allocated in the shared memory array below. Each thread in the
-// work group will write into its assigned element in the shared array.
+#define NWORKERS 4
 #define MAX_NTHREADS 16
 
 shared vec4 shared_max[MAX_NTHREADS];
 shared vec4 shared_sum[MAX_NTHREADS];
 
-#include "indexing_utils.h"
-
 int tid_to_smi(const ivec2 tid) {
   return tid.x + tid.y * NWORKERS;
 }
 
-/*
- * The shaders below compute softmax for a tensor. Softmax is an interesting mix
- * between a reduction operator and a unary elementwise operator, defined as
- * exp(x) / (sum of exp(x)). The general flow of the computation is:
- *
- * First, find the maximum element along the reduction dim. The maximum element
- * is used to preserve numerical stability, since division of exponents is
- * translation invariant.
- *
- * Next, compute the sum of exp(x - max_element) along the reduction dim.
- *
- * Finally, for each element along the reduction dim, we compute the output as
- * exp(x - max_element) / sum_of_exponents.
- *
- * The shaders below also utilize shared memory to have multiple threads help
- * compute the max and sum reduction operations. A total of NGROUPS x NWORKERS
- * threads are launched. Each group works on a unique reduction "row", and
- * within a group NWORKERS threads co-operate to compute the max and sum of one
- * "row". Each worker in the group is responsible for computing a partial output
- * of the "row" and uploading it to shared memory; the overall reduction output
- * can then be determined by aggregating the partial outputs stored in shared
- * memory.
- *
- * As a caveat, this shader does not currently support cases where `batch` > 1
- * and the reduce dim happens to also be the batch concatenation dim.  To support
- * this, there will need to be additional logic to set the starting value of
- * `scan_pos[reduce_dim]`. Since this is not expected to be a common use-case,
- * supporting this case is left as an exercise for when it is required.
- *
- * As a final note, log softmax is supported with this shader as well since via
- * the op1 and op2 macro definitions. See the corresponding YAML file for more
- * details.
- */
-
 /*
  * Computes softmax where the reduction dim is orthogonal to the packed dim.
  * This case is simpler because each element of a texel belongs to a separate
  * reduction dim, meaning we don't have to perform reduction along a texel.
  */
 void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
-  // shared memory index of this thread
   const int smi = tid_to_smi(tid);
-  // used to iterate over all shared memory in the group
   int group_i;
 
   scan_pos[reduce_dim] = tid.x;
-  vec4 max_elements = load_texel(tin, scan_pos);
-  // This thread computes a partial maximum
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  vec4 max_elements = texelFetch(tin, scan_pos, 0);
+  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
-    max_elements = max(max_elements, load_texel(tin, scan_pos));
+    max_elements = max(max_elements, texelFetch(tin, scan_pos, 0));
   }
   shared_max[smi] = max_elements;
   barrier();
-  // Iterate over the partial maximums to obtain the overall maximum
   group_i = tid.y * NWORKERS;
   max_elements = shared_max[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
@@ -116,63 +71,44 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
 
   scan_pos[reduce_dim] = tid.x;
   vec4 denominators = vec4(0);
-  // Compute partial sum
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
-    denominators += exp(load_texel(tin, scan_pos) - max_elements);
+    denominators += exp(texelFetch(tin, scan_pos, 0) - max_elements);
   }
   shared_sum[smi] = denominators;
   barrier();
-  // Iterate over the partial sums to obtain the overall sum
   group_i = tid.y * NWORKERS;
   denominators = shared_sum[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
     denominators += shared_sum[group_i];
   }
 
-  // Determine if there are any padding elements in the final texel of the
-  // packed dimension
-  const int nspill = mod4(tin_sizes[packed_dim]);
-  // Detect if this thread is working on the final texels of the packed
-  // dimension, which may have padding elements
+  const int nspill = mod_4(in_meta.sizes[packed_dim]);
   const bool is_last_texel =
-      scan_pos[packed_dim] == (tout_limits[packed_dim] - 1);
+      scan_pos[packed_dim] == (out_meta.limits[packed_dim] - 1);
 
   scan_pos[reduce_dim] = tid.x;
-  for (int i = tid.x; i < tin_sizes[reduce_dim];
+  for (int i = tid.x; i < in_meta.sizes[reduce_dim];
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
-    const vec4 numerators = op1(load_texel(tin, scan_pos) - max_elements);
-    // Clamp denominator to avoid 0/0 = NaN when all exp values underflow.
+    const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_elements);
     const vec4 safe_denom = max(denominators, vec4(1e-37));
     vec4 outtex = op2(numerators, safe_denom);
-    // Replace any NaN/Inf with 0 using IEEE 754 bit-level manipulation.
-    // This avoids isnan()/x!=x which may not work reliably on all GPU drivers:
-    // - OpIsNan may have driver bugs for certain NaN bit patterns
-    // - OpFOrdNotEqual(NaN,NaN) = false (ordered comparison semantics)
-    // NaN/Inf pattern: all exponent bits set = (bits & 0x7F800000) == 0x7F800000
     {
       uvec4 bits = floatBitsToUint(outtex);
-      // Build a mask: 0xFFFFFFFF where NaN/Inf (exponent all-ones), else 0
       uvec4 nan_inf_mask = uvec4(
           ((bits.x & 0x7F800000u) == 0x7F800000u) ? 0xFFFFFFFFu : 0u,
           ((bits.y & 0x7F800000u) == 0x7F800000u) ? 0xFFFFFFFFu : 0u,
           ((bits.z & 0x7F800000u) == 0x7F800000u) ? 0xFFFFFFFFu : 0u,
           ((bits.w & 0x7F800000u) == 0x7F800000u) ? 0xFFFFFFFFu : 0u);
-      // Zero out bits where NaN/Inf: normal values are unchanged
       outtex = uintBitsToFloat(bits & ~nan_inf_mask);
     }
-    // For the last texel in the packed dim, make sure that the padding elements
-    // are explicitly set to 0. Otherwise, they may influence computations later
-    // down the line.
     if (is_last_texel && nspill > 0) {
       [[unroll]] for (int i = nspill; i < 4; ++i) {
         outtex[i] = 0;
       }
     }
-    write_texel(tout, scan_pos, outtex);
+    imageStore(tout, scan_pos, outtex);
   }
-  // Flush outstanding imageStore writes so they're committed to memory and
-  // visible to subsequent GPU operations on this image.
   memoryBarrierImage();
 }
 
@@ -185,44 +121,31 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
  * multiple of 4) so that they do not influence the output of reduction.
  */
 void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
-  // shared memory index of this thread
   const int smi = tid_to_smi(tid);
-  // used to iterate over all shared memory in the group
   int group_i;
 
-  const int nspill = mod4(tin_sizes[packed_dim]);
-  const int reduce_len = tin_sizes[packed_dim] - nspill;
+  const int nspill = mod_4(in_meta.sizes[packed_dim]);
+  const int reduce_len = in_meta.sizes[packed_dim] - nspill;
 
   scan_pos[reduce_dim] = tid.x;
-  // Initialize with -FLT_MAX to avoid contaminating the maximum with out-of-
-  // bounds texture reads. When NWORKERS > number of texels (e.g. reduce_len=12
-  // has 3 texels but NWORKERS=4), worker threads with no valid texels would
-  // otherwise load from an OOB index and get 0, which corrupts the max for
-  // rows where all values are negative and causes denominator underflow -> NaN.
   vec4 max_elements = vec4(-3.402823e+38);
   for (int i = tid.x * 4; i < reduce_len;
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
-    max_elements = max(max_elements, load_texel(tin, scan_pos));
+    max_elements = max(max_elements, texelFetch(tin, scan_pos, 0));
   }
-  // For the last texel in the dim, if there are padding elements then each
-  // element of the texel needs to be processed individually such that the
-  // padding elements are ignored
-  if (scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1 && nspill > 0) {
-    const vec4 intex = load_texel(tin, scan_pos);
+  if (scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1 && nspill > 0) {
+    const vec4 intex = texelFetch(tin, scan_pos, 0);
     for (int i = 0; i < nspill; ++i) {
       max_elements.x = max(intex[i], max_elements.x);
     }
   }
   shared_max[smi] = max_elements;
   barrier();
-  // Iterate over the partial maximums to obtain the overall maximum
   group_i = tid.y * NWORKERS;
   max_elements = shared_max[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
     max_elements = max(max_elements, shared_max[group_i]);
   }
-  // Each element of the texel is itself a partial maximum; iterate over the
-  // texel to find the actual maximum
   float max_element = max_elements.x;
   [[unroll]] for (int i = 1; i < 4; ++i) {
     max_element = max(max_elements[i], max_element);
@@ -232,49 +155,40 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
   vec4 denominators = vec4(0);
   for (int i = tid.x * 4; i < reduce_len;
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
-    denominators += exp(load_texel(tin, scan_pos) - max_element);
+    denominators += exp(texelFetch(tin, scan_pos, 0) - max_element);
   }
-  // For the last texel in the dim, if there are padding elements then each
-  // element of the texel needs to be processed individually such that the
-  // padding elements are ignored
-  if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) {
-    const vec4 intex = load_texel(tin, scan_pos);
+  if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) {
+    const vec4 intex = texelFetch(tin, scan_pos, 0);
     for (int i = 0; i < nspill; ++i) {
       denominators.x += exp(intex[i] - max_element);
     }
   }
   shared_sum[smi] = denominators;
   barrier();
-  // Iterate over the partial sums to obtain the overall sum
   group_i = tid.y * NWORKERS;
   denominators = shared_sum[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
     denominators += shared_sum[group_i];
   }
-  // Reduce over the accumulated texel to find the overall sum
   float denominator = 0;
   [[unroll]] for (int i = 0; i < 4; ++i) {
     denominator += denominators[i];
   }
-  // Clamp denominator to avoid 0/0 = NaN when all exp values underflow.
   const float safe_denominator = max(denominator, 1e-37);
 
   scan_pos[reduce_dim] = tid.x;
   for (int i = tid.x * 4; i < reduce_len;
        i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
-    const vec4 numerators = op1(load_texel(tin, scan_pos) - max_element);
-    write_texel(tout, scan_pos, op2(numerators, safe_denominator));
+    const vec4 numerators = op1(texelFetch(tin, scan_pos, 0) - max_element);
+    imageStore(tout, scan_pos, op2(numerators, safe_denominator));
   }
-  // For the last texel in the dim, if there are padding elements then the
-  // padding elements need to be set to 0 explicitly, otherwise they may
-  // influence subsequent operations.
-  if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) {
-    const vec4 numerator = op1(load_texel(tin, scan_pos) - max_element);
+  if (nspill > 0 && scan_pos[reduce_dim] == out_meta.limits[reduce_dim] - 1) {
+    const vec4 numerator = op1(texelFetch(tin, scan_pos, 0) - max_element);
     vec4 outtex = op2(numerator, safe_denominator);
     [[unroll]] for (int i = nspill; i < 4; ++i) {
       outtex[i] = 0;
     }
-    write_texel(tout, scan_pos, outtex);
+    imageStore(tout, scan_pos, outtex);
   }
 }
 
@@ -286,7 +200,7 @@ void main() {
       gl_LocalInvocationID[reduce_dim],
       gl_LocalInvocationID[group_dim]);
 
-  if (any(greaterThanEqual(scan_pos, tout_limits))) {
+  if (any(greaterThanEqual(scan_pos, out_meta.limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml b/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml
index d50bbb85f33..fb7d9969e85 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml
@@ -15,7 +15,7 @@ softmax:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: softmax
-    - NAME: log_softmax
+    - NAME: softmax_texture3d
+    - NAME: log_softmax_texture3d
       OPERATOR1: X
       OPERATOR2: X - log(Y)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.glsl
new file mode 100644
index 00000000000..a49d599f1df
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.glsl
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions(STORAGE, DTYPE)}
+
+#define PRECISION ${PRECISION}
+#define T ${buffer_scalar_type(DTYPE)}
+
+#define op1(X) ${OPERATOR1}
+
+#define op2(X, Y) ${OPERATOR2}
+
+${define_active_storage_type(STORAGE)}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "out_buf", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "in_buf", DTYPE, STORAGE)}
+
+${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int reduce_dim = 0;
+
+#define NWORKERS 4
+#define MAX_NTHREADS 16
+
+shared T shared_max[NWORKERS];
+shared T shared_sum[NWORKERS];
+
+/*
+ * Buffer-based softmax. Each workgroup processes one "row" along the reduction
+ * dimension. Within a workgroup, NWORKERS threads cooperate to compute the max
+ * and sum reductions, then each thread writes its portion of the final outputs.
+ *
+ * Thread mapping: the global WG size has 1 along reduce_dim, and all other
+ * dimensions correspond to output tensor sizes (WHCN order, with z encoding
+ * C*N). The local WG size has NWORKERS along reduce_dim. Each workgroup
+ * identifies a unique reduction "row" via the non-reduce dimensions of
+ * gl_GlobalInvocationID, and the NWORKERS threads within that workgroup
+ * cooperate on the reduction.
+ */
+void main() {
+  // Build the base 4D index for this workgroup's reduction row.
+  // gl_GlobalInvocationID has 0..NWORKERS-1 along reduce_dim; zero it out
+  // since the tid will iterate over the reduce_dim explicitly.
+  ivec3 gid = ivec3(gl_GlobalInvocationID);
+  gid[reduce_dim] = 0;
+
+  const int c_size = int(size_at(in_meta, 2));
+  TensorIndex4D base_idx;
+  base_idx.data = ivec4(gid.x, gid.y, gid.z % c_size, gid.z / c_size);
+
+  if (out_of_bounds(base_idx, in_meta)) {
+    return;
+  }
+
+  const uint tid = gl_LocalInvocationID[reduce_dim];
+  const int R = int(size_at(in_meta, reduce_dim));
+
+  // Phase 1: Find maximum along reduce_dim
+  TensorIndex4D in_idx = base_idx;
+
+  T local_max = T(-3.402823e+38);
+  for (int i = int(tid); i < R; i += NWORKERS) {
+    in_idx.data[reduce_dim] = i;
+    T v = in_buf[tensor4d_idx_to_linear_idx(in_meta, in_idx)];
+    local_max = max(local_max, v);
+  }
+  shared_max[tid] = local_max;
+  barrier();
+
+  // Reduce partial maximums across workers
+  T max_val = shared_max[0];
+  for (int i = 1; i < NWORKERS; ++i) {
+    max_val = max(max_val, shared_max[i]);
+  }
+
+  // Phase 2: Compute sum of exp(x - max_val)
+  T local_sum = T(0);
+  for (int i = int(tid); i < R; i += NWORKERS) {
+    in_idx.data[reduce_dim] = i;
+    T v = in_buf[tensor4d_idx_to_linear_idx(in_meta, in_idx)];
+    local_sum += exp(v - max_val);
+  }
+  shared_sum[tid] = local_sum;
+  barrier();
+
+  // Reduce partial sums across workers
+  T sum_val = shared_sum[0];
+  for (int i = 1; i < NWORKERS; ++i) {
+    sum_val += shared_sum[i];
+  }
+  // Clamp denominator to avoid 0/0 = NaN when all exp values underflow.
+  sum_val = max(sum_val, T(1e-37));
+
+  // Phase 3: Write outputs
+  for (int i = int(tid); i < R; i += NWORKERS) {
+    in_idx.data[reduce_dim] = i;
+    uint in_buf_idx = tensor4d_idx_to_linear_idx(in_meta, in_idx);
+    T v = in_buf[in_buf_idx];
+    T numerator = op1(v - max_val);
+    T result = op2(numerator, sum_val);
+
+    // Replace NaN/Inf with 0 using IEEE 754 bit-level manipulation
+    uint bits = floatBitsToUint(result);
+    if ((bits & 0x7F800000u) == 0x7F800000u) {
+      result = T(0);
+    }
+
+    out_buf[tensor4d_idx_to_linear_idx(out_meta, in_idx)] = result;
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.yaml
new file mode 100644
index 00000000000..419e1a01ea7
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax_buffer.yaml
@@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+softmax_buffer:
+  parameter_names_with_default_values:
+    OPERATOR1: exp(X)
+    OPERATOR2: X / Y
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: softmax_buffer
+    - NAME: log_softmax_buffer
+      OPERATOR1: X
+      OPERATOR2: X - log(Y)
diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
index 2d683719ba2..55ecf466ab6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
@@ -24,14 +24,26 @@ utils::uvec3 pick_softmax_global_wg_size(
     const std::vector<ArgGroup>& args,
     const std::vector<ValueRef>& resize_args) {
   (void)shader;
-  (void)resize_args;
 
   const ValueRef out = args.at(0).refs.at(0);
-  const int32_t reduce_dim_xyz =
-      graph->extract_scalar<int32_t>(resize_args.at(1));
+  const ValueRef in = args.at(1).refs.at(0);
+  const int dim = resize_args.at(0);
+
+  const int64_t ndim = graph->dim_of(in);
+  int32_t reduce_dim = normalize(dim, ndim);
+  reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim);
+
+  if (graph->is_buffer_storage(out)) {
+    utils::uvec3 global_size = {
+        graph->size_at<uint32_t>(-1, out),
+        graph->size_at<uint32_t>(-2, out),
+        graph->size_at<uint32_t>(-3, out) * graph->size_at<uint32_t>(-4, out)};
+    global_size[reduce_dim] = 1;
+    return global_size;
+  }
 
   utils::uvec3 global_size = graph->logical_limits_of(out);
-  global_size[reduce_dim_xyz] = 1;
+  global_size[reduce_dim] = 1;
   return global_size;
 }
 
@@ -43,22 +55,30 @@ utils::uvec3 pick_softmax_local_wg_size(
     const std::vector<ValueRef>& resize_args) {
   (void)shader;
   (void)global_workgroup_size;
-  (void)args;
 
-  const int64_t group_dim_xyz =
-      graph->extract_scalar<int64_t>(resize_args.at(2));
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in = args.at(1).refs.at(0);
+  const int dim = resize_args.at(0);
 
-  const int32_t reduce_dim_xyz =
-      graph->extract_scalar<int32_t>(resize_args.at(1));
+  const int64_t ndim = graph->dim_of(in);
+  int32_t reduce_dim = normalize(dim, ndim);
+  reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim);
 
-  // These values are hardcoded in add_softmax_node
   const uint32_t nworkers_per_group = 4;
+
+  if (graph->is_buffer_storage(out)) {
+    utils::uvec3 local_wg_size{1, 1, 1};
+    local_wg_size[reduce_dim] = nworkers_per_group;
+    return local_wg_size;
+  }
+
+  const int64_t group_dim_xyz =
+      graph->extract_scalar<int64_t>(resize_args.at(1));
   const uint32_t ngroups = 4;
 
   utils::uvec3 local_wg_size{1, 1, 1};
-  local_wg_size[reduce_dim_xyz] = nworkers_per_group;
+  local_wg_size[reduce_dim] = nworkers_per_group;
   local_wg_size[group_dim_xyz] = ngroups;
-
   return local_wg_size;
 }
 
@@ -80,10 +100,6 @@ void add_softmax_node(
     const ValueRef dim_ref,
     const ValueRef out,
     bool log_softmax) {
-  VK_CHECK_COND(
-      !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out),
-      "Vulkan softmax only supports texture storage");
-
   const int64_t ndim = graph.dim_of(in);
 
   int32_t reduce_dim_nchw = graph.extract_scalar<int32_t>(dim_ref);
@@ -101,9 +117,9 @@ void add_softmax_node(
         "Softmax shader currently does not support concat dim == reduce dim");
   }
 
-  vkapi::ShaderInfo shader_descriptor;
   std::string kernel_name = "softmax";
   kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
   if (log_softmax) {
     kernel_name = "log_" + kernel_name;
@@ -111,25 +127,32 @@ void add_softmax_node(
 
   // This should match the value of MAX_NTHREADS in the softmax shader.
   constexpr uint32_t max_nthreads = 16;
-
   const uint32_t nworkers_per_group = 4;
   const uint32_t ngroups = 4;
   VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads);
 
-  // Determine the group dimension
-  const int other_dim_1 = (reduce_dim_xyz + 1) % 3;
-  const int other_dim_2 = (reduce_dim_xyz + 2) % 3;
-  int32_t group_dim;
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) {
-    group_dim = other_dim_1;
-  } else {
-    group_dim = other_dim_2;
-  }
+  const int dim_val = graph.extract_scalar<int>(dim_ref);
+
+  vkapi::SpecVarList spec_constants = {reduce_dim_xyz};
+  std::vector<ValueRef> resize_args = {dim_val};
 
-  const ValueRef reduce_dim_xyz_ref =
-      graph.get_or_add_value_for_int(reduce_dim_xyz);
-  const ValueRef group_dim_xyz_ref = graph.get_or_add_value_for_int(group_dim);
+  if (!graph.is_buffer_storage(out)) {
+    const int other_dim_1 = (reduce_dim_xyz + 1) % 3;
+    const int other_dim_2 = (reduce_dim_xyz + 2) % 3;
+    int32_t group_dim;
+    utils::uvec3 global_wg_size = graph.logical_limits_of(out);
+    if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) {
+      group_dim = other_dim_1;
+    } else {
+      group_dim = other_dim_2;
+    }
+
+    spec_constants = {graph.hashed_layout_of(out), reduce_dim_xyz, group_dim};
+
+    const ValueRef group_dim_xyz_ref =
+        graph.get_or_add_value_for_int(group_dim);
+    resize_args = {dim_val, group_dim_xyz_ref};
+  }
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
@@ -139,13 +162,13 @@ void add_softmax_node(
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
       // Shader params buffers
-      {},
+      {graph.meta_ubo(in), graph.meta_ubo(out)},
       // Push Constants
-      {graph.sizes_pc_of(in), graph.logical_limits_pc_of(out)},
+      {},
       // Specialization Constants
-      {graph.packed_dim_of(out), reduce_dim_xyz, group_dim},
+      spec_constants,
       // Resize Args
-      {dim_ref, reduce_dim_xyz_ref, group_dim_xyz_ref},
+      resize_args,
       // Resizing Logic
       resize_softmax_node));
 }
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 2ec0d721f4c..c19ce168ad1 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -1627,6 +1627,7 @@ def get_softmax_inputs():
         "utils::kWidthPacked",
         "utils::kChannelsPacked",
     ]
+    test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
 
     # Large negative values regression test (edgeTAM attention scores that
     # produced NaN due to missing max-shift in softmax numerics)
@@ -1639,6 +1640,7 @@ def get_softmax_inputs():
         "utils::kWidthPacked",
         "utils::kChannelsPacked",
     ]
+    large_neg_test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     large_neg_test_suite.data_range = (-1.8e10, -6.5e9)
     large_neg_test_suite.test_name_suffix = "large_negative"
     large_neg_test_suite.dtypes = ["at::kFloat"]

From 20c70d5cb9846591ace63750530f669b88228999 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:21 -0700
Subject: [PATCH 08/13] [ET-VK] Modernize native_layer_norm

Modernize native_layer_norm to support ANY_STORAGE. Migrate texture
shader from indexing_utils.h to indexing.glslh with TextureMetadata
UBOs. Merge separate texture and buffer dispatch functions into a
unified add_native_layer_norm_node using graph.meta_ubo(). Buffer
path retains custom workgroup sizing for cooperative shared-memory
reduction.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18055
ghstack-source-id: 353546686
@exported-using-ghexport

Differential Revision: [D95970158](https://our.internmc.facebook.com/intern/diff/D95970158/)
---
 backends/vulkan/op_registry.py                |   2 +-
 .../graph/ops/glsl/native_layer_norm.glsl     | 314 ------------------
 .../ops/glsl/native_layer_norm_buffer.glsl    | 136 ++++++++
 .../ops/glsl/native_layer_norm_buffer.yaml    |  16 +
 .../ops/glsl/native_layer_norm_texture.glsl   | 266 +++++++++++++++
 ...rm.yaml => native_layer_norm_texture.yaml} |   5 +-
 .../graph/ops/impl/NativeLayerNorm.cpp        |  75 +++--
 backends/vulkan/test/op_tests/cases.py        |   4 +
 8 files changed, 471 insertions(+), 347 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{native_layer_norm.yaml => native_layer_norm_texture.yaml} (83%)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 45ce1359649..f72e3dd5f0e 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1440,7 +1440,7 @@ def register_native_group_norm():
 @update_features(exir_ops.edge.aten.native_layer_norm.default)
 def register_native_layer_norm():
     return OpFeatures(
-        inputs_storage=utils.ANY_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_T,
         supports_prepacking=True,
         supports_resize=True,
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
deleted file mode 100644
index 7897f0e8133..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#include "broadcasting_utils.h"
-#include "indexing_utils.h"
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-#define T ${texel_component_type(DTYPE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "w", "t_mean", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "w", "t_rstd", DTYPE, STORAGE)}
-
-${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE)}
-
-layout(push_constant) uniform PRECISION restrict Block {
-  ivec3 out_limits;
-  ivec4 sizes;
-  float epsilon;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
-const lowp int in_packed_dim = unhash_packed_dim(in_layout);
-
-${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
-const lowp int out_packed_dim = unhash_packed_dim(out_layout);
-
-#define MAX_WORKGROUP_SIZE 64
-
-// Shared memory factor increases shared memory allocation by a scale that should either be 1 or a power of 2.
-//
-// Increasing factor allows more data to be stored in shared memory and increase thread utilization during reduction.
-// Why? Because when performing reduction, the number of active threads becomes half in each iteration.
-// Increasing scaling factor increases the thread occupancy and hence utilize the GPU better.
-// eg.
-// If local thread size in x dimension is 32, and SHARED_MEMORY_FACTOR is 1, 32 elements will be loaded into shared memory.
-// First iteration of reduce will have 16 threads sum up 32 elements.
-// Second iteration will have 8 threads sum up 16 elements from previous iteration and so on.
-// So thread utilization starts at 50%.
-//
-// By contrast if local thread size in x dimension is 32, and SHARED_MEMORY_FACTOR is 2, 64 elements will be loaded into shared memory.
-// First iteration of reduce will have 32 threads sum up 64 elements.
-// Second iteration will have 32 threads sum up 16 elements from previous iteration and so on.
-// Thus thread utilization starts at 100%.
-#define SHARED_MEMORY_FACTOR 1
-
-#define offset_pos_index(index) ((index) + ((index) >> 3))
-
-shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)];
-
-// Function to reduce input data in workgroup's x dimension
-//
-// The implementation resembles reduction as depicted below
-// | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | 2 | 3 | 2 | 7 | 0 | 11 | 0 | 2 | current_stride -> 1
-//   |   /    |   /   |   /   |   /   |   /   |   /   |   /    |   /
-//   |  /     |  /    |  /    |  /    |  /    |  /    |  /     |  /
-//   | /      | /     | /     | /     | /     | /     | /      | /
-// | 11 | 1 | 9 | 1 | 2 | 2 | 8 | 5 | 5 | 3 | 9 | 7 | 11 | 11 | 2 | 2 | current_stride -> 2
-//   |       /        |       /       |       /       |       /
-//   |    /           |    /          |    /          |    /
-//   | /              | /             | /             | /
-// | 20 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |14 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 4
-//   |                /               |               /
-//   |            /                   |            /
-//   |        /                       |         /
-//   |    /                           |     /
-//   | /                              | /
-// | 30 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 8
-//   |                                /
-//   |                             /
-//   |                         /
-//   |                     /
-//   |                 /
-//   |             /
-//   |         /
-//   |     /
-//   | /
-// | 57 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride = -> 16
-//
-// Threads access shared index in following pattern
-// Thread       | 0 | 1 | 2 | 3 | 4 | 5  | 6  | 7  | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 1
-// Shared Index | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 | X | X | X  | X  | X  | X  | X  | X  | index *= 1
-//
-// Thread       | 0 | 1 | 2 | 3  | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 2
-// Shared Index | 0 | 4 | 8 | 12 | X | X | X | X | X | X | X  | X  | X  | X  | X  | X  | index *= 2
-//
-// Thread       | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 4
-// Shared Index | 0 | 8 | X | X | X | X | X | X | X | X | X  | X  | X  | X  | X  | X  | index *= 4
-//
-// Thread       | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 8
-// Shared Index | 0 | X | X | X | X | X | X | X | X | X | X  | X  | X  | X  | X  | X  | index *= 8
-
-void reduce_input(const int width_stride, const int shared_idx_offset) {
-  // wait for all shared memory writes to finish
-  memoryBarrierShared();
-  barrier();
-
-  // loop log(width_stride) times
-  for (int current_stride = 1, index = int(gl_LocalInvocationID.x << 1); current_stride < width_stride; current_stride *= 2, index <<= 1) {
-    // if the index at this thread is within the width stride
-    if (index < width_stride) {
-      const int local_shared_idx = shared_idx_offset + index;
-      // add the value at current stride to this thread's value
-      shared_input[offset_pos_index(local_shared_idx)] += shared_input[offset_pos_index(local_shared_idx + current_stride)];
-    }
-
-    memoryBarrierShared();
-    barrier();
-  }
-}
-
-void reduce_non_packed_dim() {
-  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
-  const int width = int(sizes.x);
-  ivec3 in_pos = lpos_to_pos(lpos, in_axis_map);
-
-  // width batch read stride
-  const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR;
-
-  // local memory starting offset for this thread
-  const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y);
-
-  // local memory index for this thread
-  const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x);
-
-  VEC4_T mean = VEC4_T(0);
-  VEC4_T var = VEC4_T(0);
-
-  // Loop over the width in stride increments
-  for (int width_offset = 0; width_offset < width; width_offset += width_stride) {
-    // Read input in shared memory
-    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
-      in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
-
-      VEC4_T in_val = VEC4_T(0);
-      if (all(lessThan(in_pos, out_limits))) {
-        in_val = load_texel(t_in, in_pos);
-      }
-      mean += in_val;
-    }
-  }
-
-  shared_input[offset_pos_index(shared_idx)] = mean;
-  reduce_input(width_stride, shared_idx_offset);
-  mean = shared_input[offset_pos_index(shared_idx_offset)] / width;
-
-  memoryBarrierShared();
-  barrier();
-
-  // Loop over the width in stride increments
-  for (int width_offset = 0; width_offset < width; width_offset += width_stride) {
-    // Read input in shared memory
-    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
-      in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
-
-      VEC4_T in_val = mean;
-      if (all(lessThan(in_pos, out_limits))) {
-        in_val = load_texel(t_in, in_pos);
-      }
-
-      const VEC4_T delta = in_val - mean;
-      var += delta * delta;
-    }
-  }
-
-  shared_input[offset_pos_index(shared_idx)] = var;
-  reduce_input(width_stride, shared_idx_offset);
-  var = shared_input[offset_pos_index(shared_idx_offset)] / width;
-
-  VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5));
-  VEC4_T offset = -rstd * mean;
-
-  VEC4_T v = load_texel(t_in, lpos);
-  VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)).xxxx;
-  VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)).xxxx;
-  VEC4_T outtex = (v * rstd + offset) * weight + bias;
-
-  if (all(lessThan(lpos, out_limits))) {
-    write_texel_lpos(t_out, lpos, outtex, out_axis_map);
-  }
-
-  if (gl_GlobalInvocationID.x == 0) {
-    write_texel(t_mean, lpos, mean);
-    write_texel(t_rstd, lpos, rstd);
-  }
-}
-
-void reduce_packed_dim() {
-  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
-  const int width = int(sizes.x);
-  ivec3 in_pos = lpos_to_pos(lpos, in_axis_map);
-
-  // width batch read stride
-  const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR;
-
-  // local memory starting offset for this thread
-  const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y);
-
-  // local memory index for this thread
-  const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x);
-
-  const int last_packed_width_index = divup4(width) - 1;
-  T mean = T(0);
-  T var = T(0);
-  const int remain = width & 3;
-
-  const int in_pos_x_limit = out_limits[in_axis_map.x];
-
-  VEC4_T accum = VEC4_T(0);
-  // Loop over the width in stride increments
-  for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) {
-    // Read input in shared memory
-    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
-      const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
-      in_pos[in_axis_map.x] = in_pos_x;
-
-      VEC4_T in_val = VEC4_T(0);
-      if (in_pos_x < in_pos_x_limit) {
-        in_val = load_texel(t_in, in_pos);
-      }
-
-      if (in_pos_x == last_packed_width_index && remain != 0) {
-        const int remain_inv = 4 - remain;
-        in_val.y = mix(in_val.y, T(0), remain_inv > 2);
-        in_val.z = mix(in_val.z, T(0), remain_inv > 1);
-        in_val.w = mix(in_val.w, T(0), remain_inv > 0);
-      }
-      accum += in_val;
-    }
-  }
-
-  shared_input[offset_pos_index(shared_idx)] = accum;
-  reduce_input(width_stride, shared_idx_offset);
-  VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
-  mean = (val.x + val.y + val.z + val.w) / width;
-
-  memoryBarrierShared();
-  barrier();
-
-  VEC4_T delta2 = VEC4_T(0);
-
-  // Loop over the width in stride increments
-  for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) {
-    // Read input in shared memory
-    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
-      const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
-      in_pos[in_axis_map.x] = in_pos_x;
-
-      VEC4_T in_val = VEC4_T(mean);
-      if (in_pos_x < in_pos_x_limit) {
-        in_val = load_texel(t_in, in_pos);
-      }
-
-      if (in_pos_x == last_packed_width_index && remain != 0) {
-        const int remain_inv = 4 - remain;
-        in_val.y = mix(in_val.y, mean.x, remain_inv > 2);
-        in_val.z = mix(in_val.z, mean.x, remain_inv > 1);
-        in_val.w = mix(in_val.w, mean.x, remain_inv > 0);
-      }
-
-      const VEC4_T delta = in_val - mean;
-      delta2 += delta * delta;
-    }
-  }
-
-  shared_input[offset_pos_index(shared_idx)] = delta2;
-  reduce_input(width_stride, shared_idx_offset);
-  val = shared_input[offset_pos_index(shared_idx_offset)];
-  var = (val.x + val.y + val.z + val.w) / width;
-
-  T rstd = pow(var + epsilon, T(-0.5));
-  T offset = -rstd * mean;
-
-  VEC4_T v = load_texel(t_in, lpos);
-  VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0));
-  VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0));
-  VEC4_T outtex = (v * rstd + offset) * weight + bias;
-
-  if (all(lessThan(lpos, out_limits))) {
-    write_texel_lpos(t_out, lpos, outtex, out_axis_map);
-  }
-
-  if (gl_GlobalInvocationID.x == 0) {
-    write_texel(t_mean, lpos, VEC4_T(mean));
-    write_texel(t_rstd, lpos, VEC4_T(rstd));
-  }
-}
-
-void main() {
-  // if packed dimension width
-  if (in_packed_dim != W_DIM) {
-    reduce_non_packed_dim();
-  } else {
-    reduce_packed_dim();
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.glsl
new file mode 100644
index 00000000000..22d32d09b89
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.glsl
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "w", "t_mean", DTYPE, "buffer")}
+${layout_declare_tensor(B, "w", "t_rstd", DTYPE, "buffer")}
+
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+${layout_declare_ubo(B, "BufferMetadata", "mean_meta")}
+
+layout(push_constant) uniform PRECISION restrict Block {
+  float epsilon;
+};
+
+#define NUM_WORKERS 64
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
+
+shared T shared_sum[NUM_WORKERS];
+
+void reduce_shared(const uint worker_id) {
+  memoryBarrierShared();
+  barrier();
+
+  [[unroll]] for (int stride = NUM_WORKERS / 2; stride > 0; stride >>= 1) {
+    if (worker_id < stride) {
+      shared_sum[worker_id] += shared_sum[worker_id + stride];
+    }
+    memoryBarrierShared();
+    barrier();
+  }
+}
+
+void main() {
+  // Each workgroup handles one output row (one mean/rstd element).
+  // gl_GlobalInvocationID.y = row index
+  // gl_LocalInvocationID.x = worker_id within the row
+  const uint row_idx = gl_GlobalInvocationID.y;
+  const uint worker_id = gl_LocalInvocationID.x;
+
+  const uint row_width = width(inp);
+
+  if (row_idx >= numel(mean_meta)) {
+    return;
+  }
+
+  // Convert row_idx to a tensor index using the mean/rstd metadata.
+  // The mean/rstd tensor has shape [..., 1] (width dimension is 1).
+  // This gives us the outer dimension indices for this row.
+  TensorIndex row_tidx = linear_idx_to_tensor_idx(mean_meta, row_idx, in_layout);
+
+  // The width stride in the input buffer tells us how to step through width
+  // elements. For contiguous layout, stride_at(inp, 0) == 1; for other
+  // layouts it may differ.
+  const uint width_stride = stride_at(inp, 0);
+
+  // Compute the base buffer index for this row in the input tensor.
+  // Set width component to 0 and compute the buffer offset.
+  row_tidx.data[0][0] = 0;
+  const uint base_bufi = tensor_idx_to_linear_idx(inp, row_tidx);
+
+  // Phase 1: Compute mean via cooperative reduction
+  T local_sum = T(0);
+  for (uint x = worker_id; x < row_width; x += NUM_WORKERS) {
+    const uint in_bufi = base_bufi + x * width_stride;
+    local_sum += t_in[in_bufi];
+  }
+
+  shared_sum[worker_id] = local_sum;
+  reduce_shared(worker_id);
+
+  const T mean_val = shared_sum[0] / T(row_width);
+
+  memoryBarrierShared();
+  barrier();
+
+  // Phase 2: Compute variance via cooperative reduction
+  T local_var = T(0);
+  for (uint x = worker_id; x < row_width; x += NUM_WORKERS) {
+    const uint in_bufi = base_bufi + x * width_stride;
+    const T delta = t_in[in_bufi] - mean_val;
+    local_var += delta * delta;
+  }
+
+  shared_sum[worker_id] = local_var;
+  reduce_shared(worker_id);
+
+  const T var_val = shared_sum[0] / T(row_width);
+  const T rstd_val = pow(var_val + T(epsilon), T(-0.5));
+
+  // Phase 3: Normalize and write output
+  // Weight and bias are 1D tensors of size [width], indexed directly by x.
+  for (uint x = worker_id; x < row_width; x += NUM_WORKERS) {
+    const uint in_bufi = base_bufi + x * width_stride;
+    const T in_val = t_in[in_bufi];
+    const T normalized = (in_val - mean_val) * rstd_val;
+    const T w = t_weight[x];
+    const T b = t_bias[x];
+    t_out[in_bufi] = normalized * w + b;
+  }
+
+  // Write mean and rstd (only one thread per row)
+  if (worker_id == 0) {
+    const uint mean_bufi = tensor_idx_to_linear_idx(mean_meta, row_tidx);
+    t_mean[mean_bufi] = mean_val;
+    t_rstd[mean_bufi] = rstd_val;
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.yaml
new file mode 100644
index 00000000000..1978f237ea5
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_buffer.yaml
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+native_layer_norm_buffer:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: native_layer_norm_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.glsl
new file mode 100644
index 00000000000..1edf69c9a42
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.glsl
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+#define T ${texel_load_component_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "common.glslh"
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "w", "t_mean", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "w", "t_rstd", DTYPE, "texture3d")}
+
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_weight", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "out_meta")}
+${layout_declare_ubo(B, "TextureMetadata", "in_meta")}
+
+layout(push_constant) uniform PRECISION restrict Block {
+  float epsilon;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
+const lowp int packed_dim = get_packed_dim(in_layout);
+
+#define MAX_WORKGROUP_SIZE 64
+
+// Shared memory factor increases shared memory allocation by a scale that
+// should either be 1 or a power of 2.
+//
+// Increasing factor allows more data to be stored in shared memory and increase
+// thread utilization during reduction. Why? Because when performing reduction,
+// the number of active threads becomes half in each iteration. Increasing
+// scaling factor increases the thread occupancy and hence utilize the GPU
+// better.
+#define SHARED_MEMORY_FACTOR 1
+
+#define offset_pos_index(index) ((index) + ((index) >> 3))
+
+shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)];
+
+
+// Reduction of shared memory along the workgroup's x dimension.
+void reduce_input(const int width_stride, const int shared_idx_offset) {
+  memoryBarrierShared();
+  barrier();
+
+  for (int current_stride = 1, index = int(gl_LocalInvocationID.x << 1);
+       current_stride < width_stride;
+       current_stride *= 2, index <<= 1) {
+    if (index < width_stride) {
+      const int local_shared_idx = shared_idx_offset + index;
+      shared_input[offset_pos_index(local_shared_idx)] +=
+          shared_input[offset_pos_index(local_shared_idx + current_stride)];
+    }
+
+    memoryBarrierShared();
+    barrier();
+  }
+}
+
+void reduce_non_packed_dim() {
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
+  const int width = in_meta.sizes.x;
+  ivec3 in_pos = lpos;
+
+  const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR;
+
+  const int shared_idx_offset =
+      width_stride *
+      int(gl_WorkGroupSize.y * gl_LocalInvocationID.z +
+          gl_LocalInvocationID.y);
+
+  const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x);
+
+  VEC4_T mean = VEC4_T(0);
+  VEC4_T var = VEC4_T(0);
+
+  for (int width_offset = 0; width_offset < width;
+       width_offset += width_stride) {
+    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
+      in_pos[0] =
+          width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
+
+      VEC4_T in_val = VEC4_T(0);
+      if (all(lessThan(in_pos, out_meta.limits))) {
+        in_val = texelFetch(t_in, in_pos, 0);
+      }
+      mean += in_val;
+    }
+  }
+
+  shared_input[offset_pos_index(shared_idx)] = mean;
+  reduce_input(width_stride, shared_idx_offset);
+  mean = shared_input[offset_pos_index(shared_idx_offset)] / width;
+
+  memoryBarrierShared();
+  barrier();
+
+  for (int width_offset = 0; width_offset < width;
+       width_offset += width_stride) {
+    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
+      in_pos[0] =
+          width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
+
+      VEC4_T in_val = mean;
+      if (all(lessThan(in_pos, out_meta.limits))) {
+        in_val = texelFetch(t_in, in_pos, 0);
+      }
+
+      const VEC4_T delta = in_val - mean;
+      var += delta * delta;
+    }
+  }
+
+  shared_input[offset_pos_index(shared_idx)] = var;
+  reduce_input(width_stride, shared_idx_offset);
+  var = shared_input[offset_pos_index(shared_idx_offset)] / width;
+
+  VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5));
+  VEC4_T offset = -rstd * mean;
+
+  VEC4_T v = texelFetch(t_in, lpos, 0);
+  VEC4_T weight = texelFetch(t_weight, ivec3(lpos.x, 0, 0), 0).xxxx;
+  VEC4_T bias = texelFetch(t_bias, ivec3(lpos.x, 0, 0), 0).xxxx;
+  VEC4_T outtex = (v * rstd + offset) * weight + bias;
+
+  if (all(lessThan(lpos, out_meta.limits))) {
+    imageStore(t_out, lpos, outtex);
+  }
+
+  if (gl_GlobalInvocationID.x == 0) {
+    imageStore(t_mean, lpos, mean);
+    imageStore(t_rstd, lpos, rstd);
+  }
+}
+
+void reduce_packed_dim() {
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
+  const int width = in_meta.sizes.x;
+  ivec3 in_pos = lpos;
+
+  const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR;
+
+  const int shared_idx_offset =
+      width_stride *
+      int(gl_WorkGroupSize.y * gl_LocalInvocationID.z +
+          gl_LocalInvocationID.y);
+
+  const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x);
+
+  const int last_packed_width_index = div_up_4(width) - 1;
+  T mean = T(0);
+  T var = T(0);
+  const int remain = width & 3;
+
+  const int in_pos_x_limit = out_meta.limits[0];
+
+  VEC4_T accum = VEC4_T(0);
+  for (int width_offset = 0; width_offset <= last_packed_width_index;
+       width_offset += width_stride) {
+    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
+      const int in_pos_x =
+          width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
+      in_pos[0] = in_pos_x;
+
+      VEC4_T in_val = VEC4_T(0);
+      if (in_pos_x < in_pos_x_limit) {
+        in_val = texelFetch(t_in, in_pos, 0);
+      }
+
+      if (in_pos_x == last_packed_width_index && remain != 0) {
+        const int remain_inv = 4 - remain;
+        in_val.y = mix(in_val.y, T(0), remain_inv > 2);
+        in_val.z = mix(in_val.z, T(0), remain_inv > 1);
+        in_val.w = mix(in_val.w, T(0), remain_inv > 0);
+      }
+      accum += in_val;
+    }
+  }
+
+  shared_input[offset_pos_index(shared_idx)] = accum;
+  reduce_input(width_stride, shared_idx_offset);
+  VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)];
+  mean = (val.x + val.y + val.z + val.w) / width;
+
+  memoryBarrierShared();
+  barrier();
+
+  VEC4_T delta2 = VEC4_T(0);
+
+  for (int width_offset = 0; width_offset <= last_packed_width_index;
+       width_offset += width_stride) {
+    for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) {
+      const int in_pos_x =
+          width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x);
+      in_pos[0] = in_pos_x;
+
+      VEC4_T in_val = VEC4_T(mean);
+      if (in_pos_x < in_pos_x_limit) {
+        in_val = texelFetch(t_in, in_pos, 0);
+      }
+
+      if (in_pos_x == last_packed_width_index && remain != 0) {
+        const int remain_inv = 4 - remain;
+        in_val.y = mix(in_val.y, mean.x, remain_inv > 2);
+        in_val.z = mix(in_val.z, mean.x, remain_inv > 1);
+        in_val.w = mix(in_val.w, mean.x, remain_inv > 0);
+      }
+
+      const VEC4_T delta = in_val - mean;
+      delta2 += delta * delta;
+    }
+  }
+
+  shared_input[offset_pos_index(shared_idx)] = delta2;
+  reduce_input(width_stride, shared_idx_offset);
+  val = shared_input[offset_pos_index(shared_idx_offset)];
+  var = (val.x + val.y + val.z + val.w) / width;
+
+  T rstd = pow(var + T(epsilon), T(-0.5));
+  T offset = -rstd * mean;
+
+  VEC4_T v = texelFetch(t_in, lpos, 0);
+  VEC4_T weight = texelFetch(t_weight, ivec3(lpos.x, 0, 0), 0);
+  VEC4_T bias = texelFetch(t_bias, ivec3(lpos.x, 0, 0), 0);
+  VEC4_T outtex = (v * rstd + offset) * weight + bias;
+
+  if (all(lessThan(lpos, out_meta.limits))) {
+    imageStore(t_out, lpos, outtex);
+  }
+
+  if (gl_GlobalInvocationID.x == 0) {
+    imageStore(t_mean, lpos, VEC4_T(mean));
+    imageStore(t_rstd, lpos, VEC4_T(rstd));
+  }
+}
+
+void main() {
+  if (packed_dim != 0) {
+    reduce_non_packed_dim();
+  } else {
+    reduce_packed_dim();
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.yaml
similarity index 83%
rename from backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.yaml
index ac478599f8a..185d5eff2f4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm_texture.yaml
@@ -4,13 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-native_layer_norm:
+native_layer_norm_texture:
   parameter_names_with_default_values:
     DTYPE: float
-    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: native_layer_norm
+    - NAME: native_layer_norm_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
index 8e15b56b208..73d1ea908e9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
@@ -50,6 +50,33 @@ void resize_native_layer_norm_node(
   graph->virtual_resize(rstd, mean_size);
 }
 
+utils::uvec3 layer_norm_buffer_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef mean_tensor = args.at(0).refs.at(1);
+  const uint32_t num_rows =
+      utils::safe_downcast<uint32_t>(graph->numel_of(mean_tensor));
+  return {1u, num_rows, 1u};
+}
+
+utils::uvec3 layer_norm_buffer_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64u, 1u, 1u};
+}
+
 void add_native_layer_norm_node(
     ComputeGraph& graph,
     const ValueRef in,
@@ -82,50 +109,40 @@ void add_native_layer_norm_node(
 
   float epsilon = graph.extract_scalar<float>(eps);
 
-  VK_CHECK_COND(check_same_packed_dim(graph, in, out_tensor));
-
-  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  std::string kernel_name("native_layer_norm");
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out_tensor));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out_tensor));
 
-  utils::uvec3 global_size = graph.logical_limits_of(out_tensor);
-  utils::uvec3 local_size;
+  const bool is_buffer = graph.is_buffer_storage(in);
 
-  // Since the shader sets shared memory scale factor > 1, if dispatch is
-  // greater than maximum WG size. Setting WG size in X axis to max WG size,
-  // would allow best thread utilization.
-  if (global_size[0] > 64) {
-    local_size = {64, 1, 1};
-  } else {
-    // If thread size in X axis is smaller or equal to maximum WG size, we can
-    // let the function decide the best WG size.
-    local_size = graph.create_local_wg_size(global_size);
+  if (!is_buffer) {
+    VK_CHECK_COND(check_same_packed_dim(graph, in, out_tensor));
   }
 
-  std::string kernel_name("native_layer_norm");
-  kernel_name.reserve(kShaderNameReserve);
+  vkapi::ParamsBindList param_ubos = {
+      graph.meta_ubo(out_tensor), graph.meta_ubo(in)};
+  vkapi::SpecVarList spec_constants = {graph.hashed_layout_of(in)};
 
-  add_dtype_suffix(kernel_name, graph.dtype_of(out_tensor));
+  if (is_buffer) {
+    param_ubos.append(graph.meta_ubo(mean_tensor));
+  }
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      default_pick_global_wg_size,
-      default_pick_local_wg_size,
+      is_buffer ? layer_norm_buffer_global_wg_size
+                : default_pick_global_wg_size,
+      is_buffer ? layer_norm_buffer_local_wg_size : default_pick_local_wg_size,
       // Inputs and Outputs
       {{{out_tensor, mean_tensor, rstd_tensor}, vkapi::kWrite},
        {{in, arg_weight, arg_bias}, vkapi::kRead}},
       // Shader params buffers
-      {},
+      param_ubos,
       // Push Constants
-      {
-          graph.logical_limits_pc_of(out_tensor),
-          graph.sizes_pc_of(out_tensor),
-          PushConstantDataInfo(&epsilon, sizeof(epsilon)),
-      },
+      {PushConstantDataInfo(&epsilon, sizeof(epsilon))},
       // Specialization Constants
-      {
-          graph.hashed_layout_of(in),
-          graph.hashed_layout_of(out_tensor),
-      },
+      spec_constants,
       // Resize Args
       {normalized_shape},
       // Resizing Logic
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index c19ce168ad1..d5f2305ce51 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -758,6 +758,10 @@ def get_native_layer_norm_inputs():
         "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
+    test_suite.storage_types = [
+        "utils::kTexture3D",
+        "utils::kBuffer",
+    ]
     return test_suite
 
 

From 93c4582778960867eb06332dc872852e98ae63ec Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:23 -0700
Subject: [PATCH 09/13] [ET-VK] Modernize repeat

Modernize repeat to support ANY_STORAGE. Rewrite texture shader to use
TextureMetadata with indexing.glslh helpers for coordinate conversion.
Add buffer shader variant using BufferMetadata. Unify dispatch to use
graph.meta_ubo() for both paths. Add symint support for dynamic repeat
counts.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18056
ghstack-source-id: 353546685
@exported-using-ghexport

Differential Revision: [D95970170](https://our.internmc.facebook.com/intern/diff/D95970170/)
---
 backends/vulkan/op_registry.py                |   2 +-
 .../vulkan/runtime/graph/ops/glsl/repeat.glsl | 129 ------------------
 .../runtime/graph/ops/glsl/repeat_buffer.glsl |  51 +++++++
 .../glsl/{repeat.yaml => repeat_buffer.yaml}  |   7 +-
 .../graph/ops/glsl/repeat_texture.glsl        |  68 +++++++++
 .../graph/ops/glsl/repeat_texture.yaml        |  12 ++
 .../vulkan/runtime/graph/ops/impl/Repeat.cpp  | 116 ++++------------
 backends/vulkan/test/op_tests/cases.py        |   4 +-
 8 files changed, 167 insertions(+), 222 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/repeat.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{repeat.yaml => repeat_buffer.yaml} (77%)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/repeat_texture.yaml

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index f72e3dd5f0e..61135c18648 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1378,7 +1378,7 @@ def register_grid_priors():
 @update_features(exir_ops.edge.aten.repeat.default)
 def register_repeat():
     return OpFeatures(
-        inputs_storage=utils.ANY_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
     )
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl
deleted file mode 100644
index 441cd57c17d..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
-
-layout(push_constant) uniform restrict Block {
-  ivec4 range;
-  // source tensor sizes in WHCB dims respectively
-  ivec4 src_dims;
-  // destination tensor repeats in WHCB dims respectively
-  ivec4 dst_repeats;
-};
-
-#include "indexing_utils.h"
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
-const lowp int packed_dim = unhash_packed_dim(out_layout);
-
-${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
-
-void main() {
-  ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, range.xyz))) {
-    return;
-  }
-
-  // expand position in packed dim
-  pos[packed_dim] <<= 2;
-
-  // channel size aligned by 4 when tensors are channel packed raw value otherwise
-  const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z);
-
-  // find input texel's WHCB index
-  const int width_index = pos.x % src_dims.x;
-  const int height_index = pos.y % src_dims.y;
-  int channel_index;
-  int batch_index;
-
-  // if tensors are channel packed
-  if (packed_dim == C_DIM) {
-    // the output channels in a batch will be channel size * channel repetitions aligned by 4
-    const int out_channel_size = alignup4(src_dims.z * dst_repeats.z);
-
-    // batch index in the output
-    const int out_pos_batch_index = pos.z / out_channel_size;
-
-    // source batch index for based on current output pos
-    batch_index = out_pos_batch_index % src_dims.w;
-
-    // batch repetition count for current output pos
-    const int batch_repetition_index = out_pos_batch_index / src_dims.w;
-
-    // calculate input channel index based on current output pos and batch index
-    // its done this way because we want source channel to restart from zero when a batch index increments
-    // also batch_index will reset to zero after hitting batch repetition count
-    // so track the current repetition in batch_repetition_index so it can be used for determining current_index
-    channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z;
-  } else {
-    // the output channels in a batch will be channel size * channel repetitions
-    const int out_channel_size = src_dims.z * dst_repeats.z;
-
-    // source batch index for based on current output pos
-    batch_index = (pos.z / out_channel_size) % src_dims.w;
-
-    // source channel index is current output pos wrapped based on channel count
-    channel_index = pos.z % src_dims.z;
-  }
-
-  // input texel's WCB position
-  const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
-
-  // squeeze position in packed dim
-  pos[packed_dim] >>= 2;
-
-  // packed dim index of texel last fetched
-  int fetched_in_pos_packed_dim = -1;
-
-  // fetched input texel
-  VEC4_T in_value;
-
-  // output texel value
-  VEC4_T out_value = VEC4_T(0);
-
-  int src_lane_offset = in_pos[packed_dim];
-
-  for (int i=0; i<4; i++) {
-    if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
-      fetched_in_pos_packed_dim = (src_lane_offset >> 2);
-
-      ivec3 curr_in_pos = in_pos;
-      curr_in_pos[packed_dim] = src_lane_offset;
-      curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
-      curr_in_pos[packed_dim] >>= 2;
-
-      in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map));
-    }
-
-    out_value[i] = in_value[src_lane_offset & 0x3];
-
-    src_lane_offset++;
-    // if packed index exceeded source packed dim round to zero
-    src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]);
-  }
-
-  write_texel_lpos(
-    t_out,
-    pos,
-    out_value,
-    out_axis_map);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.glsl
new file mode 100644
index 00000000000..be2d87a168f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.glsl
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_of_bounds(out_bufi, out_meta)) {
+    return;
+  }
+
+  TensorIndex out_tidx = linear_idx_to_tensor_idx(out_meta, out_bufi);
+
+  TensorIndex in_tidx;
+  initialize(in_tidx);
+
+  const int n = int_ndim(out_meta);
+  for (int d = 0; d < n; d++) {
+    in_tidx.data[div_4(d)][mod_4(d)] =
+        idx_at(out_tidx, d) % size_at(in_meta, d);
+  }
+
+  const uint in_bufi = tensor_idx_to_linear_idx(in_meta, in_tidx);
+
+  t_out[out_bufi] = t_in[in_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.yaml
similarity index 77%
rename from backends/vulkan/runtime/graph/ops/glsl/repeat.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.yaml
index f40d94142e1..83d03d00b01 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.yaml
@@ -1,8 +1,7 @@
-repeat:
+repeat_buffer:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
-    STORAGE: texture3d
+    STORAGE: buffer
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -11,4 +10,4 @@ repeat:
       - VALUE: int8
       - VALUE: uint8
   shader_variants:
-    - NAME: repeat
+    - NAME: repeat_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl
new file mode 100644
index 00000000000..6dbcdae1817
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.glsl
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "common.glslh"
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "out_meta")}
+${layout_declare_ubo(B, "TextureMetadata", "in_meta")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+const int packed_dim = get_packed_dim(out_layout);
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  if (out_of_bounds(out_pos, out_meta)) {
+    return;
+  }
+
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(out_meta, out_pos);
+
+  VEC4_T out_texel = VEC4_T(0);
+
+  const int limit = min(
+      4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]);
+  for (int comp = 0; comp < limit; comp++) {
+    TensorIndex4D in_tidx = out_tidx;
+    in_tidx.data = ivec4(
+        out_tidx.data.x % in_meta.sizes.x,
+        out_tidx.data.y % in_meta.sizes.y,
+        out_tidx.data.z % in_meta.sizes.z,
+        out_tidx.data.w % in_meta.sizes.w);
+
+    TextureElementIndex in_elem =
+        tensor4d_idx_to_texture_element_idx_simple(in_meta, in_tidx);
+
+    VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
+    out_texel[comp] = in_texel[in_elem.comp];
+
+    out_tidx.data[packed_dim]++;
+  }
+
+  imageStore(t_out, out_pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.yaml
new file mode 100644
index 00000000000..058da576e51
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_texture.yaml
@@ -0,0 +1,12 @@
+repeat_texture:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int32
+      - VALUE: int8
+      - VALUE: uint8
+  shader_variants:
+    - NAME: repeat_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
index 2b42c0bd150..48b990f4622 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
@@ -9,114 +9,58 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-namespace {
-
-void check_args(
-    ComputeGraph& graph,
-    const ValueRef in,
-    const std::vector<int64_t>& repeats,
-    const ValueRef out) {
-  VK_CHECK_COND(graph.packed_dim_of(in) == graph.packed_dim_of(out));
-
-  VK_CHECK_COND(graph.storage_type_of(in) == graph.storage_type_of(out));
-  if (graph.storage_type_of(in) == utils::kTexture2D) {
-    VK_CHECK_COND(graph.dim_of(in) <= 2);
+void resize_repeat_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef in = args.at(1).refs.at(0);
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef repeats_ref = extra_args.at(0);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+  const std::vector<int64_t> repeats =
+      graph->extract_int_or_symint_list(repeats_ref);
+
+  const size_t out_ndim = std::max(in_sizes.size(), repeats.size());
+  std::vector<int64_t> out_sizes(out_ndim);
+  for (size_t i = 0; i < out_ndim; i++) {
+    const size_t in_offset = i + in_sizes.size() - out_ndim;
+    const size_t rep_offset = i + repeats.size() - out_ndim;
+    // Prepend 1s to in_sizes if repeats is longer, and vice versa
+    const int64_t in_size =
+        (i >= out_ndim - in_sizes.size()) ? in_sizes[in_offset] : 1;
+    const int64_t r =
+        (i >= out_ndim - repeats.size()) ? repeats[rep_offset] : 1;
+    out_sizes[i] = in_size * r;
   }
-
-  const int64_t in_dim = graph.dim_of(in);
-  VK_CHECK_COND(
-      in_dim <= repeats.size(),
-      "Input tensor dim size must be not greater than the repeat argument's size");
-
-  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
-  const std::vector<int64_t> out_sizes = graph.sizes_of(out);
-
-  VK_CHECK_COND(
-      dim_at<kWidth4D>(in_sizes) * dim_at<kWidth4D>(repeats) ==
-          dim_at<kWidth4D>(out_sizes),
-      "Output's width doesn't match input's width * repeat count");
-
-  VK_CHECK_COND(
-      dim_at<kHeight4D>(in_sizes) * dim_at<kHeight4D>(repeats) ==
-          dim_at<kHeight4D>(out_sizes),
-      "Output's height doesn't match input's height * repeat count");
-
-  VK_CHECK_COND(
-      dim_at<kChannel4D>(in_sizes) * dim_at<kChannel4D>(repeats) ==
-          dim_at<kChannel4D>(out_sizes),
-      "Output's channel doesn't match input's channel * repeat count");
-
-  VK_CHECK_COND(
-      dim_at<kBatch4D>(in_sizes) * dim_at<kBatch4D>(repeats) ==
-          dim_at<kBatch4D>(out_sizes),
-      "Output's batch doesn't match input's batch * repeat count");
+  graph->virtual_resize(out, out_sizes);
 }
 
-} // namespace
-
 void add_repeat_node(
     ComputeGraph& graph,
     ValueRef in,
     ValueRef repeats_ref,
     ValueRef out) {
-  const std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
-
-  check_args(graph, in, repeats, out);
-
-  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
-  const utils::ivec4 src_dims{
-      dim_at<kWidth4D>(in_sizes),
-      dim_at<kHeight4D>(in_sizes),
-      dim_at<kChannel4D>(in_sizes),
-      dim_at<kBatch4D>(in_sizes)};
-  const utils::ivec4 dst_repeats{
-      dim_at<kWidth4D>(repeats),
-      dim_at<kHeight4D>(repeats),
-      dim_at<kChannel4D>(repeats),
-      dim_at<kBatch4D>(repeats)};
-
   std::string kernel_name = "repeat";
   kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
-  // A copy of range with the last element set to batch size of the input tensor
-  const utils::ivec3 wg_size = graph.logical_limits_of(out);
-
-  const auto shader = VK_KERNEL_FROM_STR(kernel_name);
-
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
       default_pick_global_wg_size,
       default_pick_local_wg_size,
-      // Inputs and Outputs
-      {
-          {out, vkapi::kWrite},
-          {in, vkapi::kRead},
-      },
-      // Parameter buffers
-      {},
-      // Push Constants
-      {
-          PushConstantDataInfo(&wg_size, sizeof(wg_size), sizeof(utils::ivec4)),
-          PushConstantDataInfo(
-              &src_dims, sizeof(src_dims), sizeof(utils::ivec4)),
-          PushConstantDataInfo(
-              &dst_repeats, sizeof(dst_repeats), sizeof(utils::ivec4)),
-      },
-      // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
-      // Resize Args
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
+      {graph.meta_ubo(out), graph.meta_ubo(in)},
       {},
-      // Resizing Logic
-      nullptr));
+      {graph.hashed_layout_of(out)},
+      {repeats_ref},
+      resize_repeat_node));
 }
 
 void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index d5f2305ce51..99c4bebb64f 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -1345,7 +1345,7 @@ def get_repeat_inputs():
         "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
-    test_suite_2d.storage_types = ["utils::kTexture3D"]
+    test_suite_2d.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     test_suite_2d.data_gen = "make_seq_tensor"
     test_suite_2d.dtypes = ["at::kFloat"]
     test_suite_2d.test_name_suffix = "2d"
@@ -1390,7 +1390,7 @@ def get_repeat_inputs():
         "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
-    test_suite_3d.storage_types = ["utils::kTexture3D"]
+    test_suite_3d.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     test_suite_3d.data_gen = "make_seq_tensor"
     test_suite_3d.dtypes = ["at::kFloat"]
     test_suite_3d.test_name_suffix = "3d"

From 59beb2c21f7cd97efb0af5115c68a1c9b1b9a397 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:24 -0700
Subject: [PATCH 10/13] [ET-VK] Modernize embedding

Modernize embedding to support ANY_STORAGE. Add buffer and texture
shader variants using BufferMetadata/TextureMetadata with indexing.glslh.
Unify new dispatch path with add_storage_type_suffix and
graph.meta_ubo(). Legacy channels-packed texture path retained for
backward compatibility.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18057
ghstack-source-id: 353546689
@exported-using-ghexport

Differential Revision: [D95970161](https://our.internmc.facebook.com/intern/diff/D95970161/)
---
 backends/vulkan/op_registry.py                        | 11 ++++++++++-
 .../runtime/graph/ops/glsl/embedding_legacy.glsl      |  9 +++------
 backends/vulkan/runtime/graph/ops/impl/Embedding.cpp  |  4 +---
 backends/vulkan/test/op_tests/cases.py                |  5 ++---
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 61135c18648..bdda551de27 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1390,11 +1390,20 @@ def register_repeat():
 
 @update_features(exir_ops.edge.aten.embedding.default)
 def register_embedding():
+    def check_embedding_weight_size(node: torch.fx.Node) -> bool:
+        weight = node.args[0]
+        if isinstance(weight, torch.fx.Node) and utils.is_tensor_node(weight):
+            numel = weight.meta["val"].numel()
+            if numel > utils.DEFAULT_BUFFER_LIMIT:
+                return False
+        return True
+
     return OpFeatures(
-        inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=[utils.FP_T, utils.INT_T],
         supports_prepacking=True,
         supports_resize=True,
+        are_node_inputs_supported_fn=check_embedding_weight_size,
     )
 
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding_legacy.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding_legacy.glsl
index 73a444cd84d..87cea50cdea 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/embedding_legacy.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/embedding_legacy.glsl
@@ -16,7 +16,7 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)}
-${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_weight", DTYPE, "texture2d")}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
 
 #include "indexing_utils.h"
@@ -30,9 +30,6 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
 ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
 
-${layout_declare_spec_const(C, "int", "weight_layout", "DEFAULT_LAYOUT")}
-const lowp ivec4 weight_axis_map = unhash_axis_map(weight_layout);
-
 void main() {
   const ivec3 out_lpos = ivec3(gl_GlobalInvocationID);
   const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim);
@@ -48,8 +45,8 @@ void main() {
     const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4];
 
     // Read weight tensor for embedding, it is height-packed.
-    const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem / 4, 0);
-    out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map)[in_texel_elem % 4];
+    const ivec2 weight_pos = ivec2(out_tidx.x, in_texel_elem / 4);
+    out_texel[i] = texelFetch(t_weight, weight_pos, 0)[in_texel_elem % 4];
   }
 
   write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp
index 61d27d48f6c..b98eb75cebd 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp
@@ -111,9 +111,7 @@ void add_embedding_legacy_node(
       // Push Constants
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out),
-       graph.hashed_layout_of(in),
-       graph.hashed_layout_of(weight)},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
       // Resize Args
       {},
       // Resizing Logic
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 99c4bebb64f..87905860081 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -1167,14 +1167,13 @@ def get_embedding_inputs():
         Test(weight=[10, 9], indices=[[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]),
     ]
 
-    # Channels packed test cases currently fail on Mac, so they are not included.
-    # However the test case definition is kept for later debugging.
     test_suite_cpack = VkTestSuite(
         [tuple(tc) + (-1, "false", "false") for tc in test_cases]
     )
 
     test_suite_cpack.dtypes = ["at::kFloat"]
     test_suite_cpack.layouts = ["utils::kChannelsPacked"]
+    test_suite_cpack.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
     test_suite_cpack.test_name_suffix = "cpacked"
 
     test_suite_wpack = VkTestSuite(
@@ -1186,7 +1185,7 @@ def get_embedding_inputs():
     test_suite_wpack.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
     test_suite_wpack.test_name_suffix = "wpacked"
 
-    return test_suite_wpack
+    return [test_suite_cpack, test_suite_wpack]
 
 
 @register_test_suite("aten.gather.default")

From a8e01dcc0cbca75bb1e2cf156496e754fcfb077f Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:26 -0700
Subject: [PATCH 11/13] [ET-VK] Modernize argmax and argmin

Modernize argmax and argmin to support ANY_STORAGE via the
add_reduce_per_row_node dispatch path. Buffer shader uses
BufferMetadata with indexing.glslh. Custom workgroup sizing retained
for cooperative row-reduction algorithm with shared memory.

Pull Request resolved: https://github.com/pytorch/executorch/pull/18058
ghstack-source-id: 353546687
@exported-using-ghexport

Differential Revision: [D95970165](https://our.internmc.facebook.com/intern/diff/D95970165/)
---
 backends/vulkan/op_registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index bdda551de27..189da2c6baa 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -705,7 +705,7 @@ def register_reduce_cpp_ops():
 )
 def register_argreduce_cpp_ops():
     return OpFeatures(
-        inputs_storage=utils.ANY_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_T,
         outputs_dtypes=utils.INT_T,
         supports_resize=True,

From d996fb6e1e373608b807c9ac00b8fe77f89d247a Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:28 -0700
Subject: [PATCH 12/13] [ET-VK] Implement missing Vulkan operators for Parakeet
 TDT model

Pull Request resolved: https://github.com/pytorch/executorch/pull/18059

Add missing operators needed for Parakeet TDT model support:

- New symint ops: sym_sub, sym_floordiv, sym_mul in SymIntOps.cpp;
  register operator.floordiv and operator.mul as ephemeral ops in
  op_registry.py
- New tensor ops: bitwise_not (via unary_op shader with uint8 DTYPE),
  logical_and (alias for bitwise_and dispatch)
- Improve _to_copy: expand dtype support to FP_INT_BOOL_T and use
  pick_io_storage_fn to restrict to CONTIGUOUS_BUFFER for non-fp
  conversions
- Fix where resize: compute output shape via broadcast across all tensor
  inputs instead of always using the second input's shape
- Add symint support to split: use extract_int_or_symint_list instead of
  get_int_list in resize_split_node and split_with_sizes_copy_default
- Mark scalar_tensor as supporting resize
ghstack-source-id: 353546692
@exported-using-ghexport

Differential Revision: [D95970159](https://our.internmc.facebook.com/intern/diff/D95970159/)
---
 backends/vulkan/op_registry.py                | 41 +++++++--
 .../runtime/graph/ops/glsl/unary_op.yaml      |  3 +
 .../runtime/graph/ops/impl/BinaryOp.cpp       |  1 +
 .../vulkan/runtime/graph/ops/impl/Split.cpp   | 42 +--------
 .../runtime/graph/ops/impl/SymIntOps.cpp      | 87 +++++++++++++++++++
 .../vulkan/runtime/graph/ops/impl/ToCopy.cpp  |  4 +-
 .../vulkan/runtime/graph/ops/impl/UnaryOp.cpp |  2 +
 .../vulkan/runtime/graph/ops/impl/Where.cpp   | 18 +++-
 8 files changed, 147 insertions(+), 51 deletions(-)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 189da2c6baa..308718ade7d 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -159,6 +159,8 @@ def update_features_impl(op: OpKey):
         torch.ops.aten.sym_size.int,
         operator.add,
         operator.sub,
+        operator.floordiv,
+        operator.mul,
         operator.lt,
         operator.gt,
         operator.ge,
@@ -279,6 +281,26 @@ def register_bitwise_and():
     )
 
 
+@update_features(exir_ops.edge.aten.bitwise_not.default)
+def register_bitwise_not():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.BOOL_T,
+        supports_resize=True,
+        supports_highdim=True,
+    )
+
+
+@update_features(exir_ops.edge.aten.logical_and.default)
+def register_logical_and():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.BOOL_T,
+        supports_resize=True,
+        supports_highdim=True,
+    )
+
+
 # =============================================================================
 # BinaryScalarOp.cpp
 # =============================================================================
@@ -301,16 +323,22 @@ def register_pow_tensor_scalar():
 
 @update_features(exir_ops.edge.aten._to_copy.default)
 def register_to_copy():
-    def check_to_copy_node(node: torch.fx.Node) -> bool:
-        # Only single-arg _to_copy is supported
-        return len(node.args) == 1
+    def pick_to_copy_storage(
+        node: torch.fx.Node,
+    ) -> Tuple[utils.TensorRepSet, utils.TensorRepSet]:
+        in_dtype = node.args[0].meta["val"].dtype  # type: ignore[union-attr]
+        out_dtype = node.meta["val"].dtype
+        fp_types = {torch.float16, torch.float32}
+        if in_dtype in fp_types and out_dtype in fp_types:
+            return utils.ANY_STORAGE, utils.ANY_STORAGE
+        return utils.CONTIGUOUS_BUFFER, utils.CONTIGUOUS_BUFFER
 
     return OpFeatures(
         inputs_storage=utils.ANY_STORAGE,
-        inputs_dtypes=utils.FP_INT_T,
-        outputs_dtypes=utils.FP_INT_T,
+        inputs_dtypes=utils.FP_INT_BOOL_T,
+        outputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
-        are_node_inputs_supported_fn=check_to_copy_node,
+        pick_io_storage_fn=pick_to_copy_storage,
     )
 
 
@@ -1336,6 +1364,7 @@ def register_scalar_tensor():
     return OpFeatures(
         inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
         inputs_dtypes=utils.FP_INT_T,
+        supports_resize=True,
     )
 
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
index 47f538aee6c..1763f975058 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
@@ -46,3 +46,6 @@ unary_op:
       OPERATOR: leaky_relu(X, A)
     - NAME: round
       OPERATOR: round(X)
+    - NAME: bitwise_not_uint8
+      OPERATOR: 1 - X
+      DTYPE: uint8
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
index 92c2fa218ec..fa4c75463b7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -214,6 +214,7 @@ REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.gt.Tensor, gt);
   VK_REGISTER_OP(aten.ge.Tensor, ge);
   VK_REGISTER_OP(aten.bitwise_and.Tensor, bitwise_and);
+  VK_REGISTER_OP(aten.logical_and.default, bitwise_and);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp
index 4e62ae8806d..f21d58f7b98 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp
@@ -9,52 +9,13 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
-#include <executorch/backends/vulkan/runtime/utils/StorageUtils.h>
-
 namespace vkcompute {
 
-using utils::GPUMemoryLayout;
-using utils::StorageType;
-
-void resize_split_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  (void)resize_args;
-  const ValueRef input = args.at(0).refs.at(0);
-  const ValueRef split_sizes_ref = args.at(1).refs.at(0);
-  const ValueRef dim_ref = args.at(2).refs.at(0);
-  const ValueRef out_list_ref = args.at(3).refs.at(0);
-
-  const ValueListPtr out_list = graph->get_value_list(out_list_ref);
-  const std::vector<int64_t> split_sizes =
-      *(graph->get_int_list(split_sizes_ref));
-  const int64_t dim = graph->extract_scalar<int64_t>(dim_ref);
-
-  const int64_t input_ndim = graph->dim_of(input);
-  const DimIndex dim_index = dim < 0 ? static_cast<DimIndex>(dim)
-                                     : static_cast<DimIndex>(dim - input_ndim);
-
-  std::vector<int64_t> input_sizes = graph->sizes_of(input);
-
-  for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) {
-    const int64_t split_size = split_sizes.at(split_idx);
-    const ValueRef out_ref = out_list->at(split_idx);
-
-    std::vector<int64_t> out_sizes = input_sizes;
-    out_sizes.at(dim_index) = split_size;
-
-    graph->virtual_resize(out_ref, out_sizes);
-  }
-}
-
 void add_split_node(
     ComputeGraph& graph,
     const ValueRef input,
@@ -125,7 +86,8 @@ void split_with_sizes_copy_default(
   ValueRef out_list_ref = args[3];
 
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
-  std::vector<int64_t> split_sizes = *(graph.get_int_list(split_sizes_ref));
+  std::vector<int64_t> split_sizes =
+      graph.extract_int_or_symint_list(split_sizes_ref);
 
   add_split_with_sizes_node(graph, input, split_sizes, dim, out_list_ref);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp b/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp
index eb03639abf1..3aef6bc988d 100644
--- a/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp
@@ -81,6 +81,90 @@ void sym_add(ComputeGraph& graph, const std::vector<ValueRef>& args) {
       new ExecuteNode(resize_sym_add_node, args));
 }
 
+void sym_sub_impl(ComputeGraph* graph, const std::vector<ValueRef>& args) {
+  const ValueRef a = args.at(0);
+  const ValueRef b = args.at(1);
+  const ValueRef out = args.at(2);
+
+  const int32_t a_val = graph->read_symint(a);
+  const int32_t b_val = graph->read_symint(b);
+  const int32_t result = a_val - b_val;
+
+  graph->set_symint(out, result);
+}
+
+void resize_sym_sub_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)args;
+  sym_sub_impl(graph, resize_args);
+}
+
+void sym_sub(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  sym_sub_impl(&graph, args);
+
+  graph.execute_nodes().emplace_back(
+      new ExecuteNode(resize_sym_sub_node, args));
+}
+
+void sym_floordiv_impl(ComputeGraph* graph, const std::vector<ValueRef>& args) {
+  const ValueRef a = args.at(0);
+  const ValueRef b = args.at(1);
+  const ValueRef out = args.at(2);
+
+  const int32_t a_val = graph->read_symint(a);
+  const int32_t b_val = graph->read_symint(b);
+  // Floor division: round towards negative infinity
+  const int32_t result = (a_val ^ b_val) < 0 && a_val % b_val != 0
+      ? a_val / b_val - 1
+      : a_val / b_val;
+
+  graph->set_symint(out, result);
+}
+
+void resize_sym_floordiv_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)args;
+  sym_floordiv_impl(graph, resize_args);
+}
+
+void sym_floordiv(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  sym_floordiv_impl(&graph, args);
+
+  graph.execute_nodes().emplace_back(
+      new ExecuteNode(resize_sym_floordiv_node, args));
+}
+
+void sym_mul_impl(ComputeGraph* graph, const std::vector<ValueRef>& args) {
+  const ValueRef a = args.at(0);
+  const ValueRef b = args.at(1);
+  const ValueRef out = args.at(2);
+
+  const int32_t a_val = graph->read_symint(a);
+  const int32_t b_val = graph->read_symint(b);
+  const int32_t result = a_val * b_val;
+
+  graph->set_symint(out, result);
+}
+
+void resize_sym_mul_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)args;
+  sym_mul_impl(graph, resize_args);
+}
+
+void sym_mul(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  sym_mul_impl(&graph, args);
+
+  graph.execute_nodes().emplace_back(
+      new ExecuteNode(resize_sym_mul_node, args));
+}
+
 void select_as_symint_impl(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& unused,
@@ -132,6 +216,9 @@ void select_as_symint(ComputeGraph& graph, const std::vector<ValueRef>& args) {
 REGISTER_OPERATORS {
   VK_REGISTER_OP(sym_size.int, sym_size_int);
   VK_REGISTER_OP(add, sym_add);
+  VK_REGISTER_OP(sub, sym_sub);
+  VK_REGISTER_OP(floordiv, sym_floordiv);
+  VK_REGISTER_OP(mul, sym_mul);
   VK_REGISTER_OP(et_vk.select_as_symint.default, select_as_symint);
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp b/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp
index 275023faa59..2de4a555860 100644
--- a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp
@@ -30,8 +30,8 @@ bool is_float_type(vkapi::ScalarType dtype) {
 }
 
 void add_to_copy_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
-  vkapi::ScalarType in_dtype = graph.dtype_of(in);
-  vkapi::ScalarType out_dtype = graph.dtype_of(out);
+  const vkapi::ScalarType in_dtype = graph.dtype_of(in);
+  const vkapi::ScalarType out_dtype = graph.dtype_of(out);
 
   // Same-dtype or float<->half conversions can use BlitNode
   if (in_dtype == out_dtype ||
diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
index 9830a8e8784..de6172da2b9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
@@ -158,6 +158,7 @@ DEFINE_ACTIVATION_FN(hardswish);
 DEFINE_ACTIVATION_FN(hardsigmoid);
 DEFINE_LEAKY_RELU_FN(leaky_relu);
 DEFINE_ACTIVATION_FN(round);
+DEFINE_ACTIVATION_FN(bitwise_not);
 
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.abs.default, abs);
@@ -179,6 +180,7 @@ REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid);
   VK_REGISTER_OP(aten.leaky_relu.default, leaky_relu);
   VK_REGISTER_OP(aten.round.default, round);
+  VK_REGISTER_OP(aten.bitwise_not.default, bitwise_not);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp
index adb7fb1beca..c52a0c277cd 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp
@@ -21,10 +21,22 @@ void resize_where_node(
     const std::vector<ValueRef>& extra_args) {
   (void)extra_args;
   const ValueRef out = args.at(0).refs.at(0);
-  const ValueRef self = args.at(1).refs.at(1);
 
-  const std::vector<int64_t> self_sizes = graph->sizes_of(self);
-  graph->virtual_resize(out, self_sizes);
+  std::vector<int64_t> out_sizes;
+  for (const ValueRef ref : args.at(1).refs) {
+    if (!graph->val_is_tensor(ref)) {
+      continue;
+    }
+    const std::vector<int64_t> s = graph->sizes_of(ref);
+    if (s.size() > out_sizes.size()) {
+      out_sizes.resize(s.size(), 1);
+    }
+    const size_t offset = out_sizes.size() - s.size();
+    for (size_t i = 0; i < s.size(); i++) {
+      out_sizes[offset + i] = std::max(out_sizes[offset + i], s[i]);
+    }
+  }
+  graph->virtual_resize(out, out_sizes);
 }
 
 void add_where_node(

From ce21031facff3e9d6f758401c895ed3391061ec4 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 17 Mar 2026 14:19:30 -0700
Subject: [PATCH 13/13] [ET-VK][qlinear] Add bias support to q4gsw and
 dq8ca_q4gsw quantized linear ops

Pull Request resolved: https://github.com/pytorch/executorch/pull/18061

Wire bias through the q4gsw and dq8ca_q4gsw quantized linear operators.
Add add_bias_to_out_tile() helper in the output tile computation header and call
it from all three shader variants (tiled, coop, dq8ca_tiled). Remove the bias
guard in the pattern matcher to allow biased linear layers.
ghstack-source-id: 353546681
@exported-using-ghexport

Differential Revision: [D95970172](https://our.internmc.facebook.com/intern/diff/D95970172/)
---
 backends/vulkan/custom_ops_lib.py             |  5 ++--
 backends/vulkan/patterns/quantized_linear.py  | 26 +++++++++++--------
 .../ops/glsl/linear_dq8ca_q4gsw_tiled.glsl    |  6 +++++
 .../linear_fp_output_tile_fp_compute.glslh    | 10 +++++++
 .../graph/ops/glsl/linear_q4gsw_coop.glsl     |  5 ++++
 .../graph/ops/glsl/linear_q4gsw_tiled.glsl    |  6 +++++
 .../vulkan/test/custom_ops/q4gsw_linear.cpp   |  9 ++++---
 7 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
index 7f687bb10f4..5769c3c132b 100644
--- a/backends/vulkan/custom_ops_lib.py
+++ b/backends/vulkan/custom_ops_lib.py
@@ -8,7 +8,6 @@
 
 import executorch.backends.vulkan.patterns as vk_patterns
 import torch.library
-
 from torch._subclasses.fake_tensor import FakeTensor
 
 namespace = "et_vk"
@@ -259,7 +258,7 @@ def linear_q4gsw(
         weights, [1, group_size], weight_scales, weight_zeros, torch.int8, -8, 7
     )
 
-    out = torch.nn.functional.linear(x, weights)
+    out = torch.nn.functional.linear(x, weights, bias)
     return out
 
 
@@ -273,7 +272,7 @@ def linear_dq8ca_q4gsw(
     group_size: int,
     bias: Optional[torch.Tensor] = None,
 ):
-    return linear_q4gsw(x, weights, weight_scales, group_size)
+    return linear_q4gsw(x, weights, weight_scales, group_size, bias)
 
 
 name = "linear_q4gsw"
diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py
index b9b307e14f1..c6524102ac6 100644
--- a/backends/vulkan/patterns/quantized_linear.py
+++ b/backends/vulkan/patterns/quantized_linear.py
@@ -5,28 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 
 import operator
-
 from typing import Optional
 
 import executorch.backends.vulkan.utils as utils
-
 import torch
 import torch.nn.functional as F
-
 from executorch.backends.transforms.utils import (
     create_constant_placeholder,
     get_param_tensor,
 )
-
 from executorch.backends.vulkan.patterns.pattern_registry import (
     PatternMatch,
     register_pattern_detector,
     register_pattern_replacement,
 )
-
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
-
 from torch.export.graph_signature import InputKind
 
 
@@ -398,6 +392,12 @@ def make_linear_q4gsw_op(
         force_update=True,
     )
 
+    # Pad bias to multiple of 4 if present
+    if match.bias_node is not None:
+        bias_tensor = get_param_tensor(ep, match.bias_node)
+        if bias_tensor is not None:
+            utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor)
+
     with graph_module.graph.inserting_before(match.output_node):
         linear_q4gsw_node = graph_module.graph.create_node(
             "call_function",
@@ -407,6 +407,7 @@ def make_linear_q4gsw_op(
                 match.weight_node,
                 match.weight_scales_node,
                 group_size,
+                match.bias_node,
             ),
         )
 
@@ -445,6 +446,12 @@ def make_linear_dq8ca_q4gsw_op(
         force_update=True,
     )
 
+    # Pad bias to multiple of 4 if present
+    if match.bias_node is not None:
+        bias_tensor = get_param_tensor(ep, match.bias_node)
+        if bias_tensor is not None:
+            utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor)
+
     first_graph_node = list(graph_module.graph.nodes)[0]
     with graph_module.graph.inserting_before(first_graph_node):
         weight_tensor_name = utils.get_tensor_name(ep, match.weight_node)
@@ -474,6 +481,7 @@ def make_linear_dq8ca_q4gsw_op(
                 weight_sums_node,
                 match.weight_scales_node,
                 group_size,
+                match.bias_node,
             ),
         )
 
@@ -538,6 +546,7 @@ def make_linear_q8ta_q8csw_custom_op(
                 match.weight_node,
                 weight_sums_node,
                 match.weight_scales_node,
+                match.bias_node,
             ),
         )
 
@@ -637,7 +646,6 @@ def replace_quantized_linear_patterns(
     assert weight_zeros_tensor is not None
 
     # Route to appropriate custom op.
-    # q8ta_linear supports bias, so check it first before the bias guard.
     if (
         match.is_input_static_per_tensor_quantized()
         and match.is_weight_perchannel_quantized()
@@ -646,10 +654,6 @@ def replace_quantized_linear_patterns(
         make_q8ta_linear_custom_op(ep, graph_module, match, weight_tensor)
         return
 
-    # Remaining ops do not support bias
-    if match.bias_node is not None:
-        return
-
     if (
         match.is_weight_only_quantized()
         and match.is_weight_pergroup_quantized()
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl
index b6c32863eb9..fa0129b65a5 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl
@@ -144,5 +144,11 @@ void main() {
         group_size);
   }
 
+  if (apply_bias > 0) {
+    FPPerOutChannelParams bias_tile;
+    load_bias_tile(bias_tile, n4);
+    add_bias_to_out_tile(out_tile, bias_tile);
+  }
+
   write_output_tile_with_checks(out_tile, n4, m, N4, M);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh
index 01b3c762e39..60a19ca9fc9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh
@@ -73,6 +73,16 @@ void apply_weight_scales_and_biases(
   }
 }
 
+void add_bias_to_out_tile(
+    inout FPOutTile tile,
+    const FPPerOutChannelParams bias) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      tile.data[m][n4] = tile.data[m][n4] + bias.data[n4];
+    }
+  }
+}
+
 void accumulate_out_tile_with_out_tile(
     inout FPOutTile accum,
     const FPOutTile other) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl
index 02bfe3fff0f..053f27d6c9b 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl
@@ -142,6 +142,11 @@ void main() {
   // Only the first thread will write out result
   if (lid == 0) {
     out_tile = partial_sums[0];
+    if (apply_bias > 0) {
+      FPPerOutChannelParams bias_tile;
+      load_bias_tile(bias_tile, n4);
+      add_bias_to_out_tile(out_tile, bias_tile);
+    }
     write_output_tile_with_checks(out_tile, n4, 0, N4, 1);
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl
index 9a42a7fa67f..70a637ed0f8 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl
@@ -110,5 +110,11 @@ void main() {
     }
   }
 
+  if (apply_bias > 0) {
+    FPPerOutChannelParams bias_tile;
+    load_bias_tile(bias_tile, n4);
+    add_bias_to_out_tile(out_tile, bias_tile);
+  }
+
   write_output_tile_with_checks(out_tile, n4, m, N4, M);
 }
diff --git a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
index 1d1d48615e9..ef6369c6b1f 100644
--- a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
+++ b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp
@@ -148,7 +148,7 @@ TestCase create_test_case_from_config(
       input_dtype,
       storage_type,
       utils::kWidthPacked,
-      DataGenType::ZEROS);
+      config.has_bias ? DataGenType::RANDOM : DataGenType::ZEROS);
   bias.set_constant(true);
   if (!config.has_bias) {
     bias.set_none(true);
@@ -237,9 +237,10 @@ std::vector<TestCase> generate_quantized_linear_test_cases() {
       {32, 64, 32, 16},
       {32, 128, 64, 32},
       {32, 256, 128, 64},
-      // No bias tests
-      {32, 128, 64, 32, false},
-      {32, 256, 128, 64, false},
+      // With bias
+      {4, 64, 32, 16, true},
+      {4, 128, 64, 32, true},
+      {32, 128, 64, 32, true},
       // Performance test cases
       {1, 2048, 2048, 128},
       {128, 2048, 2048, 128},