From 04a12984fd98402180ffeadc0872b60336c1bf71 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Fri, 11 Jul 2025 11:11:16 -0700
Subject: [PATCH 1/2] [ET-VK] Using push constants for unary op.

Pull Request resolved: https://github.com/pytorch/executorch/pull/12308

This diff switches the unary op shader from UBO bindings to push constants for its numel/out_limits, minimum and maximum parameters.
ghstack-source-id: 295691432

Differential Revision: [D77706459](https://our.internmc.facebook.com/intern/diff/D77706459/)
---
 .../runtime/graph/ops/glsl/unary_op.glsl      | 13 ++++++++-----
 .../vulkan/runtime/graph/ops/impl/UnaryOp.cpp | 19 ++++++++-----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl
index b645905939f..bb7ce482a7a 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl
@@ -25,12 +25,15 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+
+layout(push_constant) uniform restrict Block {
 $if STORAGE == "buffer":
-  ${layout_declare_ubo(2, "int", "numel")}
+  int numel;
 $else:
-  ${layout_declare_ubo(2, "ivec3", "out_limits")}
-${layout_declare_ubo(3, "float", "minimum")}
-${layout_declare_ubo(4, "float", "maximum")}
+  ivec4 out_limits;
+float minimum;
+float maximum;
+};
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -53,7 +56,7 @@ void main() {
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (any(greaterThanEqual(pos, out_limits.xyz))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
index 518148f12eb..ea8daf2ea64 100644
--- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
@@ -43,15 +43,7 @@ void add_unary_op_node(
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
   add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
 
-  vkapi::ParamsBindList ubos({});
-  if (graph.is_buffer_storage(out)) {
-    ubos.append({graph.numel_ubo(out)});
-  } else {
-    ubos.append({graph.logical_limits_ubo(out)});
-  }
-  ubos.append(
-      {graph.create_params_buffer(min), graph.create_params_buffer(max)});
-
+  const utils::vec2 min_max = {min, max};
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
@@ -60,9 +52,14 @@ void add_unary_op_node(
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
       // Shader params buffers
-      ubos,
-      // Push Constants
       {},
+      // Push Constants: same order as the push_constant block in
+      // unary_op.glsl (numel or out_limits, then minimum and maximum).
+      {
+          graph.is_buffer_storage(out) ? graph.numel_pc_of(out)
+                                       : graph.logical_limits_pc_of(out),
+          PushConstantDataInfo(&min_max, sizeof(min_max)),
+      },
       // Specialization Constants
       {},
       // Resize Args

From f820e1fb9fc7da1f2e3e91158c43d1b879c986a1 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Fri, 11 Jul 2025 11:43:23 -0700
Subject: [PATCH 2/2] [ET-VK] Minor performance improvements for buffer to int8
 quantized packing.

Pull Request resolved: https://github.com/pytorch/executorch/pull/12383

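This diff makes minor performance improvements to buffer-to-int8 quantized packing in the Vulkan runtime graph ops by replacing branches with `mix()` selects and extracting each packed byte with a shift followed by a mask.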
ghstack-source-id: 295691433

Differential Revision: [D74616519](https://our.internmc.facebook.com/intern/diff/D74616519/)
---
 .../nchw_to_bitw8_image_nobitw8buffer.glsl    | 36 ++++---------------
 1 file changed, 7 insertions(+), 29 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
index 4b18abbb1c5..1a2c257baec 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
@@ -42,47 +42,30 @@ const lowp int packed_dim = unhash_packed_dim(t_layout);
  * Extends sign of int8
  */
 int extend_sign(int x) {
-  if (x >> 7 == 1) {
-    return x | 0xFFFFFF00;
-  }
-  return x;
+  return x | mix(0, 0xFFFFFF00, x >= (1 << 7));
 }
 
 ivec4 read_texel(ivec4 tidx) {
-  ivec4 tidx_to_use = tidx;
-  ivec4 sizes_to_use = sizes;
-  int packed_dim_to_use = packed_dim;
-  if (transpose_hw == 1) {
-    sizes_to_use.xy = sizes_to_use.yx;
-    tidx_to_use.xy = tidx.yx;
-
-    if (packed_dim == 1) {
-      packed_dim_to_use = 0;
-    }
-    if (packed_dim == 0) {
-      packed_dim_to_use = 1;
-    }
-  }
+  // All three selections are gated on transpose_hw == 1: the x/y components of
+  // the index and sizes are swapped, and a packed dim of 0 or 1 becomes the other.
+  const bool swap_hw = transpose_hw == 1;
+  const ivec4 tidx_to_use = ivec4(mix(tidx.xy, tidx.yx, bvec2(swap_hw)), tidx.zw);
+  const ivec4 sizes_to_use = ivec4(mix(sizes.xy, sizes.yx, bvec2(swap_hw)), sizes.zw);
+  const int packed_dim_to_use = mix(packed_dim, packed_dim ^ int(swap_hw), packed_dim < 2);
 
   const ivec4 buf_indices = tidx_to_nchwi(
       tidx_to_use, sizes_to_use, packed_dim_to_use);
 
-  int shift = (1 << 8) - 1;
-  ivec4 masks;
-  // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
-  // little endian is assumed, as most processors use little endian. Thus the
-  // most significant bytes correspond to the "latter" packed values.
-  masks.x = shift << (8 * (buf_indices.x % 4));
-  masks.y = shift << (8 * (buf_indices.y % 4));
-  masks.z = shift << (8 * (buf_indices.z % 4));
-  masks.w = shift << (8 * (buf_indices.w % 4));
+  // Mask for one 8-bit value. Little endian is assumed, so within each 32-bit
+  // word the more significant bytes hold the "latter" packed values.
+  const int mask = (1 << 8) - 1;
 
   ivec4 out_tex = ivec4(0);
 
   [[unroll]] for (int i = 0; i < 4; ++i) {
     if (tidx[packed_dim] + i < sizes[packed_dim]) {
-      int in_texel = nchw_in[buf_indices[i] / 4];
-      int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
+      const int in_texel = nchw_in[buf_indices[i] >> 2];
+      int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3))) & mask;
       extracted_val = extend_sign(extracted_val);
       out_tex[i] = extracted_val;
     }